forked from dbro/csvquote
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsvquote.c
177 lines (159 loc) · 5.35 KB
/
csvquote.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <unistd.h>
#define NDEBUG
#include "dbg.h"
// Size in bytes of the stack buffer that copy_file fills with each fread() call
#define READ_BUFFER_SIZE 4096
// Placeholder byte that sanitize() writes in place of a field delimiter found
// inside a quoted field, and that restore() turns back into the delimiter.
// 0x1F is the ASCII "unit separator" control character.
#define NON_PRINTING_FIELD_SEPARATOR 0x1F
// Placeholder byte that sanitize() writes in place of a record separator found
// inside a quoted field, and that restore() turns back into the record separator.
// 0x1E is the ASCII "record separator" control character.
#define NON_PRINTING_RECORD_SEPARATOR 0x1E
/*
TODO: verify that multi-byte characters and encodings (Unicode, UTF-8, etc.) are handled correctly
*/
// Pointer to a per-byte translation function. Both restore() and sanitize()
// have this signature: (delimiter, quotechar, recordsep, pointer to the byte
// to examine and possibly overwrite in place).
typedef void (*translator)(const char, const char, const char, char *);
// Selects which translator copy_file applies to the data:
// RESTORE_MODE -> restore(), SANITIZE_MODE -> sanitize().
typedef enum { RESTORE_MODE, SANITIZE_MODE } operation_mode;
/*
 * Translator for restore mode: undoes the substitution made by sanitizing.
 *
 *   delimiter  byte written in place of NON_PRINTING_FIELD_SEPARATOR
 *   quotechar  unused; present only so that both translators share the
 *              same parameter list
 *   recordsep  byte written in place of NON_PRINTING_RECORD_SEPARATOR
 *   c          byte to inspect; overwritten in place when it is one of the
 *              two placeholder bytes, otherwise left untouched
 */
void restore(const char delimiter, const char quotechar, const char recordsep, char *c) {
    const char current = *c;

    (void)quotechar; // intentionally ignored, see header comment

    if (current == NON_PRINTING_FIELD_SEPARATOR) {
        *c = delimiter;
    } else if (current == NON_PRINTING_RECORD_SEPARATOR) {
        *c = recordsep;
    }
    // any other byte passes through unchanged
}
/*
 * Translator for sanitize mode: replaces delimiter and record-separator bytes
 * that occur inside a quoted field with non-printing placeholder bytes.
 *
 *   delimiter  field separator byte to hide when it appears inside quotes
 *   quotechar  byte that opens and closes a quoted field; a doubled quotechar
 *              inside a quoted field is treated as an escaped quote
 *   recordsep  record separator byte to hide when it appears inside quotes
 *   c          byte to inspect; overwritten in place when it must be hidden
 *
 * The quoting state is kept in function-local statics, so it carries over from
 * one call to the next. The input therefore has to be fed through this
 * function strictly in order, one byte at a time.
 */
void sanitize(const char delimiter, const char quotechar, const char recordsep, char *c) {
    static bool inside_quotes = false;      // currently within a quoted field
    static bool quote_seen_in_field = false; // previous byte was a quotechar inside a field
    const bool is_quote = (*c == quotechar);

    if (quote_seen_in_field) {
        // The previous quotechar was either the first half of an escaped
        // quote (this byte is a quotechar too) or the end of the field.
        quote_seen_in_field = false;
        if (!is_quote) {
            inside_quotes = false;
        }
        return;
    }

    if (!inside_quotes) {
        // Outside a quoted field only an opening quotechar matters.
        if (is_quote) {
            inside_quotes = true;
        }
        return;
    }

    // Inside a quoted field from here on.
    if (is_quote) {
        // Cannot tell yet whether this closes the field or starts an escaped
        // quote; the next byte decides.
        quote_seen_in_field = true;
        return;
    }

    if (*c == delimiter) {
        *c = NON_PRINTING_FIELD_SEPARATOR;
    } else if (*c == recordsep) {
        *c = NON_PRINTING_RECORD_SEPARATOR;
    }
}
/*
 * Copies the stream `in` to stdout, passing every byte through the translator
 * selected by `op_mode` (sanitize for SANITIZE_MODE, restore for RESTORE_MODE).
 *
 *   in       stream to read from; it is not closed here
 *   op_mode  which translation to apply
 *   del      field delimiter byte handed to the translator
 *   quo      quote byte handed to the translator
 *   rec      record separator byte handed to the translator
 *
 * Returns 0 on success, 1 if the mode is not recognised, a write to stdout
 * comes up short, or reading from `in` fails.
 */
int copy_file(FILE *in, const operation_mode op_mode,
              const char del, const char quo, const char rec) {
    char buffer[READ_BUFFER_SIZE];
    size_t nbytes;
    char *c, *stopat;
    translator trans = NULL;

    debug("copying file with d=%d\tq=%d\tr=%d", del, quo, rec);

    switch (op_mode) {
    case SANITIZE_MODE:
        trans = sanitize;
        break;
    case RESTORE_MODE:
        trans = restore;
        break;
    default:
        sentinel("unexpected operating mode");
    }

    while ((nbytes = fread(buffer, sizeof(char), sizeof(buffer), in)) != 0)
    {
        // translate the chunk in place, then write it out
        stopat = buffer + (nbytes);
        for (c = buffer; c < stopat; c++) {
            (*trans)(del, quo, rec, c); // no error checking inside this loop
        }
        check(fwrite(buffer, sizeof(char), nbytes, stdout) == nbytes,
              "Failed to write %zu bytes\n", nbytes);
    }

    // fread returns 0 both at end-of-file and on a read error; only ferror
    // can tell the two apart, so a failed read is not reported as success
    check(!ferror(in), "Failed to read input");

    return 0;
error:
    return 1;
}
/*
 * Entry point. Parses the command-line options, then copies either standard
 * input (when no file names are given) or each named file in turn to standard
 * output through copy_file.
 *
 * Options:
 *   -u        restore mode
 *   -s        sanitize mode (the default)
 *   -d CHAR   field separator byte (default ',')
 *   -t        use tab as the field separator
 *   -q CHAR   quote byte (default '"')
 *   -r CHAR   record separator byte (default '\n')
 *
 * Returns 0 on success, 1 on a usage error or when opening or copying fails.
 */
int main(int argc, char *argv[]) {
    // default parameters
    FILE *input = NULL;
    char del = ',';
    char quo = '"';
    char rec = '\n';
    operation_mode op_mode = SANITIZE_MODE;
    int opt;

    // The leading ':' in the option string makes getopt return ':' for a
    // missing operand (instead of '?') and suppresses its own diagnostics,
    // so the messages below are the ones the user sees.
    while ((opt = getopt(argc, argv, ":usd:tq:r:")) != -1) {
        switch (opt) {
        case 'u':
            op_mode = RESTORE_MODE;
            break;
        case 's':
            op_mode = SANITIZE_MODE;
            break;
        case 'd':
            del = optarg[0]; // byte
            break;
        case 't':
            del = '\t';
            break;
        case 'q':
            quo = optarg[0]; // byte
            break;
        case 'r':
            rec = optarg[0]; // byte
            break;
        case ':':
            // -d or -q or -r without operand
            fprintf(stderr,
                    "Option -%c requires an operand\n", optopt);
            goto usage;
        case '?':
        default:
            fprintf(stderr,
                    "Unrecognized option: '-%c'\n", optopt);
            goto usage;
        }
    }

    // Process stdin or file names
    if (optind >= argc) {
        check(copy_file(stdin, op_mode, del, quo, rec) == 0,
              "failed to copy from stdin");
    } else {
        // supports multiple file names
        int i;
        for (i = optind; i < argc; i++) {
            input = fopen(argv[i], "r");
            // report the file that actually failed, not the first one listed
            check(input != NULL, "failed to open file %s", argv[i]);
            check(copy_file(input, op_mode, del, quo, rec) == 0,
                  "failed to copy from file %s", argv[i]);
            fclose(input);
            input = NULL; // the error path must never see a closed handle
        }
    }
    return 0;

usage:
    fprintf(stderr, "Usage: %s [OPTION] [files]\n", argv[0]);
    fprintf(stderr, "\tfiles are zero or more filenames. If none given, read from standard input\n");
    fprintf(stderr, "\t-u\tdefault false\trestore mode. replace nonprinting characters with original characters\n");
    fprintf(stderr, "\t-d\tdefault ,\tfield separator character\n");
    fprintf(stderr, "\t-t\tdefault false\tuse tab as the field separator character\n");
    fprintf(stderr, "\t-q\tdefault \"\tfield quoting character\n");
    fprintf(stderr, "\t-r\tdefault \\n\trecord separator character\n");
    return 1;

error:
    if (input) { fclose(input); }
    return 1;
}