1 /* $NetBSD: is_csv.c,v 1.5 2023/08/18 19:00:11 christos Exp $ */
2
3 /*-
4 * Copyright (c) 2019 Christos Zoulas
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * Parse CSV object serialization format (RFC-4180, RFC-7111)
31 */
32
33 #ifndef TEST
34 #include "file.h"
35
36 #ifndef lint
37 #if 0
38 FILE_RCSID("@(#)$File: is_csv.c,v 1.13 2023/07/17 16:08:17 christos Exp $")
39 #else
40 __RCSID("$NetBSD: is_csv.c,v 1.5 2023/08/18 19:00:11 christos Exp $");
41 #endif
42 #endif
43
44 #include <string.h>
45 #include "magic.h"
46 #else
47 #include <sys/types.h>
48 #endif
49
50
51 #ifdef DEBUG
52 #include <stdio.h>
53 #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
54 #else
55 #define DPRINTF(fmt, ...)
56 #endif
57
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * a final line that does not end in a newline ('\n') is never counted
 */
66 #ifndef CSV_LINES
67 #define CSV_LINES 10
68 #endif
69
70 static int csv_parse(const unsigned char *, const unsigned char *);
71
/*
 * Skip the remainder of a quoted field.
 *
 * uc points just past the opening double quote; ue is one past the last
 * byte of the buffer.  Inside the field a doubled quote ("") is an escaped
 * quote and does not end the field.
 *
 * Returns a pointer to the first byte following the closing quote, or ue
 * if the buffer is exhausted first (this includes the case where the
 * closing quote is the very last byte).
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	int pending = 0;	// previous byte was a quote not yet classified

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			// A lone quote becomes pending; a second one right
			// after it makes the pair an escape and clears it.
			pending = !pending;
		} else if (pending) {
			// The quote just before this byte closed the field.
			return p;
		}
	}
	return ue;
}
96
/*
 * Decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside quoted fields are counted per line.  The first
 * newline-terminated line fixes the expected count, which must be non-zero,
 * and every following complete line has to match it.  When CSV_LINES is
 * non-zero the scan stops at that many lines and answers from the state at
 * that point; otherwise at least two complete lines are required.
 *
 * Returns 1 if the buffer looks like CSV, 0 if not.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	size_t commas = 0;	// separators seen on the current line
	size_t expected = 0;	// separators per line, set by the first line
	size_t lines = 0;	// complete lines seen so far

	while (uc < ue) {
		const unsigned char ch = *uc++;

		if (ch == '"') {
			// Jump over the quoted field, embedded commas and
			// newlines included.
			uc = eatquote(uc, ue);
			continue;
		}
		if (ch == ',') {
			commas++;
			continue;
		}
		if (ch != '\n')
			continue;

		// End of a line: compare it against the expected shape.
		DPRINTF("%zu %zu %zu\n", lines, commas, expected);
		lines++;
#if CSV_LINES
		if (lines == CSV_LINES)
			return expected != 0 && expected == commas;
#endif
		if (expected == 0) {
			// A first line without any separator is not CSV.
			if (commas == 0)
				return 0;
			// The first line defines the field count.
			expected = commas;
		} else if (expected != commas) {
			// This line disagrees with the first one.
			return 0;
		}
		commas = 0;
	}
	return expected != 0 && lines >= 2;
}
136
137 #ifndef TEST
138 int
file_is_csv(struct magic_set * ms,const struct buffer * b,int looks_text,const char * code)139 file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
140 const char *code)
141 {
142 const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
143 const unsigned char *ue = uc + b->flen;
144 int mime = ms->flags & MAGIC_MIME;
145
146 if (!looks_text)
147 return 0;
148
149 if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
150 return 0;
151
152 if (!csv_parse(uc, ue))
153 return 0;
154
155 if (mime == MAGIC_MIME_ENCODING)
156 return 1;
157
158 if (mime) {
159 if (file_printf(ms, "text/csv") == -1)
160 return -1;
161 return 1;
162 }
163
164 if (file_printf(ms, "CSV %s%stext", code ? code : "",
165 code ? " " : "") == -1)
166 return -1;
167
168 return 1;
169 }
170
171 #else
172
173 #include <sys/types.h>
174 #include <sys/stat.h>
175 #include <stdio.h>
176 #include <fcntl.h>
177 #include <unistd.h>
178 #include <stdlib.h>
179 #include <stdint.h>
180 #include <err.h>
181
/*
 * Stand-alone test driver, built when TEST is defined.
 *
 * Reads the file named by the first command-line argument completely into
 * memory, runs csv_parse() over it and prints the verdict as "is csv N".
 * Any failure terminates the program with EXIT_FAILURE via err()/errx().
 */
int
main(int argc, char *argv[])
{
	int fd;
	struct stat st;
	unsigned char *p;
	size_t len, off;
	ssize_t nr;

	// argv[1] is NULL when no argument was given; refuse early.
	if (argc < 2)
		errx(EXIT_FAILURE, "Usage: is_csv <file>");

	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

	if (fstat(fd, &st) == -1)
		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

	// The size has to be representable as an in-memory object size.
	if (st.st_size < 0 || (uintmax_t)st.st_size > SIZE_MAX)
		errx(EXIT_FAILURE, "Can't handle a file of %jd bytes",
		    (intmax_t)st.st_size);
	len = (size_t)st.st_size;

	// malloc(0) may return NULL without it being an error, so always
	// request at least one byte; this keeps empty files working.
	if ((p = malloc(len != 0 ? len : 1)) == NULL)
		err(EXIT_FAILURE, "Can't allocate %jd bytes",
		    (intmax_t)st.st_size);

	// read() may return fewer bytes than requested; keep going until
	// the whole file is in, and tell a real error (errno is set) apart
	// from a premature end of file (errno is not meaningful).
	for (off = 0; off < len; off += (size_t)nr) {
		nr = read(fd, p + off, len - off);
		if (nr == -1)
			err(EXIT_FAILURE, "Can't read %jd bytes",
			    (intmax_t)st.st_size);
		if (nr == 0)
			errx(EXIT_FAILURE, "Can't read %jd bytes: "
			    "unexpected end of file", (intmax_t)st.st_size);
	}
	(void)close(fd);

	printf("is csv %d\n", csv_parse(p, p + len));
	free(p);
	return 0;
}
204 #endif
205