xref: /netbsd-src/external/bsd/file/dist/src/is_csv.c (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
1 /*	$NetBSD: is_csv.c,v 1.1.1.1 2019/12/17 02:23:53 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2019 Christos Zoulas
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Parse CSV object serialization format (RFC-4180, RFC-7111)
31  */
32 
33 #ifndef TEST
34 #include "file.h"
35 
36 #ifndef lint
37 #if 0
38 FILE_RCSID("@(#)$File: is_csv.c,v 1.4 2019/06/26 20:31:31 christos Exp $")
39 #else
40 __RCSID("$NetBSD: is_csv.c,v 1.1.1.1 2019/12/17 02:23:53 christos Exp $");
41 #endif
42 #endif
43 
44 #include <string.h>
45 #include "magic.h"
46 #else
47 #include <sys/types.h>
48 #endif
49 
50 
51 #ifdef DEBUG
52 #include <stdio.h>
53 #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
54 #else
55 #define DPRINTF(fmt, ...)
56 #endif
57 
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * a trailing line that does not end in a newline ('\n') is never checked
 */
66 #ifndef CSV_LINES
67 #define CSV_LINES 10
68 #endif
69 
70 static int csv_parse(const unsigned char *, const unsigned char *);
71 
/*
 * Skip the remainder of a quoted field.  On entry uc points just past the
 * opening quote.  Returns a pointer to the first byte following the closing
 * quote, or ue if the buffer ends first (including when the closing quote
 * is the very last byte, or when the field is never closed).
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	// Non-zero while the previous byte was a quote not yet paired up
	int pending = 0;

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			// A lone quote arms the flag; a second one right
			// after it is a quote-quote escape and disarms it.
			pending = !pending;
		} else if (pending) {
			// The previous quote was the closing one.
			return p;
		}
	}
	return ue;
}
96 
/*
 * Heuristically decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside of quoted sections are counted per line; every line
 * terminated by '\n' must have the same non-zero count as the first one.
 * A trailing line without a terminating '\n' is never checked.
 *
 * Returns non-zero if the buffer looks like CSV, 0 otherwise:
 *  - when CSV_LINES is non-zero and that many lines have been seen, the
 *    answer is whether the line just completed matches the first line;
 *  - otherwise the whole buffer must be consistent and contain more than
 *    two terminated lines.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	// nf: commas seen on the current line
	// tf: commas seen on the first line (0 until the first line ends)
	// nl: number of '\n' terminated lines seen so far
	size_t nf = 0, tf = 0, nl = 0;

	while (uc < ue) {
		switch (*uc++) {
		case '"':
			// Eat until the matching quote, so that commas and
			// newlines inside the quoted section are not counted
			uc = eatquote(uc, ue);
			break;
		case ',':
			nf++;
			break;
		case '\n':
			DPRINTF("%zu %zu %zu\n", nl, nf, tf);
			nl++;
#if CSV_LINES
			// Line budget reached: decide on what we have so far
			if (nl == CSV_LINES)
				return tf != 0 && tf == nf;
#endif
			if (tf == 0) {
				// First time and no fields, give up
				if (nf == 0)
					return 0;
				// First time, set the number of fields
				tf = nf;
			} else if (tf != nf) {
				// Field number mismatch, we are done.
				return 0;
			}
			nf = 0;
			break;
		default:
			break;
		}
	}
	return tf && nl > 2;
}
137 
138 #ifndef TEST
139 int
140 file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text)
141 {
142 	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
143 	const unsigned char *ue = uc + b->flen;
144 	int mime = ms->flags & MAGIC_MIME;
145 
146 	if (!looks_text)
147 		return 0;
148 
149 	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
150 		return 0;
151 
152 	if (!csv_parse(uc, ue))
153 		return 0;
154 
155 	if (mime == MAGIC_MIME_ENCODING)
156 		return 1;
157 
158 	if (mime) {
159 		if (file_printf(ms, "application/csv") == -1)
160 			return -1;
161 		return 1;
162 	}
163 
164 	if (file_printf(ms, "CSV text") == -1)
165 		return -1;
166 
167 	return 1;
168 }
169 
170 #else
171 
172 #include <sys/types.h>
173 #include <sys/stat.h>
174 #include <stdio.h>
175 #include <fcntl.h>
176 #include <unistd.h>
177 #include <stdlib.h>
178 #include <stdint.h>
179 #include <err.h>
180 
/*
 * Test driver (only built when TEST is defined): read the file named by
 * argv[1] into memory and print the result of csv_parse() on its contents.
 * Exits with EXIT_FAILURE if the argument is missing or the file cannot be
 * opened, sized, or read completely.
 */
int
main(int argc, char *argv[])
{
	int fd, rv;
	struct stat st;
	unsigned char *p;
	size_t len, off;
	ssize_t n;

	if (argc < 2) {
		fprintf(stderr, "Usage: %s <file>\n",
		    argc > 0 ? argv[0] : "is_csv");
		return EXIT_FAILURE;
	}

	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

	if (fstat(fd, &st) == -1)
		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

	if (st.st_size < 0 || (uintmax_t)st.st_size > SIZE_MAX)
		errx(EXIT_FAILURE, "Can't handle file size %jd",
		    (intmax_t)st.st_size);
	len = (size_t)st.st_size;

	// malloc(0) may legitimately return NULL; always ask for at least
	// one byte so that an empty file is not reported as an error.
	if ((p = malloc(len != 0 ? len : 1)) == NULL)
		err(EXIT_FAILURE, "Can't allocate %jd bytes",
		    (intmax_t)st.st_size);

	// read(2) may return fewer bytes than requested; keep going until
	// the whole file is in memory.
	for (off = 0; off < len; off += (size_t)n) {
		n = read(fd, p + off, len - off);
		if (n == -1)
			err(EXIT_FAILURE, "Can't read %jd bytes",
			    (intmax_t)st.st_size);
		if (n == 0)
			errx(EXIT_FAILURE,
			    "Short read: got %zu of %zu bytes", off, len);
	}
	(void)close(fd);

	rv = csv_parse(p, p + len);
	printf("is csv %d\n", rv);

	free(p);
	return 0;
}
203 #endif
204