xref: /netbsd-src/external/bsd/file/dist/src/is_csv.c (revision ddb176824c39fb0db5ceef3e9e40dcaa273aec38)
1 /*	$NetBSD: is_csv.c,v 1.5 2023/08/18 19:00:11 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2019 Christos Zoulas
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Parse CSV object serialization format (RFC-4180, RFC-7111)
31  */
32 
33 #ifndef TEST
34 #include "file.h"
35 
36 #ifndef lint
37 #if 0
38 FILE_RCSID("@(#)$File: is_csv.c,v 1.13 2023/07/17 16:08:17 christos Exp $")
39 #else
40 __RCSID("$NetBSD: is_csv.c,v 1.5 2023/08/18 19:00:11 christos Exp $");
41 #endif
42 #endif
43 
44 #include <string.h>
45 #include "magic.h"
46 #else
47 #include <sys/types.h>
48 #endif
49 
50 
51 #ifdef DEBUG
52 #include <stdio.h>
53 #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
54 #else
55 #define DPRINTF(fmt, ...)
56 #endif
57 
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * the field count of the last line is always ignored if that line does
 * not end in a newline (LF)
 */
66 #ifndef CSV_LINES
67 #define CSV_LINES 10
68 #endif
69 
70 static int csv_parse(const unsigned char *, const unsigned char *);
71 
/*
 * Skip the remainder of a quoted field.
 *
 * uc points just past the opening quote, ue one past the end of the
 * buffer.  Inside the field a doubled quote ("") is an escaped quote and
 * does not end it.  Returns a pointer to the first byte following the
 * closing quote, or ue if the buffer runs out first (including when the
 * closing quote is the very last byte).
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	int pending = 0;	/* an unpaired quote was just seen */

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			/*
			 * A lone quote may close the field; a second one
			 * right after it turns the pair into an escape.
			 */
			pending = !pending;
		} else if (pending) {
			/* The previous quote really was the closing one. */
			return p;
		}
	}
	return ue;
}
96 
/*
 * Decide whether the buffer [uc, ue) looks like CSV.
 *
 * Commas outside quoted fields are counted per newline-terminated line.
 * The first line must contain at least one comma, and every later line
 * that is examined must contain the same number.  When CSV_LINES is
 * non-zero the verdict is given as soon as that many lines have been
 * seen; otherwise at least two complete lines are required.
 *
 * Returns 1 if the data looks like CSV, 0 if not.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p = uc;
	size_t commas = 0;	/* separators seen on the current line */
	size_t expected = 0;	/* separators on the first line, 0 = unset */
	size_t lines = 0;	/* complete lines seen so far */

	while (p < ue) {
		const unsigned char c = *p++;

		if (c == '"') {
			// Skip over the quoted field
			p = eatquote(p, ue);
			continue;
		}
		if (c == ',') {
			commas++;
			continue;
		}
		if (c != '\n')
			continue;

		// End of line: compare against the reference count
		DPRINTF("%zu %zu %zu\n", lines, commas, expected);
		lines++;
#if CSV_LINES
		if (lines == CSV_LINES)
			return expected != 0 && expected == commas;
#endif
		if (expected == 0) {
			// A first line without separators is not CSV
			if (commas == 0)
				return 0;
			// The first line sets the reference count
			expected = commas;
		} else if (expected != commas) {
			// Inconsistent number of separators
			return 0;
		}
		commas = 0;
	}
	return expected != 0 && lines >= 2;
}
136 
137 #ifndef TEST
138 int
file_is_csv(struct magic_set * ms,const struct buffer * b,int looks_text,const char * code)139 file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
140     const char *code)
141 {
142 	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
143 	const unsigned char *ue = uc + b->flen;
144 	int mime = ms->flags & MAGIC_MIME;
145 
146 	if (!looks_text)
147 		return 0;
148 
149 	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
150 		return 0;
151 
152 	if (!csv_parse(uc, ue))
153 		return 0;
154 
155 	if (mime == MAGIC_MIME_ENCODING)
156 		return 1;
157 
158 	if (mime) {
159 		if (file_printf(ms, "text/csv") == -1)
160 			return -1;
161 		return 1;
162 	}
163 
164 	if (file_printf(ms, "CSV %s%stext", code ? code : "",
165 	    code ? " " : "") == -1)
166 		return -1;
167 
168 	return 1;
169 }
170 
171 #else
172 
173 #include <sys/types.h>
174 #include <sys/stat.h>
175 #include <stdio.h>
176 #include <fcntl.h>
177 #include <unistd.h>
178 #include <stdlib.h>
179 #include <stdint.h>
180 #include <err.h>
181 
/*
 * Stand-alone test driver, compiled only when TEST is defined.
 *
 * Reads the file named by argv[1] into memory, runs csv_parse() over it
 * and prints the verdict as "is csv 0" or "is csv 1".  Exits with
 * EXIT_FAILURE if the argument is missing or the file cannot be opened,
 * stat'ed or read completely.
 */
int
main(int argc, char *argv[])
{
	int fd;
	struct stat st;
	unsigned char *p;
	size_t len;
	ssize_t nr;

	// argv[1] is NULL when no file name was given
	if (argc < 2)
		errx(EXIT_FAILURE, "Usage: %s <file>",
		    argc > 0 ? argv[0] : "is_csv");

	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

	if (fstat(fd, &st) == -1)
		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

	len = (size_t)st.st_size;

	// malloc(0) may return NULL, so always request at least one byte.
	// No CAST() here: file.h is not included in the TEST build.
	if ((p = malloc(len != 0 ? len : 1)) == NULL)
		err(EXIT_FAILURE, "Can't allocate %jd bytes",
		    (intmax_t)st.st_size);

	if ((nr = read(fd, p, len)) == -1)
		err(EXIT_FAILURE, "Can't read %jd bytes",
		    (intmax_t)st.st_size);
	// A short read does not set errno, so don't report it with err()
	if ((size_t)nr != len)
		errx(EXIT_FAILURE, "Short read: got %jd of %jd bytes",
		    (intmax_t)nr, (intmax_t)st.st_size);
	(void)close(fd);

	printf("is csv %d\n", csv_parse(p, p + len));
	free(p);
	return 0;
}
204 #endif
205