16fca56fbSSascha Wildner /*-
26fca56fbSSascha Wildner * Copyright (c) 2019 Christos Zoulas
36fca56fbSSascha Wildner * All rights reserved.
46fca56fbSSascha Wildner *
56fca56fbSSascha Wildner * Redistribution and use in source and binary forms, with or without
66fca56fbSSascha Wildner * modification, are permitted provided that the following conditions
76fca56fbSSascha Wildner * are met:
86fca56fbSSascha Wildner * 1. Redistributions of source code must retain the above copyright
96fca56fbSSascha Wildner * notice, this list of conditions and the following disclaimer.
106fca56fbSSascha Wildner * 2. Redistributions in binary form must reproduce the above copyright
116fca56fbSSascha Wildner * notice, this list of conditions and the following disclaimer in the
126fca56fbSSascha Wildner * documentation and/or other materials provided with the distribution.
136fca56fbSSascha Wildner *
146fca56fbSSascha Wildner * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
156fca56fbSSascha Wildner * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
166fca56fbSSascha Wildner * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
176fca56fbSSascha Wildner * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
186fca56fbSSascha Wildner * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
196fca56fbSSascha Wildner * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
206fca56fbSSascha Wildner * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
216fca56fbSSascha Wildner * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
226fca56fbSSascha Wildner * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
236fca56fbSSascha Wildner * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
246fca56fbSSascha Wildner * POSSIBILITY OF SUCH DAMAGE.
256fca56fbSSascha Wildner */
266fca56fbSSascha Wildner
276fca56fbSSascha Wildner /*
286fca56fbSSascha Wildner * Parse CSV object serialization format (RFC-4180, RFC-7111)
296fca56fbSSascha Wildner */
306fca56fbSSascha Wildner
316fca56fbSSascha Wildner #ifndef TEST
326fca56fbSSascha Wildner #include "file.h"
336fca56fbSSascha Wildner
346fca56fbSSascha Wildner #ifndef lint
35*3b9cdfa3SAntonio Huete Jimenez FILE_RCSID("@(#)$File: is_csv.c,v 1.7 2022/05/28 00:44:22 christos Exp $")
366fca56fbSSascha Wildner #endif
376fca56fbSSascha Wildner
386fca56fbSSascha Wildner #include <string.h>
396fca56fbSSascha Wildner #include "magic.h"
406fca56fbSSascha Wildner #else
416fca56fbSSascha Wildner #include <sys/types.h>
426fca56fbSSascha Wildner #endif
436fca56fbSSascha Wildner
446fca56fbSSascha Wildner
/*
 * Debug tracing: DPRINTF() forwards to printf() when DEBUG is defined.
 * Otherwise it expands to a no-op expression, so its arguments are not
 * evaluated and it can still be used anywhere a statement is expected.
 */
#ifdef DEBUG
#include <stdio.h>
#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
#else
#define DPRINTF(fmt, ...) ((void)0)
#endif
516fca56fbSSascha Wildner
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * a trailing line that is not terminated by a newline is never counted
 * and its fields are ignored
 */
#ifndef CSV_LINES
#define CSV_LINES 10
#endif
636fca56fbSSascha Wildner
646fca56fbSSascha Wildner static int csv_parse(const unsigned char *, const unsigned char *);
656fca56fbSSascha Wildner
/*
 * Skip the remainder of a quoted field.
 *
 * uc points just past the opening double quote and ue one past the last
 * byte of the buffer.  Two consecutive double quotes inside the field are
 * an escaped quote and do not terminate it.
 *
 * Returns a pointer to the first byte after the closing quote, or ue if
 * the buffer ends first (this includes a closing quote that is the very
 * last byte of the buffer).
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	// Set while the previous byte was a quote that has not (yet) been
	// paired with a second one to form a quote-quote escape.
	int quote = 0;

	for (; uc < ue; uc++) {
		if (*uc == '"') {
			// Either the first quote of a possible pair, or the
			// second one completing a quote-quote escape.
			quote = !quote;
			continue;
		}
		// A lone quote followed by anything else closed the field.
		if (quote)
			return uc;
	}
	return ue;
}
906fca56fbSSascha Wildner
/*
 * Decide whether the buffer [uc, ue) looks like CSV.
 *
 * Commas outside quoted fields are counted per line, and every
 * newline-terminated line must contain the same, non-zero number of commas
 * as the first one.  Quoted fields are skipped with eatquote(), so commas
 * and newlines inside them are not counted.
 *
 * When CSV_LINES is non-zero the verdict is given on the CSV_LINES'th
 * newline.  If the buffer ends before that (or CSV_LINES is 0), it must
 * hold more than two complete lines; a trailing line without a newline is
 * ignored.
 *
 * Returns non-zero if the buffer looks like CSV, 0 otherwise.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	// nf: commas seen on the current line
	// tf: commas seen on the first line (0 until that line is complete)
	// nl: number of complete lines seen
	size_t nf = 0, tf = 0, nl = 0;

	while (uc < ue) {
		switch (*uc++) {
		case '"':
			// Eat until the matching quote
			uc = eatquote(uc, ue);
			break;
		case ',':
			nf++;
			break;
		case '\n':
			DPRINTF("%zu %zu %zu\n", nl, nf, tf);
			nl++;
#if CSV_LINES
			// Enough lines examined: decide on this one
			if (nl == CSV_LINES)
				return tf != 0 && tf == nf;
#endif
			if (tf == 0) {
				// First time and no fields, give up
				if (nf == 0)
					return 0;
				// First time, set the number of fields
				tf = nf;
			} else if (tf != nf) {
				// Field number mismatch, we are done.
				return 0;
			}
			nf = 0;
			break;
		default:
			break;
		}
	}
	return tf && nl > 2;
}
1306fca56fbSSascha Wildner
1316fca56fbSSascha Wildner #ifndef TEST
1326fca56fbSSascha Wildner int
file_is_csv(struct magic_set * ms,const struct buffer * b,int looks_text)1336fca56fbSSascha Wildner file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text)
1346fca56fbSSascha Wildner {
1356fca56fbSSascha Wildner const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
1366fca56fbSSascha Wildner const unsigned char *ue = uc + b->flen;
1376fca56fbSSascha Wildner int mime = ms->flags & MAGIC_MIME;
1386fca56fbSSascha Wildner
1396fca56fbSSascha Wildner if (!looks_text)
1406fca56fbSSascha Wildner return 0;
1416fca56fbSSascha Wildner
1426fca56fbSSascha Wildner if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
1436fca56fbSSascha Wildner return 0;
1446fca56fbSSascha Wildner
1456fca56fbSSascha Wildner if (!csv_parse(uc, ue))
1466fca56fbSSascha Wildner return 0;
1476fca56fbSSascha Wildner
1486fca56fbSSascha Wildner if (mime == MAGIC_MIME_ENCODING)
1496fca56fbSSascha Wildner return 1;
1506fca56fbSSascha Wildner
1516fca56fbSSascha Wildner if (mime) {
152970935fdSSascha Wildner if (file_printf(ms, "text/csv") == -1)
1536fca56fbSSascha Wildner return -1;
1546fca56fbSSascha Wildner return 1;
1556fca56fbSSascha Wildner }
1566fca56fbSSascha Wildner
1576fca56fbSSascha Wildner if (file_printf(ms, "CSV text") == -1)
1586fca56fbSSascha Wildner return -1;
1596fca56fbSSascha Wildner
1606fca56fbSSascha Wildner return 1;
1616fca56fbSSascha Wildner }
1626fca56fbSSascha Wildner
1636fca56fbSSascha Wildner #else
1646fca56fbSSascha Wildner
1656fca56fbSSascha Wildner #include <sys/types.h>
1666fca56fbSSascha Wildner #include <sys/stat.h>
1676fca56fbSSascha Wildner #include <stdio.h>
1686fca56fbSSascha Wildner #include <fcntl.h>
1696fca56fbSSascha Wildner #include <unistd.h>
1706fca56fbSSascha Wildner #include <stdlib.h>
1716fca56fbSSascha Wildner #include <stdint.h>
1726fca56fbSSascha Wildner #include <err.h>
1736fca56fbSSascha Wildner
/*
 * Stand-alone test driver (TEST build only): reads the file named by
 * argv[1] into memory and prints the verdict of csv_parse() on it.
 * Exits with EXIT_FAILURE on any usage, I/O or allocation error.
 */
int
main(int argc, char *argv[])
{
	int fd;
	struct stat st;
	unsigned char *p;
	size_t len;
	ssize_t nr;

	if (argc != 2)
		errx(EXIT_FAILURE, "Expected exactly one file argument");

	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

	if (fstat(fd, &st) == -1)
		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

	len = (size_t)st.st_size;
	// malloc(0) may return NULL without that being an error, so always
	// request at least one byte.  file.h (and thus CAST) is not included
	// in the TEST build, so the result is assigned without a cast.
	if ((p = malloc(len != 0 ? len : 1)) == NULL)
		err(EXIT_FAILURE, "Can't allocate %jd bytes",
		    (intmax_t)st.st_size);

	nr = read(fd, p, len);
	if (nr == -1)
		err(EXIT_FAILURE, "Can't read %jd bytes",
		    (intmax_t)st.st_size);
	// A short read does not set errno, so do not report it via err()
	if ((size_t)nr != len)
		errx(EXIT_FAILURE, "Short read: got %jd of %jd bytes",
		    (intmax_t)nr, (intmax_t)st.st_size);
	(void)close(fd);

	printf("is csv %d\n", csv_parse(p, p + len));
	free(p);
	return 0;
}
1966fca56fbSSascha Wildner #endif
197