1 /* $NetBSD: is_csv.c,v 1.4 2022/09/24 20:21:46 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2019 Christos Zoulas 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * Parse CSV object serialization format (RFC-4180, RFC-7111) 31 */ 32 33 #ifndef TEST 34 #include "file.h" 35 36 #ifndef lint 37 #if 0 38 FILE_RCSID("@(#)$File: is_csv.c,v 1.8 2022/09/16 14:15:29 christos Exp $") 39 #else 40 __RCSID("$NetBSD: is_csv.c,v 1.4 2022/09/24 20:21:46 christos Exp $"); 41 #endif 42 #endif 43 44 #include <string.h> 45 #include "magic.h" 46 #else 47 #include <sys/types.h> 48 #endif 49 50 51 #ifdef DEBUG 52 #include <stdio.h> 53 #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__) 54 #else 55 #define DPRINTF(fmt, ...) 56 #endif 57 58 /* 59 * if CSV_LINES == 0: 60 * check all the lines in the buffer 61 * otherwise: 62 * check only up-to the number of lines specified 63 * 64 * the last line count is always ignored if it does not end in CRLF 65 */ 66 #ifndef CSV_LINES 67 #define CSV_LINES 10 68 #endif 69 70 static int csv_parse(const unsigned char *, const unsigned char *); 71 72 static const unsigned char * 73 eatquote(const unsigned char *uc, const unsigned char *ue) 74 { 75 int quote = 0; 76 77 while (uc < ue) { 78 unsigned char c = *uc++; 79 if (c != '"') { 80 // We already got one, done. 81 if (quote) { 82 return --uc; 83 } 84 continue; 85 } 86 if (quote) { 87 // quote-quote escapes 88 quote = 0; 89 continue; 90 } 91 // first quote 92 quote = 1; 93 } 94 return ue; 95 } 96 97 static int 98 csv_parse(const unsigned char *uc, const unsigned char *ue) 99 { 100 size_t nf = 0, tf = 0, nl = 0; 101 102 while (uc < ue) { 103 switch (*uc++) { 104 case '"': 105 // Eat until the matching quote 106 uc = eatquote(uc, ue); 107 break; 108 case ',': 109 nf++; 110 break; 111 case '\n': 112 DPRINTF("%zu %zu %zu\n", nl, nf, tf); 113 nl++; 114 #if CSV_LINES 115 if (nl == CSV_LINES) 116 return tf != 0 && tf == nf; 117 #endif 118 if (tf == 0) { 119 // First time and no fields, give up 120 if (nf == 0) 121 return 0; 122 // First time, set the number of fields 123 tf = nf; 124 } else if (tf != nf) { 125 // Field number mismatch, we are done. 126 return 0; 127 } 128 nf = 0; 129 break; 130 default: 131 break; 132 } 133 } 134 return tf && nl > 2; 135 } 136 137 #ifndef TEST 138 int 139 file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text) 140 { 141 const unsigned char *uc = CAST(const unsigned char *, b->fbuf); 142 const unsigned char *ue = uc + b->flen; 143 int mime = ms->flags & MAGIC_MIME; 144 145 if (!looks_text) 146 return 0; 147 148 if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0) 149 return 0; 150 151 if (!csv_parse(uc, ue)) 152 return 0; 153 154 if (mime == MAGIC_MIME_ENCODING) 155 return 1; 156 157 if (mime) { 158 if (file_printf(ms, "text/csv") == -1) 159 return -1; 160 return 1; 161 } 162 163 if (file_printf(ms, "CSV text") == -1) 164 return -1; 165 166 return 1; 167 } 168 169 #else 170 171 #include <sys/types.h> 172 #include <sys/stat.h> 173 #include <stdio.h> 174 #include <fcntl.h> 175 #include <unistd.h> 176 #include <stdlib.h> 177 #include <stdint.h> 178 #include <err.h> 179 180 int 181 main(int argc, char *argv[]) 182 { 183 int fd; 184 struct stat st; 185 unsigned char *p; 186 187 if ((fd = open(argv[1], O_RDONLY)) == -1) 188 err(EXIT_FAILURE, "Can't open `%s'", argv[1]); 189 190 if (fstat(fd, &st) == -1) 191 err(EXIT_FAILURE, "Can't stat `%s'", argv[1]); 192 193 if ((p = CAST(char *, malloc(st.st_size))) == NULL) 194 err(EXIT_FAILURE, "Can't allocate %jd bytes", 195 (intmax_t)st.st_size); 196 if (read(fd, p, st.st_size) != st.st_size) 197 err(EXIT_FAILURE, "Can't read %jd bytes", 198 (intmax_t)st.st_size); 199 printf("is csv %d\n", csv_parse(p, p + st.st_size)); 200 return 0; 201 } 202 #endif 203