1 /* $NetBSD: csplit.c,v 1.1 2006/09/25 19:21:42 christos Exp $ */ 2 /* $FreeBSD: src/usr.bin/csplit/csplit.c,v 1.9 2004/03/22 11:15:03 tjr Exp$ */ 3 4 /*- 5 * Copyright (c) 2002 Tim J. Robbins. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * csplit -- split files based on context 32 * 33 * This utility splits its input into numbered output files by line number 34 * or by a regular expression. Regular expression matches have an optional 35 * offset with them, allowing the split to occur a specified number of 36 * lines before or after the match. 37 * 38 * To handle negative offsets, we stop reading when the match occurs and 39 * store the offset that the file should have been split at, then use 40 * this output file as input until all the "overflowed" lines have been read. 41 * The file is then closed and truncated to the correct length. 42 * 43 * We assume that the output files can be seeked upon (ie. they cannot be 44 * symlinks to named pipes or character devices), but make no such 45 * assumption about the input. 46 */ 47 48 #include <sys/cdefs.h> 49 #ifndef lint 50 __RCSID("$NetBSD: csplit.c,v 1.1 2006/09/25 19:21:42 christos Exp $"); 51 #endif 52 53 #include <sys/types.h> 54 55 #include <ctype.h> 56 #include <err.h> 57 #include <errno.h> 58 #include <limits.h> 59 #include <locale.h> 60 #include <regex.h> 61 #include <signal.h> 62 #include <stdint.h> 63 #include <stdio.h> 64 #include <stdlib.h> 65 #include <string.h> 66 #include <unistd.h> 67 68 static void cleanup(void); 69 static void do_lineno(const char *); 70 static void do_rexp(const char *); 71 static char *getline(void); 72 static void handlesig(int); 73 static FILE *newfile(void); 74 static void toomuch(FILE *, long); 75 static void usage(void) __attribute__((__noreturn__)); 76 77 /* 78 * Command line options 79 */ 80 const char *prefix; /* File name prefix */ 81 long sufflen; /* Number of decimal digits for suffix */ 82 int sflag; /* Suppress output of file names */ 83 int kflag; /* Keep output if error occurs */ 84 85 /* 86 * Other miscellaneous globals (XXX too many) 87 */ 88 long lineno; /* Current line number in input file */ 89 long reps; /* Number of repetitions for this pattern */ 90 long nfiles; /* Number of files output so far */ 91 long maxfiles; /* Maximum number of files we can create */ 92 char currfile[PATH_MAX]; /* Current output file */ 93 const char *infn; /* Name of the input file */ 94 FILE *infile; /* Input file handle */ 95 FILE *overfile; /* Overflow file for toomuch() */ 96 off_t truncofs; /* Offset this file should be truncated at */ 97 int doclean; /* Should cleanup() remove output? */ 98 99 int 100 main(int argc, char *argv[]) 101 { 102 struct sigaction sa; 103 long i; 104 int ch; 105 const char *expr; 106 char *ep, *p; 107 FILE *ofp; 108 109 (void)setlocale(LC_ALL, ""); 110 111 kflag = sflag = 0; 112 prefix = "xx"; 113 sufflen = 2; 114 while ((ch = getopt(argc, argv, "ksf:n:")) > 0) { 115 switch (ch) { 116 case 'f': 117 prefix = optarg; 118 break; 119 case 'k': 120 kflag = 1; 121 break; 122 case 'n': 123 errno = 0; 124 sufflen = strtol(optarg, &ep, 10); 125 if (sufflen <= 0 || *ep != '\0' || errno != 0) 126 errx(1, "%s: bad suffix length", optarg); 127 break; 128 case 's': 129 sflag = 1; 130 break; 131 default: 132 usage(); 133 /*NOTREACHED*/ 134 } 135 } 136 137 if (sufflen + strlen(prefix) >= PATH_MAX) 138 errx(1, "name too long"); 139 140 argc -= optind; 141 argv += optind; 142 143 if ((infn = *argv++) == NULL) 144 usage(); 145 if (strcmp(infn, "-") == 0) { 146 infile = stdin; 147 infn = "stdin"; 148 } else if ((infile = fopen(infn, "r")) == NULL) 149 err(1, "%s", infn); 150 151 if (!kflag) { 152 doclean = 1; 153 (void)atexit(cleanup); 154 sa.sa_flags = 0; 155 sa.sa_handler = handlesig; 156 (void)sigemptyset(&sa.sa_mask); 157 (void)sigaddset(&sa.sa_mask, SIGHUP); 158 (void)sigaddset(&sa.sa_mask, SIGINT); 159 (void)sigaddset(&sa.sa_mask, SIGTERM); 160 (void)sigaction(SIGHUP, &sa, NULL); 161 (void)sigaction(SIGINT, &sa, NULL); 162 (void)sigaction(SIGTERM, &sa, NULL); 163 } 164 165 lineno = 0; 166 nfiles = 0; 167 truncofs = 0; 168 overfile = NULL; 169 170 /* Ensure 10^sufflen < LONG_MAX. */ 171 for (maxfiles = 1, i = 0; i < sufflen; i++) { 172 if (maxfiles > LONG_MAX / 10) 173 errx(1, "%ld: suffix too long (limit %ld)", 174 sufflen, i); 175 maxfiles *= 10; 176 } 177 178 /* Create files based on supplied patterns. */ 179 while (nfiles < maxfiles - 1 && (expr = *argv++) != NULL) { 180 /* Look ahead & see if this pattern has any repetitions. */ 181 if (*argv != NULL && **argv == '{') { 182 errno = 0; 183 reps = strtol(*argv + 1, &ep, 10); 184 if (reps < 0 || *ep != '}' || errno != 0) 185 errx(1, "%s: bad repetition count", *argv + 1); 186 argv++; 187 } else 188 reps = 0; 189 190 if (*expr == '/' || *expr == '%') { 191 do 192 do_rexp(expr); 193 while (reps-- != 0 && nfiles < maxfiles - 1); 194 } else if (isdigit((unsigned char)*expr)) 195 do_lineno(expr); 196 else 197 errx(1, "%s: unrecognised pattern", expr); 198 } 199 200 /* Copy the rest into a new file. */ 201 if (!feof(infile)) { 202 ofp = newfile(); 203 while ((p = getline()) != NULL && fputs(p, ofp) == 0) 204 ; 205 if (!sflag) 206 (void)printf("%jd\n", (intmax_t)ftello(ofp)); 207 if (fclose(ofp) != 0) 208 err(1, "%s", currfile); 209 } 210 211 toomuch(NULL, 0L); 212 doclean = 0; 213 214 return (0); 215 } 216 217 static void 218 usage(void) 219 { 220 221 (void)fprintf(stderr, 222 "Usage: %s [-ks] [-f prefix] [-n number] file args ...\n", getprogname()); 223 exit(1); 224 } 225 226 static void 227 handlesig(int sig) 228 { 229 char msg[BUFSIZ]; 230 size_t len; 231 232 len = snprintf(msg, sizeof(msg), "%s: Caught %s, cleaning up\n", 233 getprogname(), strsignal(sig)); 234 if (len < sizeof(msg)) 235 (void)write(STDERR_FILENO, msg, len); 236 cleanup(); 237 _exit(2); 238 } 239 240 /* Create a new output file. */ 241 static FILE * 242 newfile(void) 243 { 244 FILE *fp; 245 246 if ((size_t)snprintf(currfile, sizeof(currfile), "%s%0*ld", prefix, 247 (int)sufflen, nfiles) >= sizeof(currfile)) 248 errx(1, "%s: %s", currfile, strerror(ENAMETOOLONG)); 249 if ((fp = fopen(currfile, "w+")) == NULL) 250 err(1, "%s", currfile); 251 nfiles++; 252 253 return (fp); 254 } 255 256 /* Remove partial output, called before exiting. */ 257 static void 258 cleanup(void) 259 { 260 char fnbuf[PATH_MAX]; 261 long i; 262 263 if (!doclean) 264 return; 265 266 /* 267 * NOTE: One cannot portably assume to be able to call snprintf() 268 * from inside a signal handler. It does, however, appear to be safe 269 * to do on FreeBSD and NetBSD. The solution to this problem is worse 270 * than the problem itself. 271 */ 272 273 for (i = 0; i < nfiles; i++) { 274 (void)snprintf(fnbuf, sizeof(fnbuf), "%s%0*ld", prefix, 275 (int)sufflen, i); 276 (void)unlink(fnbuf); 277 } 278 } 279 280 /* Read a line from the input into a static buffer. */ 281 static char * 282 getline(void) 283 { 284 static char lbuf[LINE_MAX]; 285 FILE *src; 286 287 src = overfile != NULL ? overfile : infile; 288 289 again: if (fgets(lbuf, sizeof(lbuf), src) == NULL) { 290 if (src == overfile) { 291 src = infile; 292 goto again; 293 } 294 return (NULL); 295 } 296 if (ferror(src)) 297 err(1, "%s", infn); 298 lineno++; 299 300 return (lbuf); 301 } 302 303 /* Conceptually rewind the input (as obtained by getline()) back `n' lines. */ 304 static void 305 toomuch(FILE *ofp, long n) 306 { 307 char buf[BUFSIZ]; 308 size_t i, nread; 309 310 if (overfile != NULL) { 311 /* 312 * Truncate the previous file we overflowed into back to 313 * the correct length, close it. 314 */ 315 if (fflush(overfile) != 0) 316 err(1, "overflow"); 317 if (ftruncate(fileno(overfile), truncofs) != 0) 318 err(1, "overflow"); 319 if (fclose(overfile) != 0) 320 err(1, "overflow"); 321 overfile = NULL; 322 } 323 324 if (n == 0) 325 /* Just tidying up */ 326 return; 327 328 lineno -= n; 329 330 /* 331 * Wind the overflow file backwards to `n' lines before the 332 * current one. 333 */ 334 do { 335 if (ftello(ofp) < (off_t)sizeof(buf)) 336 rewind(ofp); 337 else 338 (void)fseeko(ofp, -(off_t)sizeof(buf), SEEK_CUR); 339 if (ferror(ofp)) 340 errx(1, "%s: can't seek", currfile); 341 if ((nread = fread(buf, 1, sizeof(buf), ofp)) == 0) 342 errx(1, "can't read overflowed output"); 343 if (fseeko(ofp, -(off_t)nread, SEEK_CUR) != 0) 344 err(1, "%s", currfile); 345 for (i = 1; i <= nread; i++) 346 if (buf[nread - i] == '\n' && n-- == 0) 347 break; 348 if (ftello(ofp) == 0) 349 break; 350 } while (n > 0); 351 if (fseeko(ofp, (off_t)nread - i + 1, SEEK_CUR) != 0) 352 err(1, "%s", currfile); 353 354 /* 355 * getline() will read from here. Next call will truncate to 356 * truncofs in this file. 357 */ 358 overfile = ofp; 359 truncofs = ftello(overfile); 360 } 361 362 /* Handle splits for /regexp/ and %regexp% patterns. */ 363 static void 364 do_rexp(const char *expr) 365 { 366 regex_t cre; 367 intmax_t nwritten; 368 long ofs; 369 int first; 370 char *ecopy, *ep, *p, *pofs, *re; 371 FILE *ofp; 372 373 if ((ecopy = strdup(expr)) == NULL) 374 err(1, "strdup"); 375 376 re = ecopy + 1; 377 if ((pofs = strrchr(ecopy, *expr)) == NULL || pofs[-1] == '\\') 378 errx(1, "%s: missing trailing %c", expr, *expr); 379 *pofs++ = '\0'; 380 381 if (*pofs != '\0') { 382 errno = 0; 383 ofs = strtol(pofs, &ep, 10); 384 if (*ep != '\0' || errno != 0) 385 errx(1, "%s: bad offset", pofs); 386 } else 387 ofs = 0; 388 389 if (regcomp(&cre, re, REG_BASIC|REG_NOSUB) != 0) 390 errx(1, "%s: bad regular expression", re); 391 392 if (*expr == '/') 393 /* /regexp/: Save results to a file. */ 394 ofp = newfile(); 395 else { 396 /* %regexp%: Make a temporary file for overflow. */ 397 if ((ofp = tmpfile()) == NULL) 398 err(1, "tmpfile"); 399 } 400 401 /* Read and output lines until we get a match. */ 402 first = 1; 403 while ((p = getline()) != NULL) { 404 if (fputs(p, ofp) != 0) 405 break; 406 if (!first && regexec(&cre, p, 0, NULL, 0) == 0) 407 break; 408 first = 0; 409 } 410 411 if (p == NULL) 412 errx(1, "%s: no match", re); 413 414 if (ofs <= 0) { 415 /* 416 * Negative (or zero) offset: throw back any lines we should 417 * not have read yet. 418 */ 419 if (p != NULL) { 420 toomuch(ofp, -ofs + 1); 421 nwritten = (intmax_t)truncofs; 422 } else 423 nwritten = (intmax_t)ftello(ofp); 424 } else { 425 /* 426 * Positive offset: copy the requested number of lines 427 * after the match. 428 */ 429 while (--ofs > 0 && (p = getline()) != NULL) 430 if (fputs(p, ofp) != 0) 431 break; 432 toomuch(NULL, 0L); 433 nwritten = (intmax_t)ftello(ofp); 434 if (fclose(ofp) != 0) 435 err(1, "%s", currfile); 436 } 437 438 if (!sflag && *expr == '/') 439 (void)printf("%jd\n", nwritten); 440 441 regfree(&cre); 442 free(ecopy); 443 } 444 445 /* Handle splits based on line number. */ 446 static void 447 do_lineno(const char *expr) 448 { 449 long lastline, tgtline; 450 char *ep, *p; 451 FILE *ofp; 452 453 errno = 0; 454 tgtline = strtol(expr, &ep, 10); 455 if (tgtline <= 0 || errno != 0 || *ep != '\0') 456 errx(1, "%s: bad line number", expr); 457 lastline = tgtline; 458 if (lastline <= lineno) 459 errx(1, "%s: can't go backwards", expr); 460 461 while (nfiles < maxfiles - 1) { 462 ofp = newfile(); 463 while (lineno + 1 != lastline) { 464 if ((p = getline()) == NULL) 465 errx(1, "%ld: out of range", lastline); 466 if (fputs(p, ofp) != 0) 467 break; 468 } 469 if (!sflag) 470 (void)printf("%jd\n", (intmax_t)ftello(ofp)); 471 if (fclose(ofp) != 0) 472 err(1, "%s", currfile); 473 if (reps-- == 0) 474 break; 475 lastline += tgtline; 476 } 477 } 478