1 /* $OpenBSD: csplit.c,v 1.8 2015/10/11 17:43:03 semarie Exp $ */ 2 /* $FreeBSD: src/usr.bin/csplit/csplit.c,v 1.9 2004/03/22 11:15:03 tjr Exp $ */ 3 4 /*- 5 * Copyright (c) 2002 Tim J. Robbins. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * csplit -- split files based on context 32 * 33 * This utility splits its input into numbered output files by line number 34 * or by a regular expression. Regular expression matches have an optional 35 * offset with them, allowing the split to occur a specified number of 36 * lines before or after the match. 37 * 38 * To handle negative offsets, we stop reading when the match occurs and 39 * store the offset that the file should have been split at, then use 40 * this output file as input until all the "overflowed" lines have been read. 41 * The file is then closed and truncated to the correct length. 42 * 43 * We assume that the output files can be seeked upon (ie. they cannot be 44 * symlinks to named pipes or character devices), but make no such 45 * assumption about the input. 46 */ 47 48 #include <sys/types.h> 49 50 #include <ctype.h> 51 #include <err.h> 52 #include <errno.h> 53 #include <limits.h> 54 #include <locale.h> 55 #include <regex.h> 56 #include <signal.h> 57 #include <stdint.h> 58 #include <stdio.h> 59 #include <stdlib.h> 60 #include <string.h> 61 #include <unistd.h> 62 63 void cleanup(void); 64 void do_lineno(const char *); 65 void do_rexp(const char *); 66 char *get_line(void); 67 void handlesig(int); 68 FILE *newfile(void); 69 void toomuch(FILE *, long); 70 void usage(void); 71 72 /* 73 * Command line options 74 */ 75 const char *prefix; /* File name prefix */ 76 long sufflen; /* Number of decimal digits for suffix */ 77 int sflag; /* Suppress output of file names */ 78 int kflag; /* Keep output if error occurs */ 79 80 /* 81 * Other miscellaneous globals (XXX too many) 82 */ 83 long lineno; /* Current line number in input file */ 84 long reps; /* Number of repetitions for this pattern */ 85 long nfiles; /* Number of files output so far */ 86 long maxfiles; /* Maximum number of files we can create */ 87 char currfile[PATH_MAX]; /* Current output file */ 88 const char *infn; /* Name of the input file */ 89 FILE *infile; /* Input file handle */ 90 FILE *overfile; /* Overflow file for toomuch() */ 91 off_t truncofs; /* Offset this file should be truncated at */ 92 int doclean; /* Should cleanup() remove output? */ 93 94 int 95 main(int argc, char *argv[]) 96 { 97 struct sigaction sa; 98 long i; 99 int ch; 100 const char *expr; 101 char *ep, *p; 102 FILE *ofp; 103 104 setlocale(LC_ALL, ""); 105 106 if (pledge("stdio rpath wpath cpath", NULL) == -1) 107 err(1, "pledge"); 108 109 kflag = sflag = 0; 110 prefix = "xx"; 111 sufflen = 2; 112 while ((ch = getopt(argc, argv, "f:kn:s")) != -1) { 113 switch (ch) { 114 case 'f': 115 prefix = optarg; 116 break; 117 case 'k': 118 kflag = 1; 119 break; 120 case 'n': 121 errno = 0; 122 sufflen = strtol(optarg, &ep, 10); 123 if (sufflen <= 0 || *ep != '\0' || errno != 0) 124 errx(1, "%s: bad suffix length", optarg); 125 break; 126 case 's': 127 sflag = 1; 128 break; 129 default: 130 usage(); 131 /*NOTREACHED*/ 132 } 133 } 134 135 if (sufflen + strlen(prefix) >= PATH_MAX) 136 errx(1, "name too long"); 137 138 argc -= optind; 139 argv += optind; 140 141 if ((infn = *argv++) == NULL) 142 usage(); 143 if (strcmp(infn, "-") == 0) { 144 infile = stdin; 145 infn = "stdin"; 146 } else if ((infile = fopen(infn, "r")) == NULL) 147 err(1, "%s", infn); 148 149 if (!kflag) { 150 doclean = 1; 151 atexit(cleanup); 152 sa.sa_flags = 0; 153 sa.sa_handler = handlesig; 154 sigemptyset(&sa.sa_mask); 155 sigaddset(&sa.sa_mask, SIGHUP); 156 sigaddset(&sa.sa_mask, SIGINT); 157 sigaddset(&sa.sa_mask, SIGTERM); 158 sigaction(SIGHUP, &sa, NULL); 159 sigaction(SIGINT, &sa, NULL); 160 sigaction(SIGTERM, &sa, NULL); 161 } 162 163 lineno = 0; 164 nfiles = 0; 165 truncofs = 0; 166 overfile = NULL; 167 168 /* Ensure 10^sufflen < LONG_MAX. */ 169 for (maxfiles = 1, i = 0; i < sufflen; i++) { 170 if (maxfiles > LONG_MAX / 10) 171 errx(1, "%ld: suffix too long (limit %ld)", 172 sufflen, i); 173 maxfiles *= 10; 174 } 175 176 /* Create files based on supplied patterns. */ 177 while (nfiles < maxfiles - 1 && (expr = *argv++) != NULL) { 178 /* Look ahead & see if this pattern has any repetitions. */ 179 if (*argv != NULL && **argv == '{') { 180 errno = 0; 181 reps = strtol(*argv + 1, &ep, 10); 182 if (reps < 0 || *ep != '}' || errno != 0) 183 errx(1, "%s: bad repetition count", *argv + 1); 184 argv++; 185 } else 186 reps = 0; 187 188 if (*expr == '/' || *expr == '%') { 189 do { 190 do_rexp(expr); 191 } while (reps-- != 0 && nfiles < maxfiles - 1); 192 } else if (isdigit((unsigned char)*expr)) 193 do_lineno(expr); 194 else 195 errx(1, "%s: unrecognised pattern", expr); 196 } 197 198 /* Copy the rest into a new file. */ 199 if (!feof(infile)) { 200 ofp = newfile(); 201 while ((p = get_line()) != NULL && fputs(p, ofp) == 0) 202 ; 203 if (!sflag) 204 printf("%jd\n", (intmax_t)ftello(ofp)); 205 if (fclose(ofp) != 0) 206 err(1, "%s", currfile); 207 } 208 209 toomuch(NULL, 0); 210 doclean = 0; 211 212 return (0); 213 } 214 215 void 216 usage(void) 217 { 218 extern char *__progname; 219 220 fprintf(stderr, 221 "usage: %s [-ks] [-f prefix] [-n number] file args ...\n", 222 __progname); 223 exit(1); 224 } 225 226 /* ARGSUSED */ 227 void 228 handlesig(int sig) 229 { 230 const char msg[] = "csplit: caught signal, cleaning up\n"; 231 232 write(STDERR_FILENO, msg, sizeof(msg) - 1); 233 cleanup(); 234 _exit(2); 235 } 236 237 /* Create a new output file. */ 238 FILE * 239 newfile(void) 240 { 241 FILE *fp; 242 243 if ((size_t)snprintf(currfile, sizeof(currfile), "%s%0*ld", prefix, 244 (int)sufflen, nfiles) >= sizeof(currfile)) 245 errc(1, ENAMETOOLONG, "%s", currfile); 246 if ((fp = fopen(currfile, "w+")) == NULL) 247 err(1, "%s", currfile); 248 nfiles++; 249 250 return (fp); 251 } 252 253 /* Remove partial output, called before exiting. */ 254 void 255 cleanup(void) 256 { 257 char fnbuf[PATH_MAX]; 258 long i; 259 260 if (!doclean) 261 return; 262 263 /* 264 * NOTE: One cannot portably assume to be able to call snprintf() from 265 * inside a signal handler. It is, however, safe to do on OpenBSD. 266 */ 267 for (i = 0; i < nfiles; i++) { 268 snprintf(fnbuf, sizeof(fnbuf), "%s%0*ld", prefix, 269 (int)sufflen, i); 270 unlink(fnbuf); 271 } 272 } 273 274 /* Read a line from the input into a static buffer. */ 275 char * 276 get_line(void) 277 { 278 static char lbuf[LINE_MAX]; 279 FILE *src; 280 281 src = overfile != NULL ? overfile : infile; 282 283 again: if (fgets(lbuf, sizeof(lbuf), src) == NULL) { 284 if (src == overfile) { 285 src = infile; 286 goto again; 287 } 288 return (NULL); 289 } 290 if (ferror(src)) 291 err(1, "%s", infn); 292 lineno++; 293 294 return (lbuf); 295 } 296 297 /* Conceptually rewind the input (as obtained by get_line()) back `n' lines. */ 298 void 299 toomuch(FILE *ofp, long n) 300 { 301 char buf[BUFSIZ]; 302 size_t i, nread; 303 304 if (overfile != NULL) { 305 /* 306 * Truncate the previous file we overflowed into back to 307 * the correct length, close it. 308 */ 309 if (fflush(overfile) != 0) 310 err(1, "overflow"); 311 if (ftruncate(fileno(overfile), truncofs) != 0) 312 err(1, "overflow"); 313 if (fclose(overfile) != 0) 314 err(1, "overflow"); 315 overfile = NULL; 316 } 317 318 if (n == 0) 319 /* Just tidying up */ 320 return; 321 322 lineno -= n; 323 324 /* 325 * Wind the overflow file backwards to `n' lines before the 326 * current one. 327 */ 328 do { 329 if (ftello(ofp) < (off_t)sizeof(buf)) 330 rewind(ofp); 331 else 332 fseeko(ofp, -(off_t)sizeof(buf), SEEK_CUR); 333 if (ferror(ofp)) 334 errx(1, "%s: can't seek", currfile); 335 if ((nread = fread(buf, 1, sizeof(buf), ofp)) == 0) 336 errx(1, "can't read overflowed output"); 337 if (fseeko(ofp, -(off_t)nread, SEEK_CUR) != 0) 338 err(1, "%s", currfile); 339 for (i = 1; i <= nread; i++) 340 if (buf[nread - i] == '\n' && n-- == 0) 341 break; 342 if (ftello(ofp) == 0) 343 break; 344 } while (n > 0); 345 if (fseeko(ofp, (off_t)(nread - i + 1), SEEK_CUR) != 0) 346 err(1, "%s", currfile); 347 348 /* 349 * get_line() will read from here. Next call will truncate to 350 * truncofs in this file. 351 */ 352 overfile = ofp; 353 truncofs = ftello(overfile); 354 } 355 356 /* Handle splits for /regexp/ and %regexp% patterns. */ 357 void 358 do_rexp(const char *expr) 359 { 360 regex_t cre; 361 intmax_t nwritten; 362 long ofs; 363 int first; 364 char *ecopy, *ep, *p, *pofs, *re; 365 FILE *ofp; 366 367 if ((ecopy = strdup(expr)) == NULL) 368 err(1, "strdup"); 369 370 re = ecopy + 1; 371 if ((pofs = strrchr(ecopy, *expr)) == NULL || pofs[-1] == '\\') 372 errx(1, "%s: missing trailing %c", expr, *expr); 373 *pofs++ = '\0'; 374 375 if (*pofs != '\0') { 376 errno = 0; 377 ofs = strtol(pofs, &ep, 10); 378 if (*ep != '\0' || errno != 0) 379 errx(1, "%s: bad offset", pofs); 380 } else 381 ofs = 0; 382 383 if (regcomp(&cre, re, REG_BASIC|REG_NOSUB) != 0) 384 errx(1, "%s: bad regular expression", re); 385 386 if (*expr == '/') 387 /* /regexp/: Save results to a file. */ 388 ofp = newfile(); 389 else { 390 /* %regexp%: Make a temporary file for overflow. */ 391 if ((ofp = tmpfile()) == NULL) 392 err(1, "tmpfile"); 393 } 394 395 /* Read and output lines until we get a match. */ 396 first = 1; 397 while ((p = get_line()) != NULL) { 398 if (fputs(p, ofp) != 0) 399 break; 400 if (!first && regexec(&cre, p, 0, NULL, 0) == 0) 401 break; 402 first = 0; 403 } 404 405 if (p == NULL) 406 errx(1, "%s: no match", re); 407 408 if (ofs <= 0) { 409 /* 410 * Negative (or zero) offset: throw back any lines we should 411 * not have read yet. 412 */ 413 if (p != NULL) { 414 toomuch(ofp, -ofs + 1); 415 nwritten = (intmax_t)truncofs; 416 } else 417 nwritten = (intmax_t)ftello(ofp); 418 } else { 419 /* 420 * Positive offset: copy the requested number of lines 421 * after the match. 422 */ 423 while (--ofs > 0 && (p = get_line()) != NULL) 424 fputs(p, ofp); 425 toomuch(NULL, 0); 426 nwritten = (intmax_t)ftello(ofp); 427 if (fclose(ofp) != 0) 428 err(1, "%s", currfile); 429 } 430 431 if (!sflag && *expr == '/') 432 printf("%jd\n", nwritten); 433 434 regfree(&cre); 435 free(ecopy); 436 } 437 438 /* Handle splits based on line number. */ 439 void 440 do_lineno(const char *expr) 441 { 442 long lastline, tgtline; 443 char *ep, *p; 444 FILE *ofp; 445 446 errno = 0; 447 tgtline = strtol(expr, &ep, 10); 448 if (tgtline <= 0 || errno != 0 || *ep != '\0') 449 errx(1, "%s: bad line number", expr); 450 lastline = tgtline; 451 if (lastline <= lineno) 452 errx(1, "%s: can't go backwards", expr); 453 454 while (nfiles < maxfiles - 1) { 455 ofp = newfile(); 456 while (lineno + 1 != lastline) { 457 if ((p = get_line()) == NULL) 458 errx(1, "%ld: out of range", lastline); 459 if (fputs(p, ofp) != 0) 460 break; 461 } 462 if (!sflag) 463 printf("%jd\n", (intmax_t)ftello(ofp)); 464 if (fclose(ofp) != 0) 465 err(1, "%s", currfile); 466 if (reps-- == 0) 467 break; 468 lastline += tgtline; 469 } 470 } 471