1 /* $NetBSD: join.c,v 1.26 2006/01/04 01:44:06 perry Exp $ */ 2 3 /*- 4 * Copyright (c) 1991 The Regents of the University of California. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Steve Hayman of Indiana University, Michiro Hikida and David 9 * Goodenough. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include <sys/cdefs.h> 37 #ifndef lint 38 __COPYRIGHT( 39 "@(#) Copyright (c) 1991 The Regents of the University of California.\n\ 40 All rights reserved.\n"); 41 #endif /* not lint */ 42 43 #ifndef lint 44 #if 0 45 static char sccsid[] = "from: @(#)join.c 5.1 (Berkeley) 11/18/91"; 46 #else 47 __RCSID("$NetBSD: join.c,v 1.26 2006/01/04 01:44:06 perry Exp $"); 48 #endif 49 #endif /* not lint */ 50 51 #include <sys/types.h> 52 #include <ctype.h> 53 #include <err.h> 54 #include <errno.h> 55 #include <stdio.h> 56 #include <stdlib.h> 57 #include <string.h> 58 #include <unistd.h> 59 60 /* 61 * There's a structure per input file which encapsulates the state of the 62 * file. We repeatedly read lines from each file until we've read in all 63 * the consecutive lines from the file with a common join field. Then we 64 * compare the set of lines with an equivalent set from the other file. 65 */ 66 typedef struct { 67 char *line; /* line */ 68 u_long linealloc; /* line allocated count */ 69 char **fields; /* line field(s) */ 70 u_long fieldcnt; /* line field(s) count */ 71 u_long fieldalloc; /* line field(s) allocated count */ 72 } LINE; 73 74 LINE noline = {"", 0, 0, 0, 0}; /* arg for outfield if no line to output */ 75 76 typedef struct { 77 FILE *fp; /* file descriptor */ 78 u_long joinf; /* join field (-1, -2, -j) */ 79 int unpair; /* output unpairable lines (-a) */ 80 int number; /* 1 for file 1, 2 for file 2 */ 81 82 LINE *set; /* set of lines with same field */ 83 u_long pushback; /* line on the stack */ 84 u_long setcnt; /* set count */ 85 u_long setalloc; /* set allocated count */ 86 } INPUT; 87 INPUT input1 = { NULL, 0, 0, 1, NULL, -1, 0, 0, }, 88 input2 = { NULL, 0, 0, 2, NULL, -1, 0, 0, }; 89 90 typedef struct { 91 u_long fileno; /* file number */ 92 u_long fieldno; /* field number */ 93 } OLIST; 94 OLIST *olist; /* output field list */ 95 u_long olistcnt; /* output field list count */ 96 u_long olistalloc; /* output field allocated count */ 97 98 int joinout = 1; /* show lines with matched join fields (-v) */ 99 int needsep; /* need separator character */ 100 int spans = 1; /* span multiple delimiters (-t) */ 101 char *empty; /* empty field replacement string (-e) */ 102 char *tabchar = " \t"; /* delimiter characters (-t) */ 103 104 int cmp(LINE *, u_long, LINE *, u_long); 105 void enomem(void); 106 void fieldarg(char *); 107 void joinlines(INPUT *, INPUT *); 108 int main(int, char **); 109 void obsolete(char **); 110 void outfield(LINE *, u_long); 111 void outoneline(INPUT *, LINE *); 112 void outtwoline(INPUT *, LINE *, INPUT *, LINE *); 113 void slurp(INPUT *); 114 void usage(void); 115 116 int 117 main(int argc, char *argv[]) 118 { 119 INPUT *F1, *F2; 120 int aflag, ch, cval, vflag; 121 char *end; 122 123 F1 = &input1; 124 F2 = &input2; 125 126 aflag = vflag = 0; 127 obsolete(argv); 128 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) { 129 switch (ch) { 130 case '\01': 131 aflag = 1; 132 F1->unpair = F2->unpair = 1; 133 break; 134 case '1': 135 if ((F1->joinf = strtol(optarg, &end, 10)) < 1) { 136 warnx("-1 option field number less than 1"); 137 usage(); 138 } 139 if (*end) { 140 warnx("illegal field number -- %s", optarg); 141 usage(); 142 } 143 --F1->joinf; 144 break; 145 case '2': 146 if ((F2->joinf = strtol(optarg, &end, 10)) < 1) { 147 warnx("-2 option field number less than 1"); 148 usage(); 149 } 150 if (*end) { 151 warnx("illegal field number -- %s", optarg); 152 usage(); 153 } 154 --F2->joinf; 155 break; 156 case 'a': 157 aflag = 1; 158 switch(strtol(optarg, &end, 10)) { 159 case 1: 160 F1->unpair = 1; 161 break; 162 case 2: 163 F2->unpair = 1; 164 break; 165 default: 166 warnx("-a option file number not 1 or 2"); 167 usage(); 168 break; 169 } 170 if (*end) { 171 warnx("illegal file number -- %s", optarg); 172 usage(); 173 } 174 break; 175 case 'e': 176 empty = optarg; 177 break; 178 case 'j': 179 if ((F1->joinf = F2->joinf = 180 strtol(optarg, &end, 10)) < 1) { 181 warnx("-j option field number less than 1"); 182 usage(); 183 } 184 if (*end) { 185 warnx("illegal field number -- %s", optarg); 186 usage(); 187 } 188 --F1->joinf; 189 --F2->joinf; 190 break; 191 case 'o': 192 fieldarg(optarg); 193 break; 194 case 't': 195 spans = 0; 196 if (strlen(tabchar = optarg) != 1) { 197 warnx("illegal tab character specification"); 198 usage(); 199 } 200 break; 201 case 'v': 202 vflag = 1; 203 joinout = 0; 204 switch(strtol(optarg, &end, 10)) { 205 case 1: 206 F1->unpair = 1; 207 break; 208 case 2: 209 F2->unpair = 1; 210 break; 211 default: 212 warnx("-v option file number not 1 or 2"); 213 usage(); 214 break; 215 } 216 if (*end) { 217 warnx("illegal file number -- %s", optarg); 218 usage(); 219 } 220 break; 221 case '?': 222 default: 223 usage(); 224 } 225 } 226 argc -= optind; 227 argv += optind; 228 229 if (aflag && vflag) 230 errx(1, "-a and -v options mutually exclusive"); 231 232 if (argc != 2) 233 usage(); 234 235 /* Open the files; "-" means stdin. */ 236 if (!strcmp(*argv, "-")) 237 F1->fp = stdin; 238 else if ((F1->fp = fopen(*argv, "r")) == NULL) 239 err(1, "%s", *argv); 240 ++argv; 241 if (!strcmp(*argv, "-")) 242 F2->fp = stdin; 243 else if ((F2->fp = fopen(*argv, "r")) == NULL) 244 err(1, "%s", *argv); 245 if (F1->fp == stdin && F2->fp == stdin) 246 errx(1, "only one input file may be stdin"); 247 248 slurp(F1); 249 slurp(F2); 250 while (F1->setcnt && F2->setcnt) { 251 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf); 252 if (cval == 0) { 253 /* Oh joy, oh rapture, oh beauty divine! */ 254 if (joinout) 255 joinlines(F1, F2); 256 slurp(F1); 257 slurp(F2); 258 } else if (cval < 0) { 259 /* File 1 takes the lead... */ 260 if (F1->unpair) 261 joinlines(F1, NULL); 262 slurp(F1); 263 } else { 264 /* File 2 takes the lead... */ 265 if (F2->unpair) 266 joinlines(F2, NULL); 267 slurp(F2); 268 } 269 } 270 271 /* 272 * Now that one of the files is used up, optionally output any 273 * remaining lines from the other file. 274 */ 275 if (F1->unpair) 276 while (F1->setcnt) { 277 joinlines(F1, NULL); 278 slurp(F1); 279 } 280 if (F2->unpair) 281 while (F2->setcnt) { 282 joinlines(F2, NULL); 283 slurp(F2); 284 } 285 exit(0); 286 } 287 288 void 289 slurp(INPUT *F) 290 { 291 LINE *lp; 292 LINE tmp; 293 LINE *nline; 294 size_t len; 295 int cnt; 296 char *bp, *fieldp; 297 u_long nsize; 298 299 /* 300 * Read all of the lines from an input file that have the same 301 * join field. 302 */ 303 for (F->setcnt = 0;; ++F->setcnt) { 304 /* 305 * If we're out of space to hold line structures, allocate 306 * more. Initialize the structure so that we know that this 307 * is new space. 308 */ 309 if (F->setcnt == F->setalloc) { 310 cnt = F->setalloc; 311 if (F->setalloc == 0) 312 nsize = 64; 313 else 314 nsize = F->setalloc << 1; 315 if ((nline = realloc(F->set, 316 nsize * sizeof(LINE))) == NULL) 317 enomem(); 318 F->set = nline; 319 F->setalloc = nsize; 320 memset(F->set + cnt, 0, 321 (F->setalloc - cnt) * sizeof(LINE)); 322 } 323 324 /* 325 * Get any pushed back line, else get the next line. Allocate 326 * space as necessary. If taking the line from the stack swap 327 * the two structures so that we don't lose the allocated space. 328 * This could be avoided by doing another level of indirection, 329 * but it's probably okay as is. 330 */ 331 lp = &F->set[F->setcnt]; 332 if (F->pushback != -1) { 333 tmp = F->set[F->setcnt]; 334 F->set[F->setcnt] = F->set[F->pushback]; 335 F->set[F->pushback] = tmp; 336 F->pushback = -1; 337 continue; 338 } 339 if ((bp = fgetln(F->fp, &len)) == NULL) 340 return; 341 if (lp->linealloc <= len + 1) { 342 char *n; 343 344 if (lp->linealloc == 0) 345 nsize = 128; 346 else 347 nsize = lp->linealloc; 348 while (nsize <= len + 1) 349 nsize <<= 1; 350 if ((n = realloc(lp->line, 351 nsize * sizeof(char))) == NULL) 352 enomem(); 353 lp->line = n; 354 lp->linealloc = nsize; 355 } 356 memmove(lp->line, bp, len); 357 358 /* Replace trailing newline, if it exists. */ 359 if (bp[len - 1] == '\n') 360 lp->line[len - 1] = '\0'; 361 else 362 lp->line[len] = '\0'; 363 bp = lp->line; 364 365 /* Split the line into fields, allocate space as necessary. */ 366 lp->fieldcnt = 0; 367 while ((fieldp = strsep(&bp, tabchar)) != NULL) { 368 if (spans && *fieldp == '\0') 369 continue; 370 if (lp->fieldcnt == lp->fieldalloc) { 371 char **n; 372 373 if (lp->fieldalloc == 0) 374 nsize = 16; 375 else 376 nsize = lp->fieldalloc << 1; 377 if ((n = realloc(lp->fields, 378 nsize * sizeof(char *))) == NULL) 379 enomem(); 380 lp->fields = n; 381 lp->fieldalloc = nsize; 382 } 383 lp->fields[lp->fieldcnt++] = fieldp; 384 } 385 386 /* See if the join field value has changed. */ 387 if (F->setcnt && cmp(lp, F->joinf, lp - 1, F->joinf)) { 388 F->pushback = F->setcnt; 389 break; 390 } 391 } 392 } 393 394 int 395 cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2) 396 { 397 398 if (lp1->fieldcnt <= fieldno1) 399 return (lp2->fieldcnt <= fieldno2 ? 0 : 1); 400 if (lp2->fieldcnt <= fieldno2) 401 return (-1); 402 return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2])); 403 } 404 405 void 406 joinlines(INPUT *F1, INPUT *F2) 407 { 408 int cnt1, cnt2; 409 410 /* 411 * Output the results of a join comparison. The output may be from 412 * either file 1 or file 2 (in which case the first argument is the 413 * file from which to output) or from both. 414 */ 415 if (F2 == NULL) { 416 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 417 outoneline(F1, &F1->set[cnt1]); 418 return; 419 } 420 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 421 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2) 422 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]); 423 } 424 425 void 426 outoneline(INPUT *F, LINE *lp) 427 { 428 int cnt; 429 430 /* 431 * Output a single line from one of the files, according to the 432 * join rules. This happens when we are writing unmatched single 433 * lines. Output empty fields in the right places. 434 */ 435 if (olist) 436 for (cnt = 0; cnt < olistcnt; ++cnt) { 437 if (olist[cnt].fileno == F->number) 438 outfield(lp, olist[cnt].fieldno); 439 else 440 outfield(&noline, 1); 441 } 442 else 443 for (cnt = 0; cnt < lp->fieldcnt; ++cnt) 444 outfield(lp, cnt); 445 (void)printf("\n"); 446 if (ferror(stdout)) 447 err(1, "stdout"); 448 needsep = 0; 449 } 450 451 void 452 outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2) 453 { 454 int cnt; 455 456 /* Output a pair of lines according to the join list (if any). */ 457 if (olist) { 458 for (cnt = 0; cnt < olistcnt; ++cnt) 459 if (olist[cnt].fileno == 1) 460 outfield(lp1, olist[cnt].fieldno); 461 else /* if (olist[cnt].fileno == 2) */ 462 outfield(lp2, olist[cnt].fieldno); 463 } else { 464 /* 465 * Output the join field, then the remaining fields from F1 466 * and F2. 467 */ 468 outfield(lp1, F1->joinf); 469 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt) 470 if (F1->joinf != cnt) 471 outfield(lp1, cnt); 472 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt) 473 if (F2->joinf != cnt) 474 outfield(lp2, cnt); 475 } 476 (void)printf("\n"); 477 if (ferror(stdout)) 478 err(1, "stdout"); 479 needsep = 0; 480 } 481 482 void 483 outfield(LINE *lp, u_long fieldno) 484 { 485 if (needsep++) 486 (void)printf("%c", *tabchar); 487 if (!ferror(stdout)) { 488 if (lp->fieldcnt <= fieldno) { 489 if (empty != NULL) 490 (void)printf("%s", empty); 491 } else { 492 if (*lp->fields[fieldno] == '\0') 493 return; 494 (void)printf("%s", lp->fields[fieldno]); 495 } 496 } 497 if (ferror(stdout)) 498 err(1, "stdout"); 499 } 500 501 /* 502 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output 503 * fields. 504 */ 505 void 506 fieldarg(char *option) 507 { 508 u_long fieldno; 509 char *end, *token; 510 OLIST *n; 511 512 while ((token = strsep(&option, ", \t")) != NULL) { 513 if (*token == '\0') 514 continue; 515 if ((token[0] != '1' && token[0] != '2') || token[1] != '.') 516 errx(1, "malformed -o option field"); 517 fieldno = strtol(token + 2, &end, 10); 518 if (*end) 519 errx(1, "malformed -o option field"); 520 if (fieldno == 0) 521 errx(1, "field numbers are 1 based"); 522 if (olistcnt == olistalloc) { 523 if ((n = realloc(olist, 524 (olistalloc + 50) * sizeof(OLIST))) == NULL) 525 enomem(); 526 olist = n; 527 olistalloc += 50; 528 } 529 olist[olistcnt].fileno = token[0] - '0'; 530 olist[olistcnt].fieldno = fieldno - 1; 531 ++olistcnt; 532 } 533 } 534 535 void 536 obsolete(char **argv) 537 { 538 int len; 539 char **p, *ap, *t; 540 541 while ((ap = *++argv) != NULL) { 542 /* Return if "--". */ 543 if (ap[0] == '-' && ap[1] == '-') 544 return; 545 switch (ap[1]) { 546 case 'a': 547 /* 548 * The original join allowed "-a", which meant the 549 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2 550 * only specifies this as "-a 1" and "a -2", so we 551 * have to use another option flag, one that is 552 * unlikely to ever be used or accidentally entered 553 * on the command line. (Well, we could reallocate 554 * the argv array, but that hardly seems worthwhile.) 555 */ 556 if (ap[2] == '\0') 557 ap[1] = '\01'; 558 break; 559 case 'j': 560 /* 561 * The original join allowed "-j[12] arg" and "-j arg". 562 * Convert the former to "-[12] arg". Don't convert 563 * the latter since getopt(3) can handle it. 564 */ 565 switch(ap[2]) { 566 case '1': 567 if (ap[3] != '\0') 568 goto jbad; 569 ap[1] = '1'; 570 ap[2] = '\0'; 571 break; 572 case '2': 573 if (ap[3] != '\0') 574 goto jbad; 575 ap[1] = '2'; 576 ap[2] = '\0'; 577 break; 578 case '\0': 579 break; 580 default: 581 jbad: errx(1, "illegal option -- %s", ap); 582 usage(); 583 } 584 break; 585 case 'o': 586 /* 587 * The original join allowed "-o arg arg". Convert to 588 * "-o arg -o arg". 589 */ 590 if (ap[2] != '\0') 591 break; 592 for (p = argv + 2; *p; ++p) { 593 if ((p[0][0] != '1' && p[0][0] != '2') || 594 p[0][1] != '.') 595 break; 596 len = strlen(*p); 597 if (len - 2 != strspn(*p + 2, "0123456789")) 598 break; 599 if ((t = malloc(len + 3)) == NULL) 600 enomem(); 601 t[0] = '-'; 602 t[1] = 'o'; 603 memmove(t + 2, *p, len + 1); 604 *p = t; 605 } 606 argv = p - 1; 607 break; 608 } 609 } 610 } 611 612 void 613 enomem(void) 614 { 615 errx(1, "no memory"); 616 } 617 618 void 619 usage(void) 620 { 621 (void)fprintf(stderr, 622 "usage: %s [-a fileno | -v fileno] [-e string] [-j fileno field]\n" 623 " [-o list] [-t char] [-1 field] [-2 field] file1 file2\n", 624 getprogname()); 625 exit(1); 626 } 627