xref: /openbsd-src/usr.bin/join/join.c (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1 /* $OpenBSD: join.c,v 1.27 2015/10/09 01:37:07 deraadt Exp $	*/
2 
3 /*-
4  * Copyright (c) 1991, 1993, 1994
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Steve Hayman of the Computer Science Department, Indiana University,
9  * Michiro Hikida and David Goodenough.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #include <err.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 
42 #define MAXIMUM(a, b)	(((a) > (b)) ? (a) : (b))
43 
44 /*
45  * There's a structure per input file which encapsulates the state of the
46  * file.  We repeatedly read lines from each file until we've read in all
47  * the consecutive lines from the file with a common join field.  Then we
48  * compare the set of lines with an equivalent set from the other file.
49  */
50 typedef struct {
51 	char *line;			/* line */
52 	u_long linealloc;		/* line allocated count */
53 	char **fields;			/* line field(s) */
54 	u_long fieldcnt;		/* line field(s) count */
55 	u_long fieldalloc;		/* line field(s) allocated count */
56 	u_long cfieldc;			/* current field count */
57 	long	 fpos;			/* fpos of start of field */
58 } LINE;
59 
60 typedef struct {
61 	FILE *fp;			/* file descriptor */
62 	u_long joinf;			/* join field (-1, -2, -j) */
63 	int unpair;			/* output unpairable lines (-a) */
64 	u_long number;			/* 1 for file 1, 2 for file 2 */
65 	LINE *set;			/* set of lines with same field */
66 	int pushbool;			/* if pushback is set */
67 	u_long pushback;		/* line on the stack */
68 	u_long setcnt;			/* set count */
69 	u_long setalloc;		/* set allocated count */
70 	u_long setusedc;		/* sets used  */
71 } INPUT;
72 INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0, 0 },
73       input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0, 0 };
74 
75 typedef struct {
76 	u_long	filenum;	/* file number */
77 	u_long	fieldno;	/* field number */
78 } OLIST;
79 OLIST *olist;			/* output field list */
80 u_long olistcnt;		/* output field list count */
81 u_long olistalloc;		/* output field allocated count */
82 
83 int joinout = 1;		/* show lines with matched join fields (-v) */
84 int needsep;			/* need separator character */
85 int spans = 1;			/* span multiple delimiters (-t) */
86 char *empty;			/* empty field replacement string (-e) */
87 char *tabchar = " \t";		/* delimiter characters (-t) */
88 
89 int  cmp(LINE *, u_long, LINE *, u_long);
90 void fieldarg(char *);
91 void joinlines(INPUT *, INPUT *);
92 void obsolete(char **);
93 void outfield(LINE *, u_long, int);
94 void outoneline(INPUT *, LINE *);
95 void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
96 void slurp(INPUT *);
97 void slurpit(INPUT *);
98 void usage(void);
99 
100 int
101 main(int argc, char *argv[])
102 {
103 	INPUT *F1, *F2;
104 	int aflag, ch, cval, vflag;
105 	char *end;
106 
107 	if (pledge("stdio rpath", NULL) == -1)
108 		err(1, "pledge");
109 
110 	F1 = &input1;
111 	F2 = &input2;
112 
113 	aflag = vflag = 0;
114 	obsolete(argv);
115 	while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) {
116 		switch (ch) {
117 		case '\01':		/* See comment in obsolete(). */
118 			aflag = 1;
119 			F1->unpair = F2->unpair = 1;
120 			break;
121 		case '1':
122 			if ((F1->joinf = strtol(optarg, &end, 10)) < 1)
123 				errx(1, "-1 option field number less than 1");
124 			if (*end)
125 				errx(1, "illegal field number -- %s", optarg);
126 			--F1->joinf;
127 			break;
128 		case '2':
129 			if ((F2->joinf = strtol(optarg, &end, 10)) < 1)
130 				errx(1, "-2 option field number less than 1");
131 			if (*end)
132 				errx(1, "illegal field number -- %s", optarg);
133 			--F2->joinf;
134 			break;
135 		case 'a':
136 			aflag = 1;
137 			switch(strtol(optarg, &end, 10)) {
138 			case 1:
139 				F1->unpair = 1;
140 				break;
141 			case 2:
142 				F2->unpair = 1;
143 				break;
144 			default:
145 				errx(1, "-a option file number not 1 or 2");
146 				break;
147 			}
148 			if (*end)
149 				errx(1, "illegal file number -- %s", optarg);
150 			break;
151 		case 'e':
152 			empty = optarg;
153 			break;
154 		case 'j':
155 			if ((F1->joinf = F2->joinf = strtol(optarg, &end, 10)) < 1)
156 				errx(1, "-j option field number less than 1");
157 			if (*end)
158 				errx(1, "illegal field number -- %s", optarg);
159 			--F1->joinf;
160 			--F2->joinf;
161 			break;
162 		case 'o':
163 			fieldarg(optarg);
164 			break;
165 		case 't':
166 			spans = 0;
167 			if (strlen(tabchar = optarg) != 1)
168 				errx(1, "illegal tab character specification");
169 			break;
170 		case 'v':
171 			vflag = 1;
172 			joinout = 0;
173 			switch (strtol(optarg, &end, 10)) {
174 			case 1:
175 				F1->unpair = 1;
176 				break;
177 			case 2:
178 				F2->unpair = 1;
179 				break;
180 			default:
181 				errx(1, "-v option file number not 1 or 2");
182 				break;
183 			}
184 			if (*end)
185 				errx(1, "illegal file number -- %s", optarg);
186 			break;
187 		case '?':
188 		default:
189 			usage();
190 		}
191 	}
192 	argc -= optind;
193 	argv += optind;
194 
195 	if (aflag && vflag)
196 		errx(1, "the -a and -v options are mutually exclusive");
197 
198 	if (argc != 2)
199 		usage();
200 
201 	/* Open the files; "-" means stdin. */
202 	if (!strcmp(*argv, "-"))
203 		F1->fp = stdin;
204 	else if ((F1->fp = fopen(*argv, "r")) == NULL)
205 		err(1, "%s", *argv);
206 	++argv;
207 	if (!strcmp(*argv, "-"))
208 		F2->fp = stdin;
209 	else if ((F2->fp = fopen(*argv, "r")) == NULL)
210 		err(1, "%s", *argv);
211 	if (F1->fp == stdin && F2->fp == stdin)
212 		errx(1, "only one input file may be stdin");
213 
214 	if (pledge("stdio", NULL) == -1)
215 		err(1, "pledge");
216 
217 	F1->setusedc = 0;
218 	F2->setusedc = 0;
219 	slurp(F1);
220 	slurp(F2);
221 	F1->set->cfieldc = 0;
222 	F2->set->cfieldc = 0;
223 
224 	/*
225 	 * We try to let the files have the same field value, advancing
226 	 * whoever falls behind and always advancing the file(s) we output
227 	 * from.
228 	*/
229 	while (F1->setcnt && F2->setcnt) {
230 		cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
231 		if (cval == 0) {
232 			/* Oh joy, oh rapture, oh beauty divine! */
233 			if (joinout)
234 				joinlines(F1, F2);
235 			slurp(F1);
236 			slurp(F2);
237 		}
238 		else {
239 			if (F1->unpair
240 			&& (cval < 0 || F2->set->cfieldc == F2->setusedc -1)) {
241 				joinlines(F1, NULL);
242 				slurp(F1);
243 			}
244 			else if (cval < 0)
245 				/* File 1 takes the lead... */
246 				slurp(F1);
247 			if (F2->unpair
248 			&& (cval > 0 || F1->set->cfieldc == F1->setusedc -1)) {
249 				joinlines(F2, NULL);
250 				slurp(F2);
251 			}
252 			else if (cval > 0)
253 				/* File 2 takes the lead... */
254 				slurp(F2);
255 		}
256 	}
257 
258 	/*
259 	 * Now that one of the files is used up, optionally output any
260 	 * remaining lines from the other file.
261 	 */
262 	if (F1->unpair)
263 		while (F1->setcnt) {
264 			joinlines(F1, NULL);
265 			slurp(F1);
266 		}
267 	if (F2->unpair)
268 		while (F2->setcnt) {
269 			joinlines(F2, NULL);
270 			slurp(F2);
271 		}
272 
273 	return 0;
274 }
275 
276 /* wrapper around slurpit() to keep track of what field we are on */
277 void slurp(INPUT *F)
278 {
279 	long fpos;
280 	u_long cfieldc;
281 
282 	if (F->set == NULL) {
283 		fpos = 0;
284 		cfieldc = 0;
285 	}
286 	else {
287 		fpos = F->set->fpos;
288 		cfieldc = F->set->cfieldc;
289 	}
290 	slurpit(F);
291 	if (F->set == NULL)
292 		return;
293 	else if (fpos != F->set->fpos)
294 		F->set->cfieldc = cfieldc+1;
295 }
296 
297 void
298 slurpit(INPUT *F)
299 {
300 	LINE *lp, *lastlp, tmp;
301 	size_t len;
302 	u_long cnt;
303 	char *bp, *fieldp;
304 	long fpos;
305 	/*
306 	 * Read all of the lines from an input file that have the same
307 	 * join field.
308 	 */
309 
310 	F->setcnt = 0;
311 	for (lastlp = NULL; ; ++F->setcnt, lastlp = lp) {
312 		/*
313 		 * If we're out of space to hold line structures, allocate
314 		 * more.  Initialize the structure so that we know that this
315 		 * is new space.
316 		 */
317 		if (F->setcnt == F->setalloc) {
318 			LINE *p;
319 			u_long newsize = F->setalloc + 50;
320 			cnt = F->setalloc;
321 			if ((p = reallocarray(F->set, newsize, sizeof(LINE)))
322 			    == NULL)
323 				err(1, NULL);
324 			F->set = p;
325 			F->setalloc = newsize;
326 			memset(F->set + cnt, 0, 50 * sizeof(LINE));
327 			/* re-set lastlp in case it moved */
328 			if (lastlp != NULL)
329 				lastlp = &F->set[F->setcnt - 1];
330 		}
331 		/*
332 		 * Get any pushed back line, else get the next line.  Allocate
333 		 * space as necessary.  If taking the line from the stack swap
334 		 * the two structures so that we don't lose space allocated to
335 		 * either structure.  This could be avoided by doing another
336 		 * level of indirection, but it's probably okay as is.
337 		 */
338 		lp = &F->set[F->setcnt];
339 		if (F->pushbool) {
340 			tmp = F->set[F->setcnt];
341 			F->set[F->setcnt] = F->set[F->pushback];
342 			F->set[F->pushback] = tmp;
343 			F->pushbool = 0;
344 			continue;
345 		}
346 		if ((bp = fgetln(F->fp, &len)) == NULL)
347 			return;
348 		/*
349 		 * we depend on knowing on what field we are, one safe way is
350 		 * the file position.
351 		*/
352 		fpos = ftell(F->fp) - len;
353 		if (lp->linealloc <= len + 1) {
354 			char *p;
355 			u_long newsize = lp->linealloc +
356 			    MAXIMUM(100, len + 1 - lp->linealloc);
357 			if ((p = realloc(lp->line, newsize)) == NULL)
358 				err(1, NULL);
359 			lp->line = p;
360 			lp->linealloc = newsize;
361 		}
362 		F->setusedc++;
363 		memmove(lp->line, bp, len);
364 		lp->fpos = fpos;
365 		/* Replace trailing newline, if it exists. */
366 		if (bp[len - 1] == '\n')
367 			lp->line[len - 1] = '\0';
368 		else
369 			lp->line[len] = '\0';
370 		bp = lp->line;
371 
372 		/* Split the line into fields, allocate space as necessary. */
373 		lp->fieldcnt = 0;
374 		while ((fieldp = strsep(&bp, tabchar)) != NULL) {
375 			if (spans && *fieldp == '\0')
376 				continue;
377 			if (lp->fieldcnt == lp->fieldalloc) {
378 				char **p;
379 				u_long newsize = lp->fieldalloc + 50;
380 				if ((p = reallocarray(lp->fields, newsize,
381 				    sizeof(char *))) == NULL)
382 					err(1, NULL);
383 				lp->fields = p;
384 				lp->fieldalloc = newsize;
385 			}
386 			lp->fields[lp->fieldcnt++] = fieldp;
387 		}
388 
389 		/* See if the join field value has changed. */
390 		if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
391 			F->pushbool = 1;
392 			F->pushback = F->setcnt;
393 			break;
394 		}
395 	}
396 }
397 
398 int
399 cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2)
400 {
401 	if (lp1->fieldcnt <= fieldno1)
402 		return (-1);
403 	else if (lp2->fieldcnt <= fieldno2)
404 		return (1);
405 	return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2]));
406 }
407 
408 void
409 joinlines(INPUT *F1, INPUT *F2)
410 {
411 	u_long cnt1, cnt2;
412 
413 	/*
414 	 * Output the results of a join comparison.  The output may be from
415 	 * either file 1 or file 2 (in which case the first argument is the
416 	 * file from which to output) or from both.
417 	 */
418 	if (F2 == NULL) {
419 		for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
420 			outoneline(F1, &F1->set[cnt1]);
421 		return;
422 	}
423 	for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
424 		for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
425 			outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
426 }
427 
428 void
429 outoneline(INPUT *F, LINE *lp)
430 {
431 	u_long cnt;
432 
433 	/*
434 	 * Output a single line from one of the files, according to the
435 	 * join rules.  This happens when we are writing unmatched single
436 	 * lines.  Output empty fields in the right places.
437 	 */
438 	if (olist)
439 		for (cnt = 0; cnt < olistcnt; ++cnt) {
440 			if (olist[cnt].filenum == F->number)
441 				outfield(lp, olist[cnt].fieldno, 0);
442 			else if (olist[cnt].filenum == 0)
443 				outfield(lp, F->joinf, 0);
444 			else
445 				outfield(lp, 0, 1);
446 		}
447 	else {
448 		/*
449 		 * Output the join field, then the remaining fields from F
450 		 */
451 		outfield(lp, F->joinf, 0);
452 		for (cnt = 0; cnt < lp->fieldcnt; ++cnt)
453 			if (F->joinf != cnt)
454 				outfield(lp, cnt, 0);
455 	}
456 
457 	putchar('\n');
458 	if (ferror(stdout))
459 		err(1, "stdout");
460 	needsep = 0;
461 }
462 
463 void
464 outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2)
465 {
466 	u_long cnt;
467 
468 	/* Output a pair of lines according to the join list (if any). */
469 	if (olist) {
470 		for (cnt = 0; cnt < olistcnt; ++cnt)
471 			if (olist[cnt].filenum == 0) {
472 				if (lp1->fieldcnt >= F1->joinf)
473 					outfield(lp1, F1->joinf, 0);
474 				else
475 					outfield(lp2, F2->joinf, 0);
476 			} else if (olist[cnt].filenum == 1)
477 				outfield(lp1, olist[cnt].fieldno, 0);
478 			else /* if (olist[cnt].filenum == 2) */
479 				outfield(lp2, olist[cnt].fieldno, 0);
480 	} else {
481 		/*
482 		 * Output the join field, then the remaining fields from F1
483 		 * and F2.
484 		 */
485 		outfield(lp1, F1->joinf, 0);
486 		for (cnt = 0; cnt < lp1->fieldcnt; ++cnt)
487 			if (F1->joinf != cnt)
488 				outfield(lp1, cnt, 0);
489 		for (cnt = 0; cnt < lp2->fieldcnt; ++cnt)
490 			if (F2->joinf != cnt)
491 				outfield(lp2, cnt, 0);
492 	}
493 	putchar('\n');
494 	if (ferror(stdout))
495 		err(1, "stdout");
496 	needsep = 0;
497 }
498 
499 void
500 outfield(LINE *lp, u_long fieldno, int out_empty)
501 {
502 	if (needsep++)
503 		putchar((int)*tabchar);
504 	if (!ferror(stdout)) {
505 		if (lp->fieldcnt <= fieldno || out_empty) {
506 			if (empty != NULL)
507 				fputs(empty, stdout);
508 		} else {
509 			if (*lp->fields[fieldno] == '\0')
510 				return;
511 			fputs(lp->fields[fieldno], stdout);
512 		}
513 	}
514 	if (ferror(stdout))
515 		err(1, "stdout");
516 }
517 
518 /*
519  * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
520  * fields.
521  */
522 void
523 fieldarg(char *option)
524 {
525 	u_long fieldno, filenum;
526 	char *end, *token;
527 
528 	while ((token = strsep(&option, ", \t")) != NULL) {
529 		if (*token == '\0')
530 			continue;
531 		if (token[0] == '0')
532 			filenum = fieldno = 0;
533 		else if ((token[0] == '1' || token[0] == '2') &&
534 		    token[1] == '.') {
535 			filenum = token[0] - '0';
536 			fieldno = strtol(token + 2, &end, 10);
537 			if (*end)
538 				errx(1, "malformed -o option field");
539 			if (fieldno == 0)
540 				errx(1, "field numbers are 1 based");
541 			--fieldno;
542 		} else
543 			errx(1, "malformed -o option field");
544 		if (olistcnt == olistalloc) {
545 			OLIST *p;
546 			u_long newsize = olistalloc + 50;
547 			if ((p = reallocarray(olist, newsize, sizeof(OLIST)))
548 			    == NULL)
549 				err(1, NULL);
550 			olist = p;
551 			olistalloc = newsize;
552 		}
553 		olist[olistcnt].filenum = filenum;
554 		olist[olistcnt].fieldno = fieldno;
555 		++olistcnt;
556 	}
557 }
558 
559 void
560 obsolete(char **argv)
561 {
562 	size_t len;
563 	char **p, *ap, *t;
564 
565 	while ((ap = *++argv) != NULL) {
566 		/* Return if "--". */
567 		if (ap[0] == '-' && ap[1] == '-')
568 			return;
569 		/* skip if not an option */
570 		if (ap[0] != '-')
571 			continue;
572 		switch (ap[1]) {
573 		case 'a':
574 			/*
575 			 * The original join allowed "-a", which meant the
576 			 * same as -a1 plus -a2.  POSIX 1003.2, Draft 11.2
577 			 * only specifies this as "-a 1" and "a -2", so we
578 			 * have to use another option flag, one that is
579 			 * unlikely to ever be used or accidentally entered
580 			 * on the command line.  (Well, we could reallocate
581 			 * the argv array, but that hardly seems worthwhile.)
582 			 */
583 			if (ap[2] == '\0' && (argv[1] == NULL ||
584 			    (strcmp(argv[1], "1") != 0 &&
585 			    strcmp(argv[1], "2") != 0))) {
586 				ap[1] = '\01';
587 				warnx("-a option used without an argument; "
588 				    "reverting to historical behavior");
589 			}
590 			break;
591 		case 'j':
592 			/*
593 			 * The original join allowed "-j[12] arg" and "-j arg".
594 			 * Convert the former to "-[12] arg".  Don't convert
595 			 * the latter since getopt(3) can handle it.
596 			 */
597 			switch(ap[2]) {
598 			case '1':
599 			case '2':
600 				if (ap[3] != '\0')
601 					goto jbad;
602 				ap[1] = ap[2];
603 				ap[2] = '\0';
604 				break;
605 			case '\0':
606 				break;
607 			default:
608 jbad:				warnx("unknown option -- %s", ap + 1);
609 				usage();
610 			}
611 			break;
612 		case 'o':
613 			/*
614 			 * The original join allowed "-o arg arg".
615 			 * Convert to "-o arg -o arg".
616 			 */
617 			if (ap[2] != '\0' || argv[1] == NULL)
618 				break;
619 			for (p = argv + 2; *p != NULL; ++p) {
620 				if (p[0][0] == '0' || ((p[0][0] != '1' &&
621 				    p[0][0] != '2') || p[0][1] != '.'))
622 					break;
623 				len = strlen(*p);
624 				if (len - 2 != strspn(*p + 2, "0123456789"))
625 					break;
626 				if ((t = malloc(len + 3)) == NULL)
627 					err(1, NULL);
628 				t[0] = '-';
629 				t[1] = 'o';
630 				memmove(t + 2, *p, len + 1);
631 				*p = t;
632 			}
633 			argv = p - 1;
634 			break;
635 		}
636 	}
637 }
638 
639 void
640 usage(void)
641 {
642 	int len;
643 	extern char *__progname;
644 
645 	len = strlen(__progname) + sizeof("usage: ");
646 	(void)fprintf(stderr, "usage: %s [-1 field] [-2 field] "
647 	    "[-a file_number | -v file_number] [-e string]\n"
648 	    "%*s[-o list] [-t char] file1 file2\n",
649 	    __progname, len, "");
650 	exit(1);
651 }
652