xref: /plan9/sys/src/cmd/join.c (revision 14f51593fd82e19ba95969a8c07ff71131015979)
/*	join F1 F2: pair up lines of files F1 and F2 that agree on a chosen join field */
2 #include <u.h>
3 #include <libc.h>
4 #include <bio.h>
5 #include <ctype.h>
6 
/* Indices naming the two input files, plus markers used by the -o list. */
enum {
	F1	= 0,	/* first input file */
	F2	= 1,	/* second input file */
	NIN	= 2,	/* number of input files; dimensions the per-file arrays */
	F0	= 3,	/* -o list entry "0": the join field itself */
};
13 
/* Upper bound on the number of fields recorded per input line. */
#define	NFLD	100

/* Compare the current join fields of file 1 and file 2 (runestrcmp ordering). */
#define	comp()	(runestrcmp(ppi[F1][j1], ppi[F2][j2]))
16 
17 Biobuf *f[NIN];
18 Rune buf[NIN][Bsize];	/* input lines */
19 Rune *ppi[NIN][NFLD+1];	/* pointers to fields in lines */
20 Rune	sep1	= ' ';	/* default field separator */
21 Rune	sep2	= '\t';
22 int	j1	= 1;	/* join of this field of file 1 */
23 int	j2	= 1;	/* join of this field of file 2 */
24 int	a1;
25 int 	a2;
26 
27 int	olist[NIN*NFLD];  /* output these fields */
28 int	olistf[NIN*NFLD]; /* from these files */
29 int	no;		/* number of entries in olist */
30 char *sepstr	= " ";
31 int	discard;	/* count of truncated lines */
32 Rune	null[Bsize]	= L"";
33 Biobuf binbuf, boutbuf;
34 Biobuf *bin, *bout;
35 
36 char	*getoptarg(int*, char***);
37 int	input(int);
38 void	join(int);
39 void	oparse(char*);
40 void	output(int, int);
41 Rune	*strtorune(Rune *, char *);
42 
43 void
main(int argc,char ** argv)44 main(int argc, char **argv)
45 {
46 	int i;
47 	vlong off1, off2;
48 
49 	bin = &binbuf;
50 	bout = &boutbuf;
51 	Binit(bin, 0, OREAD);
52 	Binit(bout, 1, OWRITE);
53 
54 	argv0 = argv[0];
55 	while (argc > 1 && argv[1][0] == '-') {
56 		if (argv[1][1] == '\0')
57 			break;
58 		switch (argv[1][1]) {
59 		case '-':
60 			argc--;
61 			argv++;
62 			goto proceed;
63 		case 'a':
64 			switch(*getoptarg(&argc, &argv)) {
65 			case '1':
66 				a1++;
67 				break;
68 			case '2':
69 				a2++;
70 				break;
71 			default:
72 				sysfatal("incomplete option -a");
73 			}
74 			break;
75 		case 'e':
76 			strtorune(null, getoptarg(&argc, &argv));
77 			break;
78 		case 't':
79 			sepstr=getoptarg(&argc, &argv);
80 			chartorune(&sep1, sepstr);
81 			sep2 = sep1;
82 			break;
83 		case 'o':
84 			if(argv[1][2]!=0 ||
85 			   argc>2 && strchr(argv[2],',')!=0)
86 				oparse(getoptarg(&argc, &argv));
87 			else for (no = 0; no<2*NFLD && argc>2; no++){
88 				if (argv[2][0] == '1' && argv[2][1] == '.') {
89 					olistf[no] = F1;
90 					olist[no] = atoi(&argv[2][2]);
91 				} else if (argv[2][0] == '2' && argv[2][1] == '.') {
92 					olist[no] = atoi(&argv[2][2]);
93 					olistf[no] = F2;
94 				} else if (argv[2][0] == '0')
95 					olistf[no] = F0;
96 				else
97 					break;
98 				argc--;
99 				argv++;
100 			}
101 			break;
102 		case 'j':
103 			if(argc <= 2)
104 				break;
105 			if (argv[1][2] == '1')
106 				j1 = atoi(argv[2]);
107 			else if (argv[1][2] == '2')
108 				j2 = atoi(argv[2]);
109 			else
110 				j1 = j2 = atoi(argv[2]);
111 			argc--;
112 			argv++;
113 			break;
114 		case '1':
115 			j1 = atoi(getoptarg(&argc, &argv));
116 			break;
117 		case '2':
118 			j2 = atoi(getoptarg(&argc, &argv));
119 			break;
120 		}
121 		argc--;
122 		argv++;
123 	}
124 proceed:
125 	for (i = 0; i < no; i++)
126 		if (olist[i]-- > NFLD)	/* 0 origin */
127 			sysfatal("field number too big in -o");
128 	if (argc != 3) {
129 		fprint(2, "usage: join [-1 x -2 y] [-o list] file1 file2\n");
130 		exits("usage");
131 	}
132 	if (j1 < 1  || j2 < 1)
133 		sysfatal("invalid field indices");
134 	j1--;
135 	j2--;	/* everyone else believes in 0 origin */
136 
137 	if (strcmp(argv[1], "-") == 0)
138 		f[F1] = bin;
139 	else if ((f[F1] = Bopen(argv[1], OREAD)) == 0)
140 		sysfatal("can't open %s: %r", argv[1]);
141 	if(strcmp(argv[2], "-") == 0)
142 		f[F2] = bin;
143 	else if ((f[F2] = Bopen(argv[2], OREAD)) == 0)
144 		sysfatal("can't open %s: %r", argv[2]);
145 
146 	off1 = Boffset(f[F1]);
147 	off2 = Boffset(f[F2]);
148 	if(Bseek(f[F2], 0, 2) >= 0){
149 		Bseek(f[F2], off2, 0);
150 		join(F2);
151 	}else if(Bseek(f[F1], 0, 2) >= 0){
152 		Bseek(f[F1], off1, 0);
153 		Bseek(f[F2], off2, 0);
154 		join(F1);
155 	}else
156 		sysfatal("neither file is randomly accessible");
157 	if (discard)
158 		sysfatal("some input line was truncated");
159 	exits("");
160 }
161 
162 char *
runetostr(char * buf,Rune * r)163 runetostr(char *buf, Rune *r)
164 {
165 	char *s;
166 
167 	for(s = buf; *r; r++)
168 		s += runetochar(s, r);
169 	*s = '\0';
170 	return buf;
171 }
172 
173 Rune *
strtorune(Rune * buf,char * s)174 strtorune(Rune *buf, char *s)
175 {
176 	Rune *r;
177 
178 	for (r = buf; *s; r++)
179 		s += chartorune(r, s);
180 	*r = '\0';
181 	return buf;
182 }
183 
184 void
readboth(int n[])185 readboth(int n[])
186 {
187 	n[F1] = input(F1);
188 	n[F2] = input(F2);
189 }
190 
191 void
seekbotreadboth(int seekf,vlong bot,int n[])192 seekbotreadboth(int seekf, vlong bot, int n[])
193 {
194 	Bseek(f[seekf], bot, 0);
195 	readboth(n);
196 }
197 
198 void
join(int seekf)199 join(int seekf)
200 {
201 	int cmp, less;
202 	int n[NIN];
203 	vlong top, bot;
204 
205 	less = seekf == F2;
206 	top = 0;
207 	bot = Boffset(f[seekf]);
208 	readboth(n);
209 	while(n[F1]>0 && n[F2]>0 || (a1||a2) && n[F1]+n[F2]>0) {
210 		cmp = comp();
211 		if(n[F1]>0 && n[F2]>0 && cmp>0 || n[F1]==0) {
212 			if(a2)
213 				output(0, n[F2]);
214 			if (seekf == F2)
215 				bot = Boffset(f[seekf]);
216 			n[F2] = input(F2);
217 		} else if(n[F1]>0 && n[F2]>0 && cmp<0 || n[F2]==0) {
218 			if(a1)
219 				output(n[F1], 0);
220 			if (seekf == F1)
221 				bot = Boffset(f[seekf]);
222 			n[F1] = input(F1);
223 		} else {
224 			/* n[F1]>0 && n[F2]>0 && cmp==0 */
225 			while(n[F2]>0 && cmp==0) {
226 				output(n[F1], n[F2]);
227 				top = Boffset(f[seekf]);
228 				n[seekf] = input(seekf);
229 				cmp = comp();
230 			}
231 			seekbotreadboth(seekf, bot, n);
232 			for(;;) {
233 				cmp = comp();
234 				if(n[F1]>0 && n[F2]>0 && cmp==0) {
235 					output(n[F1], n[F2]);
236 					n[seekf] = input(seekf);
237 				} else if(n[F1]>0 && n[F2]>0 &&
238 				    (less? cmp<0 :cmp>0) || n[seekf]==0)
239 					seekbotreadboth(seekf, bot, n);
240 				else {
241 					/*
242 					 * n[F1]>0 && n[F2]>0 &&
243 					 * (less? cmp>0 :cmp<0) ||
244 					 * n[seekf==F1? F2: F1]==0
245 					 */
246 					Bseek(f[seekf], top, 0);
247 					bot = top;
248 					n[seekf] = input(seekf);
249 					break;
250 				}
251 			}
252 		}
253 	}
254 }
255 
256 int
input(int n)257 input(int n)		/* get input line and split into fields */
258 {
259 	int c, i, len;
260 	char *line;
261 	Rune *bp;
262 	Rune **pp;
263 
264 	bp = buf[n];
265 	pp = ppi[n];
266 	line = Brdline(f[n], '\n');
267 	if (line == nil)
268 		return(0);
269 	len = Blinelen(f[n]) - 1;
270 	c = line[len];
271 	line[len] = '\0';
272 	strtorune(bp, line);
273 	line[len] = c;			/* restore delimiter */
274 	if (c != '\n')
275 		discard++;
276 
277 	i = 0;
278 	do {
279 		i++;
280 		if (sep1 == ' ')	/* strip multiples */
281 			while ((c = *bp) == sep1 || c == sep2)
282 				bp++;	/* skip blanks */
283 		*pp++ = bp;		/* record beginning */
284 		while ((c = *bp) != sep1 && c != sep2 && c != '\0')
285 			bp++;
286 		*bp++ = '\0';		/* mark end by overwriting blank */
287 	} while (c != '\0' && i < NFLD-1);
288 
289 	*pp = 0;
290 	return(i);
291 }
292 
293 void
prfields(int f,int on,int jn)294 prfields(int f, int on, int jn)
295 {
296 	int i;
297 	char buf[Bsize];
298 
299 	for (i = 0; i < on; i++)
300 		if (i != jn)
301 			Bprint(bout, "%s%s", sepstr, runetostr(buf, ppi[f][i]));
302 }
303 
304 void
output(int on1,int on2)305 output(int on1, int on2)	/* print items from olist */
306 {
307 	int i;
308 	Rune *temp;
309 	char buf[Bsize];
310 
311 	if (no <= 0) {	/* default case */
312 		Bprint(bout, "%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2]));
313 		prfields(F1, on1, j1);
314 		prfields(F2, on2, j2);
315 		Bputc(bout, '\n');
316 	} else {
317 		for (i = 0; i < no; i++) {
318 			if (olistf[i]==F0 && on1>j1)
319 				temp = ppi[F1][j1];
320 			else if (olistf[i]==F0 && on2>j2)
321 				temp = ppi[F2][j2];
322 			else {
323 				temp = ppi[olistf[i]][olist[i]];
324 				if(olistf[i]==F1 && on1<=olist[i] ||
325 				   olistf[i]==F2 && on2<=olist[i] ||
326 				   *temp==0)
327 					temp = null;
328 			}
329 			Bprint(bout, "%s", runetostr(buf, temp));
330 			if (i == no - 1)
331 				Bputc(bout, '\n');
332 			else
333 				Bprint(bout, "%s", sepstr);
334 		}
335 	}
336 }
337 
/*
 * Return the argument of the option at (*argvp)[1].
 * If text is attached to the option letter ("-xARG") that text is
 * returned and nothing is consumed.  Otherwise the following argument
 * is returned and *argcp / *argvp are advanced by one; a missing
 * argument, or one beginning with '-', is fatal.
 */
char *
getoptarg(int *argcp, char ***argvp)
{
	char **av;
	char *opt;

	av = *argvp;
	opt = av[1];
	if (opt[2] != 0)
		return opt + 2;
	if (*argcp <= 2 || av[2][0] == '-')
		sysfatal("incomplete option %s", opt);
	*argcp = *argcp - 1;
	*argvp = av + 1;
	return av[2];
}
351 
352 void
oparse(char * s)353 oparse(char *s)
354 {
355 	for (no = 0; no<2*NFLD && *s; no++, s++) {
356 		switch(*s) {
357 		case 0:
358 			return;
359 		case '0':
360 			olistf[no] = F0;
361 			break;
362 		case '1':
363 		case '2':
364 			if(s[1] == '.' && isdigit(s[2])) {
365 				olistf[no] = *s=='1'? F1: F2;
366 				olist[no] = atoi(s += 2);
367 				break;
368 			}
369 			/* fall thru */
370 		default:
371 			sysfatal("invalid -o list");
372 		}
373 		if(s[1] == ',')
374 			s++;
375 	}
376 }
377