/* join: relational join of the lines of files F1 and F2 on a common field */
2 #include <u.h>
3 #include <libc.h>
4 #include <bio.h>
5 #include <ctype.h>
6
/*
 * Indices of the input files.  F1 and F2 index the per-file arrays
 * (f, buf, ppi); NIN is their count.  F0 appears only in olistf[],
 * where it marks an -o list entry of "0", i.e. the join field.
 */
enum {
	F1 = 0,
	F2 = 1,
	NIN = 2,
	F0 = 3,
};
13
14 #define NFLD 100 /* max field per line */
15 #define comp() runestrcmp(ppi[F1][j1], ppi[F2][j2])
16
17 Biobuf *f[NIN];
18 Rune buf[NIN][Bsize]; /* input lines */
19 Rune *ppi[NIN][NFLD+1]; /* pointers to fields in lines */
20 Rune sep1 = ' '; /* default field separator */
21 Rune sep2 = '\t';
22 int j1 = 1; /* join of this field of file 1 */
23 int j2 = 1; /* join of this field of file 2 */
24 int a1;
25 int a2;
26
27 int olist[NIN*NFLD]; /* output these fields */
28 int olistf[NIN*NFLD]; /* from these files */
29 int no; /* number of entries in olist */
30 char *sepstr = " ";
31 int discard; /* count of truncated lines */
32 Rune null[Bsize] = L"";
33 Biobuf binbuf, boutbuf;
34 Biobuf *bin, *bout;
35
36 char *getoptarg(int*, char***);
37 int input(int);
38 void join(int);
39 void oparse(char*);
40 void output(int, int);
41 Rune *strtorune(Rune *, char *);
42
43 void
main(int argc,char ** argv)44 main(int argc, char **argv)
45 {
46 int i;
47 vlong off1, off2;
48
49 bin = &binbuf;
50 bout = &boutbuf;
51 Binit(bin, 0, OREAD);
52 Binit(bout, 1, OWRITE);
53
54 argv0 = argv[0];
55 while (argc > 1 && argv[1][0] == '-') {
56 if (argv[1][1] == '\0')
57 break;
58 switch (argv[1][1]) {
59 case '-':
60 argc--;
61 argv++;
62 goto proceed;
63 case 'a':
64 switch(*getoptarg(&argc, &argv)) {
65 case '1':
66 a1++;
67 break;
68 case '2':
69 a2++;
70 break;
71 default:
72 sysfatal("incomplete option -a");
73 }
74 break;
75 case 'e':
76 strtorune(null, getoptarg(&argc, &argv));
77 break;
78 case 't':
79 sepstr=getoptarg(&argc, &argv);
80 chartorune(&sep1, sepstr);
81 sep2 = sep1;
82 break;
83 case 'o':
84 if(argv[1][2]!=0 ||
85 argc>2 && strchr(argv[2],',')!=0)
86 oparse(getoptarg(&argc, &argv));
87 else for (no = 0; no<2*NFLD && argc>2; no++){
88 if (argv[2][0] == '1' && argv[2][1] == '.') {
89 olistf[no] = F1;
90 olist[no] = atoi(&argv[2][2]);
91 } else if (argv[2][0] == '2' && argv[2][1] == '.') {
92 olist[no] = atoi(&argv[2][2]);
93 olistf[no] = F2;
94 } else if (argv[2][0] == '0')
95 olistf[no] = F0;
96 else
97 break;
98 argc--;
99 argv++;
100 }
101 break;
102 case 'j':
103 if(argc <= 2)
104 break;
105 if (argv[1][2] == '1')
106 j1 = atoi(argv[2]);
107 else if (argv[1][2] == '2')
108 j2 = atoi(argv[2]);
109 else
110 j1 = j2 = atoi(argv[2]);
111 argc--;
112 argv++;
113 break;
114 case '1':
115 j1 = atoi(getoptarg(&argc, &argv));
116 break;
117 case '2':
118 j2 = atoi(getoptarg(&argc, &argv));
119 break;
120 }
121 argc--;
122 argv++;
123 }
124 proceed:
125 for (i = 0; i < no; i++)
126 if (olist[i]-- > NFLD) /* 0 origin */
127 sysfatal("field number too big in -o");
128 if (argc != 3) {
129 fprint(2, "usage: join [-1 x -2 y] [-o list] file1 file2\n");
130 exits("usage");
131 }
132 if (j1 < 1 || j2 < 1)
133 sysfatal("invalid field indices");
134 j1--;
135 j2--; /* everyone else believes in 0 origin */
136
137 if (strcmp(argv[1], "-") == 0)
138 f[F1] = bin;
139 else if ((f[F1] = Bopen(argv[1], OREAD)) == 0)
140 sysfatal("can't open %s: %r", argv[1]);
141 if(strcmp(argv[2], "-") == 0)
142 f[F2] = bin;
143 else if ((f[F2] = Bopen(argv[2], OREAD)) == 0)
144 sysfatal("can't open %s: %r", argv[2]);
145
146 off1 = Boffset(f[F1]);
147 off2 = Boffset(f[F2]);
148 if(Bseek(f[F2], 0, 2) >= 0){
149 Bseek(f[F2], off2, 0);
150 join(F2);
151 }else if(Bseek(f[F1], 0, 2) >= 0){
152 Bseek(f[F1], off1, 0);
153 Bseek(f[F2], off2, 0);
154 join(F1);
155 }else
156 sysfatal("neither file is randomly accessible");
157 if (discard)
158 sysfatal("some input line was truncated");
159 exits("");
160 }
161
162 char *
runetostr(char * buf,Rune * r)163 runetostr(char *buf, Rune *r)
164 {
165 char *s;
166
167 for(s = buf; *r; r++)
168 s += runetochar(s, r);
169 *s = '\0';
170 return buf;
171 }
172
173 Rune *
strtorune(Rune * buf,char * s)174 strtorune(Rune *buf, char *s)
175 {
176 Rune *r;
177
178 for (r = buf; *s; r++)
179 s += chartorune(r, s);
180 *r = '\0';
181 return buf;
182 }
183
184 void
readboth(int n[])185 readboth(int n[])
186 {
187 n[F1] = input(F1);
188 n[F2] = input(F2);
189 }
190
191 void
seekbotreadboth(int seekf,vlong bot,int n[])192 seekbotreadboth(int seekf, vlong bot, int n[])
193 {
194 Bseek(f[seekf], bot, 0);
195 readboth(n);
196 }
197
198 void
join(int seekf)199 join(int seekf)
200 {
201 int cmp, less;
202 int n[NIN];
203 vlong top, bot;
204
205 less = seekf == F2;
206 top = 0;
207 bot = Boffset(f[seekf]);
208 readboth(n);
209 while(n[F1]>0 && n[F2]>0 || (a1||a2) && n[F1]+n[F2]>0) {
210 cmp = comp();
211 if(n[F1]>0 && n[F2]>0 && cmp>0 || n[F1]==0) {
212 if(a2)
213 output(0, n[F2]);
214 if (seekf == F2)
215 bot = Boffset(f[seekf]);
216 n[F2] = input(F2);
217 } else if(n[F1]>0 && n[F2]>0 && cmp<0 || n[F2]==0) {
218 if(a1)
219 output(n[F1], 0);
220 if (seekf == F1)
221 bot = Boffset(f[seekf]);
222 n[F1] = input(F1);
223 } else {
224 /* n[F1]>0 && n[F2]>0 && cmp==0 */
225 while(n[F2]>0 && cmp==0) {
226 output(n[F1], n[F2]);
227 top = Boffset(f[seekf]);
228 n[seekf] = input(seekf);
229 cmp = comp();
230 }
231 seekbotreadboth(seekf, bot, n);
232 for(;;) {
233 cmp = comp();
234 if(n[F1]>0 && n[F2]>0 && cmp==0) {
235 output(n[F1], n[F2]);
236 n[seekf] = input(seekf);
237 } else if(n[F1]>0 && n[F2]>0 &&
238 (less? cmp<0 :cmp>0) || n[seekf]==0)
239 seekbotreadboth(seekf, bot, n);
240 else {
241 /*
242 * n[F1]>0 && n[F2]>0 &&
243 * (less? cmp>0 :cmp<0) ||
244 * n[seekf==F1? F2: F1]==0
245 */
246 Bseek(f[seekf], top, 0);
247 bot = top;
248 n[seekf] = input(seekf);
249 break;
250 }
251 }
252 }
253 }
254 }
255
256 int
input(int n)257 input(int n) /* get input line and split into fields */
258 {
259 int c, i, len;
260 char *line;
261 Rune *bp;
262 Rune **pp;
263
264 bp = buf[n];
265 pp = ppi[n];
266 line = Brdline(f[n], '\n');
267 if (line == nil)
268 return(0);
269 len = Blinelen(f[n]) - 1;
270 c = line[len];
271 line[len] = '\0';
272 strtorune(bp, line);
273 line[len] = c; /* restore delimiter */
274 if (c != '\n')
275 discard++;
276
277 i = 0;
278 do {
279 i++;
280 if (sep1 == ' ') /* strip multiples */
281 while ((c = *bp) == sep1 || c == sep2)
282 bp++; /* skip blanks */
283 *pp++ = bp; /* record beginning */
284 while ((c = *bp) != sep1 && c != sep2 && c != '\0')
285 bp++;
286 *bp++ = '\0'; /* mark end by overwriting blank */
287 } while (c != '\0' && i < NFLD-1);
288
289 *pp = 0;
290 return(i);
291 }
292
293 void
prfields(int f,int on,int jn)294 prfields(int f, int on, int jn)
295 {
296 int i;
297 char buf[Bsize];
298
299 for (i = 0; i < on; i++)
300 if (i != jn)
301 Bprint(bout, "%s%s", sepstr, runetostr(buf, ppi[f][i]));
302 }
303
304 void
output(int on1,int on2)305 output(int on1, int on2) /* print items from olist */
306 {
307 int i;
308 Rune *temp;
309 char buf[Bsize];
310
311 if (no <= 0) { /* default case */
312 Bprint(bout, "%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2]));
313 prfields(F1, on1, j1);
314 prfields(F2, on2, j2);
315 Bputc(bout, '\n');
316 } else {
317 for (i = 0; i < no; i++) {
318 if (olistf[i]==F0 && on1>j1)
319 temp = ppi[F1][j1];
320 else if (olistf[i]==F0 && on2>j2)
321 temp = ppi[F2][j2];
322 else {
323 temp = ppi[olistf[i]][olist[i]];
324 if(olistf[i]==F1 && on1<=olist[i] ||
325 olistf[i]==F2 && on2<=olist[i] ||
326 *temp==0)
327 temp = null;
328 }
329 Bprint(bout, "%s", runetostr(buf, temp));
330 if (i == no - 1)
331 Bputc(bout, '\n');
332 else
333 Bprint(bout, "%s", sepstr);
334 }
335 }
336 }
337
/*
 * Return the argument of the option in (*argvp)[1].  If text follows
 * the option letter ("-xARG") that text is the argument and nothing
 * is consumed.  Otherwise the next word is the argument: *argcp and
 * *argvp are stepped past one word so that the caller's usual
 * one-word advance skips the argument too.  A missing next word, or
 * one starting with '-', is fatal.
 */
char *
getoptarg(int *argcp, char ***argvp)
{
	char **av;

	av = *argvp;
	if(av[1][2] != 0)
		return &av[1][2];
	if(*argcp <= 2 || av[2][0] == '-')
		sysfatal("incomplete option %s", av[1]);
	(*argcp)--;
	*argvp = av + 1;
	return av[2];
}
351
352 void
oparse(char * s)353 oparse(char *s)
354 {
355 for (no = 0; no<2*NFLD && *s; no++, s++) {
356 switch(*s) {
357 case 0:
358 return;
359 case '0':
360 olistf[no] = F0;
361 break;
362 case '1':
363 case '2':
364 if(s[1] == '.' && isdigit(s[2])) {
365 olistf[no] = *s=='1'? F1: F2;
366 olist[no] = atoi(s += 2);
367 break;
368 }
369 /* fall thru */
370 default:
371 sysfatal("invalid -o list");
372 }
373 if(s[1] == ',')
374 s++;
375 }
376 }
377