1 /* join F1 F2 on stuff */ 2 #include <u.h> 3 #include <libc.h> 4 #include <bio.h> 5 #include <ctype.h> 6 7 enum { 8 F1, 9 F2, 10 NIN, 11 F0, 12 }; 13 14 #define NFLD 100 /* max field per line */ 15 #define comp() runestrcmp(ppi[F1][j1], ppi[F2][j2]) 16 17 Biobuf *f[NIN]; 18 Rune buf[NIN][Bsize]; /* input lines */ 19 Rune *ppi[NIN][NFLD+1]; /* pointers to fields in lines */ 20 Rune sep1 = ' '; /* default field separator */ 21 Rune sep2 = '\t'; 22 int j1 = 1; /* join of this field of file 1 */ 23 int j2 = 1; /* join of this field of file 2 */ 24 int a1; 25 int a2; 26 27 int olist[NIN*NFLD]; /* output these fields */ 28 int olistf[NIN*NFLD]; /* from these files */ 29 int no; /* number of entries in olist */ 30 char *sepstr = " "; 31 int discard; /* count of truncated lines */ 32 Rune null[Bsize] = L""; 33 Biobuf binbuf, boutbuf; 34 Biobuf *bin, *bout; 35 36 char *getoptarg(int*, char***); 37 int input(int); 38 void join(int); 39 void oparse(char*); 40 void output(int, int); 41 Rune *strtorune(Rune *, char *); 42 43 void 44 main(int argc, char **argv) 45 { 46 int i; 47 vlong off1, off2; 48 49 bin = &binbuf; 50 bout = &boutbuf; 51 Binit(bin, 0, OREAD); 52 Binit(bout, 1, OWRITE); 53 54 argv0 = argv[0]; 55 while (argc > 1 && argv[1][0] == '-') { 56 if (argv[1][1] == '\0') 57 break; 58 switch (argv[1][1]) { 59 case '-': 60 argc--; 61 argv++; 62 goto proceed; 63 case 'a': 64 switch(*getoptarg(&argc, &argv)) { 65 case '1': 66 a1++; 67 break; 68 case '2': 69 a2++; 70 break; 71 default: 72 sysfatal("incomplete option -a"); 73 } 74 break; 75 case 'e': 76 strtorune(null, getoptarg(&argc, &argv)); 77 break; 78 case 't': 79 sepstr=getoptarg(&argc, &argv); 80 chartorune(&sep1, sepstr); 81 sep2 = sep1; 82 break; 83 case 'o': 84 if(argv[1][2]!=0 || 85 argc>2 && strchr(argv[2],',')!=0) 86 oparse(getoptarg(&argc, &argv)); 87 else for (no = 0; no<2*NFLD && argc>2; no++){ 88 if (argv[2][0] == '1' && argv[2][1] == '.') { 89 olistf[no] = F1; 90 olist[no] = atoi(&argv[2][2]); 91 } else if (argv[2][0] == '2' && argv[2][1] == '.') { 92 olist[no] = atoi(&argv[2][2]); 93 olistf[no] = F2; 94 } else if (argv[2][0] == '0') 95 olistf[no] = F0; 96 else 97 break; 98 argc--; 99 argv++; 100 } 101 break; 102 case 'j': 103 if(argc <= 2) 104 break; 105 if (argv[1][2] == '1') 106 j1 = atoi(argv[2]); 107 else if (argv[1][2] == '2') 108 j2 = atoi(argv[2]); 109 else 110 j1 = j2 = atoi(argv[2]); 111 argc--; 112 argv++; 113 break; 114 case '1': 115 j1 = atoi(getoptarg(&argc, &argv)); 116 break; 117 case '2': 118 j2 = atoi(getoptarg(&argc, &argv)); 119 break; 120 } 121 argc--; 122 argv++; 123 } 124 proceed: 125 for (i = 0; i < no; i++) 126 if (olist[i]-- > NFLD) /* 0 origin */ 127 sysfatal("field number too big in -o"); 128 if (argc != 3) { 129 fprint(2, "usage: join [-1 x -2 y] [-o list] file1 file2\n"); 130 exits("usage"); 131 } 132 if (j1 < 1 || j2 < 1) 133 sysfatal("invalid field indices"); 134 j1--; 135 j2--; /* everyone else believes in 0 origin */ 136 137 if (strcmp(argv[1], "-") == 0) 138 f[F1] = bin; 139 else if ((f[F1] = Bopen(argv[1], OREAD)) == 0) 140 sysfatal("can't open %s: %r", argv[1]); 141 if(strcmp(argv[2], "-") == 0) 142 f[F2] = bin; 143 else if ((f[F2] = Bopen(argv[2], OREAD)) == 0) 144 sysfatal("can't open %s: %r", argv[2]); 145 146 off1 = Boffset(f[F1]); 147 off2 = Boffset(f[F2]); 148 if(Bseek(f[F2], 0, 2) >= 0){ 149 Bseek(f[F2], off2, 0); 150 join(F2); 151 }else if(Bseek(f[F1], 0, 2) >= 0){ 152 Bseek(f[F1], off1, 0); 153 Bseek(f[F2], off2, 0); 154 join(F1); 155 }else 156 sysfatal("neither file is randomly accessible"); 157 if (discard) 158 sysfatal("some input line was truncated"); 159 exits(""); 160 } 161 162 char * 163 runetostr(char *buf, Rune *r) 164 { 165 char *s; 166 167 for(s = buf; *r; r++) 168 s += runetochar(s, r); 169 *s = '\0'; 170 return buf; 171 } 172 173 Rune * 174 strtorune(Rune *buf, char *s) 175 { 176 Rune *r; 177 178 for (r = buf; *s; r++) 179 s += chartorune(r, s); 180 *r = '\0'; 181 return buf; 182 } 183 184 void 185 readboth(int n[]) 186 { 187 n[F1] = input(F1); 188 n[F2] = input(F2); 189 } 190 191 void 192 seekbotreadboth(int seekf, vlong bot, int n[]) 193 { 194 Bseek(f[seekf], bot, 0); 195 readboth(n); 196 } 197 198 void 199 join(int seekf) 200 { 201 int cmp, less; 202 int n[NIN]; 203 vlong top, bot; 204 205 less = seekf == F2; 206 top = 0; 207 bot = Boffset(f[seekf]); 208 readboth(n); 209 while(n[F1]>0 && n[F2]>0 || (a1||a2) && n[F1]+n[F2]>0) { 210 cmp = comp(); 211 if(n[F1]>0 && n[F2]>0 && cmp>0 || n[F1]==0) { 212 if(a2) 213 output(0, n[F2]); 214 if (seekf == F2) 215 bot = Boffset(f[seekf]); 216 n[F2] = input(F2); 217 } else if(n[F1]>0 && n[F2]>0 && cmp<0 || n[F2]==0) { 218 if(a1) 219 output(n[F1], 0); 220 if (seekf == F1) 221 bot = Boffset(f[seekf]); 222 n[F1] = input(F1); 223 } else { 224 /* n[F1]>0 && n[F2]>0 && cmp==0 */ 225 while(n[F2]>0 && cmp==0) { 226 output(n[F1], n[F2]); 227 top = Boffset(f[seekf]); 228 n[seekf] = input(seekf); 229 cmp = comp(); 230 } 231 seekbotreadboth(seekf, bot, n); 232 for(;;) { 233 cmp = comp(); 234 if(n[F1]>0 && n[F2]>0 && cmp==0) { 235 output(n[F1], n[F2]); 236 n[seekf] = input(seekf); 237 } else if(n[F1]>0 && n[F2]>0 && 238 (less? cmp<0 :cmp>0) || n[seekf]==0) 239 seekbotreadboth(seekf, bot, n); 240 else { 241 /* 242 * n[F1]>0 && n[F2]>0 && 243 * (less? cmp>0 :cmp<0) || 244 * n[seekf==F1? F2: F1]==0 245 */ 246 Bseek(f[seekf], top, 0); 247 bot = top; 248 n[seekf] = input(seekf); 249 break; 250 } 251 } 252 } 253 } 254 } 255 256 int 257 input(int n) /* get input line and split into fields */ 258 { 259 int c, i, len; 260 char *line; 261 Rune *bp; 262 Rune **pp; 263 264 bp = buf[n]; 265 pp = ppi[n]; 266 line = Brdline(f[n], '\n'); 267 if (line == nil) 268 return(0); 269 len = Blinelen(f[n]) - 1; 270 c = line[len]; 271 line[len] = '\0'; 272 strtorune(bp, line); 273 line[len] = c; /* restore delimiter */ 274 if (c != '\n') 275 discard++; 276 277 i = 0; 278 do { 279 i++; 280 if (sep1 == ' ') /* strip multiples */ 281 while ((c = *bp) == sep1 || c == sep2) 282 bp++; /* skip blanks */ 283 *pp++ = bp; /* record beginning */ 284 while ((c = *bp) != sep1 && c != sep2 && c != '\0') 285 bp++; 286 *bp++ = '\0'; /* mark end by overwriting blank */ 287 } while (c != '\0' && i < NFLD-1); 288 289 *pp = 0; 290 return(i); 291 } 292 293 void 294 prfields(int f, int on, int jn) 295 { 296 int i; 297 char buf[Bsize]; 298 299 for (i = 0; i < on; i++) 300 if (i != jn) 301 Bprint(bout, "%s%s", sepstr, runetostr(buf, ppi[f][i])); 302 } 303 304 void 305 output(int on1, int on2) /* print items from olist */ 306 { 307 int i; 308 Rune *temp; 309 char buf[Bsize]; 310 311 if (no <= 0) { /* default case */ 312 Bprint(bout, "%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2])); 313 prfields(F1, on1, j1); 314 prfields(F2, on2, j2); 315 Bputc(bout, '\n'); 316 } else { 317 for (i = 0; i < no; i++) { 318 if (olistf[i]==F0 && on1>j1) 319 temp = ppi[F1][j1]; 320 else if (olistf[i]==F0 && on2>j2) 321 temp = ppi[F2][j2]; 322 else { 323 temp = ppi[olistf[i]][olist[i]]; 324 if(olistf[i]==F1 && on1<=olist[i] || 325 olistf[i]==F2 && on2<=olist[i] || 326 *temp==0) 327 temp = null; 328 } 329 Bprint(bout, "%s", runetostr(buf, temp)); 330 if (i == no - 1) 331 Bputc(bout, '\n'); 332 else 333 Bprint(bout, "%s", sepstr); 334 } 335 } 336 } 337 338 char * 339 getoptarg(int *argcp, char ***argvp) 340 { 341 int argc = *argcp; 342 char **argv = *argvp; 343 if(argv[1][2] != 0) 344 return &argv[1][2]; 345 if(argc<=2 || argv[2][0]=='-') 346 sysfatal("incomplete option %s", argv[1]); 347 *argcp = argc-1; 348 *argvp = ++argv; 349 return argv[1]; 350 } 351 352 void 353 oparse(char *s) 354 { 355 for (no = 0; no<2*NFLD && *s; no++, s++) { 356 switch(*s) { 357 case 0: 358 return; 359 case '0': 360 olistf[no] = F0; 361 break; 362 case '1': 363 case '2': 364 if(s[1] == '.' && isdigit(s[2])) { 365 olistf[no] = *s=='1'? F1: F2; 366 olist[no] = atoi(s += 2); 367 break; 368 } 369 /* fall thru */ 370 default: 371 sysfatal("invalid -o list"); 372 } 373 if(s[1] == ',') 374 s++; 375 } 376 } 377