1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <regexp.h>
5 #include <ctype.h>
6 #include "dict.h"
7
8 /*
9 * Assumed index file structure: lines of form
10 * [^\t]+\t[0-9]+
11 * First field is key, second is byte offset into dictionary.
12 * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2
13 */
14 typedef struct Addr Addr;
15
16 struct Addr {
17 int n; /* number of offsets */
18 int cur; /* current position within doff array */
19 int maxn; /* actual current size of doff array */
20 ulong doff[1]; /* doff[maxn], with 0..n-1 significant */
21 };
22
23 Biobuf binbuf;
24 Biobuf boutbuf;
25 Biobuf *bin = &binbuf; /* user cmd input */
26 Biobuf *bout = &boutbuf; /* output */
27 Biobuf *bdict; /* dictionary */
28 Biobuf *bindex; /* index file */
29 long indextop; /* index offset at end of file */
30 int lastcmd; /* last executed command */
31 Addr *dot; /* "current" address */
32 Dict *dict; /* current dictionary */
33 int linelen;
34 int breaklen = 60;
35 int outinhibit;
36 int debug;
37
38 void execcmd(int);
39 int getpref(char*, Rune*);
40 Entry getentry(int);
41 int getfield(Rune*);
42 long locate(Rune*);
43 int parseaddr(char*, char**);
44 int parsecmd(char*);
45 int search(char*, int);
46 long seeknextline(Biobuf*, long);
47 void setdotnext(void);
48 void setdotprev(void);
49 void sortaddr(Addr*);
50 void usage(void);
51
52 enum {
53 Plen=300, /* max length of a search pattern */
54 Fieldlen=200, /* max length of an index field */
55 Aslots=10, /* initial number of slots in an address */
56 };
57
58 void
main(int argc,char ** argv)59 main(int argc, char **argv)
60 {
61 int i, cmd, kflag;
62 char *line, *p;
63
64 Binit(&binbuf, 0, OREAD);
65 Binit(&boutbuf, 1, OWRITE);
66 kflag = 0;
67 line = 0;
68 dict = 0;
69 for(i=0; dicts[i].name; i++){
70 if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){
71 dict = &dicts[i];
72 break;
73 }
74 }
75 ARGBEGIN {
76 case 'd':
77 p = ARGF();
78 dict = 0;
79 if(p) {
80 for(i=0; dicts[i].name; i++)
81 if(strcmp(p, dicts[i].name)==0) {
82 dict = &dicts[i];
83 break;
84 }
85 }
86 if(!dict)
87 usage();
88 break;
89 case 'c':
90 line = ARGF();
91 if(!line)
92 usage();
93 break;
94 case 'k':
95 kflag++;
96 break;
97 case 'D':
98 debug++;
99 break;
100 default:
101 usage();
102 ARGEND }
103 if(dict == 0){
104 err("no dictionaries present on this system");
105 exits("nodict");
106 }
107
108 if(kflag) {
109 (*dict->printkey)();
110 exits(0);
111 }
112 if(argc > 1)
113 usage();
114 else if(argc == 1) {
115 if(line)
116 usage();
117 p = argv[0];
118 line = malloc(strlen(p)+5);
119 sprint(line, "/%s/P\n", p);
120 }
121 bdict = Bopen(dict->path, OREAD);
122 if(!bdict) {
123 err("can't open dictionary %s", dict->path);
124 exits("nodict");
125 }
126 bindex = Bopen(dict->indexpath, OREAD);
127 if(!bindex) {
128 err("can't open index %s", dict->indexpath);
129 exits("noindex");
130 }
131 indextop = Bseek(bindex, 0L, 2);
132
133 dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
134 dot->n = 0;
135 dot->cur = 0;
136 dot->maxn = Aslots;
137 lastcmd = 0;
138
139 if(line) {
140 cmd = parsecmd(line);
141 if(cmd)
142 execcmd(cmd);
143 } else {
144 for(;;) {
145 Bprint(bout, "*");
146 Bflush(bout);
147 line = Brdline(bin, '\n');
148 linelen = 0;
149 if(!line)
150 break;
151 cmd = parsecmd(line);
152 if(cmd) {
153 execcmd(cmd);
154 lastcmd = cmd;
155 }
156 }
157 }
158 exits(0);
159 }
160
161 void
usage(void)162 usage(void)
163 {
164 int i;
165 char *a, *b;
166
167 Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
168 Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
169 for(i = 0; dicts[i].name; i++){
170 a = b = "";
171 if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){
172 a = "[";
173 b = "]";
174 }
175 Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
176 }
177 exits("usage");
178 }
179
180 int
parsecmd(char * line)181 parsecmd(char *line)
182 {
183 char *e;
184 int cmd, ans;
185
186 if(parseaddr(line, &e) >= 0)
187 line = e;
188 else
189 return 0;
190 cmd = *line;
191 ans = cmd;
192 if(isupper(cmd))
193 cmd = tolower(cmd);
194 if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
195 cmd == '\n')) {
196 err("unknown command %c", cmd);
197 return 0;
198 }
199 if(cmd == '\n')
200 switch(lastcmd) {
201 case 0: ans = 'H'; break;
202 case 'H': ans = 'p'; break;
203 default : ans = lastcmd; break;
204 }
205 else if(line[1] != '\n' && line[1] != 0)
206 err("extra stuff after command %c ignored", cmd);
207 return ans;
208 }
209
210 void
execcmd(int cmd)211 execcmd(int cmd)
212 {
213 Entry e;
214 int cur, doall;
215
216 if(isupper(cmd)) {
217 doall = 1;
218 cmd = tolower(cmd);
219 cur = 0;
220 } else {
221 doall = 0;
222 cur = dot->cur;
223 }
224
225 if(debug && doall && cmd == 'a')
226 Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
227 for(;;){
228 if(cur >= dot->n)
229 break;
230 if(doall) {
231 Bprint(bout, "%d\t", cur+1);
232 linelen += 4 + (cur >= 10);
233 }
234 switch(cmd) {
235 case 'a':
236 Bprint(bout, "#%lud\n", dot->doff[cur]);
237 break;
238 case 'h':
239 case 'p':
240 case 'r':
241 e = getentry(cur);
242 (*dict->printentry)(e, cmd);
243 break;
244 }
245 cur++;
246 if(doall) {
247 if(cmd == 'p' || cmd == 'r') {
248 Bputc(bout, '\n');
249 linelen = 0;
250 }
251 } else
252 break;
253 }
254 if(cur >= dot->n)
255 cur = 0;
256 dot->cur = cur;
257 }
258
259 /*
260 * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
261 * Answer goes in dot.
262 * Return -1 if address starts, but get error.
263 * Return 0 if no address.
264 */
265 int
parseaddr(char * line,char ** eptr)266 parseaddr(char *line, char **eptr)
267 {
268 int delim, plen;
269 ulong v;
270 char *e;
271 char pat[Plen];
272
273 if(*line == '/' || *line == '!') {
274 /* anchored regular expression match; '!' means no folding */
275 if(*line == '/') {
276 delim = '/';
277 e = strpbrk(line+1, "/\n");
278 } else {
279 delim = '!';
280 e = strpbrk(line+1, "!\n");
281 }
282 plen = e-line-1;
283 if(plen >= Plen-3) {
284 err("pattern too big");
285 return -1;
286 }
287 pat[0] = '^';
288 memcpy(pat+1, line+1, plen);
289 pat[plen+1] = '$';
290 pat[plen+2] = 0;
291 if(*e == '\n')
292 line = e;
293 else
294 line = e+1;
295 if(!search(pat, delim == '/')) {
296 err("pattern not found");
297 return -1;
298 }
299 } else if(*line == '#') {
300 /* absolute byte offset into dictionary */
301 line++;
302 if(!isdigit(*line))
303 return -1;
304 v = strtoul(line, &e, 10);
305 line = e;
306 dot->doff[0] = v;
307 dot->n = 1;
308 dot->cur = 0;
309 } else if(isdigit(*line)) {
310 v = strtoul(line, &e, 10);
311 line = e;
312 if(v < 1 || v > dot->n)
313 err(".%d not in range [1,%d], ignored",
314 v, dot->n);
315 else
316 dot->cur = v-1;
317 } else if(*line == '.') {
318 line++;
319 } else {
320 *eptr = line;
321 return 0;
322 }
323 while(*line == '+' || *line == '-') {
324 if(*line == '+')
325 setdotnext();
326 else
327 setdotprev();
328 line++;
329 }
330 *eptr = line;
331 return 1;
332 }
333
334 /*
335 * Index file is sorted by folded field1.
336 * Method: find pre, a folded prefix of r.e. pat,
337 * and then low = offset to beginning of
338 * line in index file where first match of prefix occurs.
339 * Then go through index until prefix no longer matches,
340 * adding each line that matches real pattern to dot.
341 * Finally, sort dot offsets (uniquing).
342 * We know pat len < Plen, and that it is surrounded by ^..$
343 */
344 int
search(char * pat,int dofold)345 search(char *pat, int dofold)
346 {
347 int needre, prelen, match, n;
348 Reprog *re;
349 long ioff, v;
350 Rune pre[Plen];
351 Rune lit[Plen];
352 Rune entry[Fieldlen];
353 char fpat[Plen];
354
355 prelen = getpref(pat+1, pre);
356 if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
357 runescpy(lit, pre);
358 if(dofold)
359 fold(lit);
360 needre = 0;
361 SET(re);
362 } else {
363 needre = 1;
364 if(dofold) {
365 foldre(fpat, pat);
366 re = regcomp(fpat);
367 } else
368 re = regcomp(pat);
369 }
370 fold(pre);
371 ioff = locate(pre);
372 if(ioff < 0)
373 return 0;
374 dot->n = 0;
375 Bseek(bindex, ioff, 0);
376 for(;;) {
377 if(!getfield(entry))
378 break;
379 if(dofold)
380 fold(entry);
381 if(needre)
382 match = rregexec(re, entry, 0, 0);
383 else
384 match = (acomp(lit, entry) == 0);
385 if(match) {
386 if(!getfield(entry))
387 break;
388 v = runetol(entry);
389 if(dot->n >= dot->maxn) {
390 n = 2*dot->maxn;
391 dot = realloc(dot,
392 sizeof(Addr)+(n-1)*sizeof(long));
393 if(!dot) {
394 err("out of memory");
395 exits("nomem");
396 }
397 dot->maxn = n;
398 }
399 dot->doff[dot->n++] = v;
400 } else {
401 if(!dofold)
402 fold(entry);
403 if(*pre) {
404 n = acomp(pre, entry);
405 if(n < -1 || (!needre && n < 0))
406 break;
407 }
408 /* get to next index entry */
409 if(!getfield(entry))
410 break;
411 }
412 }
413 sortaddr(dot);
414 dot->cur = 0;
415 return dot->n;
416 }
417
418 /*
419 * Return offset in index file of first line whose folded
420 * first field has pre as a prefix. -1 if none found.
421 */
422 long
locate(Rune * pre)423 locate(Rune *pre)
424 {
425 long top, bot, mid;
426 Rune entry[Fieldlen];
427
428 if(*pre == 0)
429 return 0;
430 bot = 0;
431 top = indextop;
432 if(debug>1)
433 fprint(2, "locate looking for prefix %S\n", pre);
434 for(;;) {
435 /*
436 * Loop invariant: foldkey(bot) < pre <= foldkey(top)
437 * and bot < top, and bot,top point at beginning of lines
438 */
439 mid = (top+bot) / 2;
440 mid = seeknextline(bindex, mid);
441 if(debug > 1)
442 fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
443 bot, (top+bot) / 2, mid, top);
444 if(mid == top || !getfield(entry))
445 break;
446 if(debug > 1)
447 fprint(2, "key=%S\n", entry);
448 /*
449 * here mid is strictly between bot and top
450 */
451 fold(entry);
452 if(acomp(pre, entry) <= 0)
453 top = mid;
454 else
455 bot = mid;
456 }
457 /*
458 * bot < top, but they don't necessarily point at successive lines
459 * Use linear search from bot to find first line that pre is a
460 * prefix of
461 */
462 while((bot = seeknextline(bindex, bot)) <= top) {
463 if(!getfield(entry))
464 return -1;
465 if(debug > 1)
466 fprint(2, "key=%S\n", entry);
467 fold(entry);
468 switch(acomp(pre, entry)) {
469 case -2:
470 return -1;
471 case -1:
472 case 0:
473 return bot;
474 case 1:
475 case 2:
476 continue;
477 }
478 }
479 return -1;
480
481 }
482
483 /*
484 * Get prefix of non re-metacharacters, runified, into pre,
485 * and return length
486 */
487 int
getpref(char * pat,Rune * pre)488 getpref(char *pat, Rune *pre)
489 {
490 int n, r;
491 char *p;
492
493 p = pat;
494 while(*p) {
495 n = chartorune(pre, p);
496 r = *pre;
497 switch(r) {
498 case L'.': case L'*': case L'+': case L'?':
499 case L'[': case L']': case L'(': case ')':
500 case L'|': case L'^': case L'$':
501 *pre = 0;
502 return p-pat;
503 case L'\\':
504 p += n;
505 p += chartorune(++pre, p);
506 pre++;
507 break;
508 default:
509 p += n;
510 pre++;
511 }
512 }
513 return p-pat;
514 }
515
516 long
seeknextline(Biobuf * b,long off)517 seeknextline(Biobuf *b, long off)
518 {
519 long c;
520
521 Bseek(b, off, 0);
522 do {
523 c = Bgetrune(b);
524 } while(c>=0 && c!='\n');
525 return Boffset(b);
526 }
527
528 /*
529 * Get next field out of index file (either tab- or nl- terminated)
530 * Answer in *rp, assumed to be Fieldlen long.
531 * Return 0 if read error first.
532 */
533 int
getfield(Rune * rp)534 getfield(Rune *rp)
535 {
536 long c;
537 int n;
538
539 for(n=Fieldlen; n-- > 0; ) {
540 if ((c = Bgetrune(bindex)) < 0)
541 return 0;
542 if(c == '\t' || c == '\n') {
543 *rp = L'\0';
544 return 1;
545 }
546 *rp++ = c;
547 }
548 err("word too long");
549 return 0;
550 }
551
552 /*
553 * A compare longs function suitable for qsort
554 */
555 static int
longcmp(void * av,void * bv)556 longcmp(void *av, void *bv)
557 {
558 long v;
559 long *a, *b;
560
561 a = av;
562 b = bv;
563
564 v = *a - *b;
565 if(v < 0)
566 return -1;
567 else if(v == 0)
568 return 0;
569 else
570 return 1;
571 }
572
573 void
sortaddr(Addr * a)574 sortaddr(Addr *a)
575 {
576 int i, j;
577 long v;
578
579 if(a->n <= 1)
580 return;
581
582 qsort(a->doff, a->n, sizeof(long), longcmp);
583
584 /* remove duplicates */
585 for(i=0, j=0; j < a->n; j++) {
586 v = a->doff[j];
587 if(i > 0 && v == a->doff[i-1])
588 continue;
589 a->doff[i++] = v;
590 }
591 a->n = i;
592 }
593
594 Entry
getentry(int i)595 getentry(int i)
596 {
597 long b, e, n;
598 static Entry ans;
599 static int anslen = 0;
600
601 b = dot->doff[i];
602 e = (*dict->nextoff)(b+1);
603 ans.doff = b;
604 if(e < 0) {
605 err("couldn't seek to entry");
606 ans.start = 0;
607 ans.end = 0;
608 } else {
609 n = e-b;
610 if(n+1 > anslen) {
611 ans.start = realloc(ans.start, n+1);
612 if(!ans.start) {
613 err("out of memory");
614 exits("nomem");
615 }
616 anslen = n+1;
617 }
618 Bseek(bdict, b, 0);
619 n = Bread(bdict, ans.start, n);
620 ans.end = ans.start + n;
621 *ans.end = 0;
622 }
623 return ans;
624 }
625
626 void
setdotnext(void)627 setdotnext(void)
628 {
629 long b;
630
631 b = (*dict->nextoff)(dot->doff[dot->cur]+1);
632 if(b < 0) {
633 err("couldn't find a next entry");
634 return;
635 }
636 dot->doff[0] = b;
637 dot->n = 1;
638 dot->cur = 0;
639 }
640
641 void
setdotprev(void)642 setdotprev(void)
643 {
644 int tryback;
645 long here, last, p;
646
647 if(dot->cur < 0 || dot->cur >= dot->n)
648 return;
649 tryback = 2000;
650 here = dot->doff[dot->cur];
651 last = 0;
652 while(last == 0) {
653 p = here - tryback;
654 if(p < 0)
655 p = 0;
656 for(;;) {
657 p = (*dict->nextoff)(p+1);
658 if(p < 0)
659 return; /* shouldn't happen */
660 if(p >= here)
661 break;
662 last = p;
663 }
664 if(!last) {
665 if(here - tryback < 0) {
666 err("can't find a previous entry");
667 return;
668 }
669 tryback = 2*tryback;
670 }
671 }
672 dot->doff[0] = last;
673 dot->n = 1;
674 dot->cur = 0;
675 }
676