xref: /plan9/sys/src/cmd/dict/dict.c (revision 705edaf815611fc3f2ae8bff889936d14f63b56f)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <regexp.h>
5 #include <ctype.h>
6 #include "dict.h"
7 
8 /*
9  * Assumed index file structure: lines of form
10  * 	[^\t]+\t[0-9]+
11  * First field is key, second is byte offset into dictionary.
12  * Should be sorted with args -u -t'	' +0f -1 +0 -1 +1n -2
13  */
14 typedef struct Addr Addr;
15 
16 struct Addr {
17 	int	n;		/* number of offsets */
18 	int	cur;		/* current position within doff array */
19 	int	maxn;		/* actual current size of doff array */
20 	ulong	doff[1];	/* doff[maxn], with 0..n-1 significant */
21 };
22 
23 Biobuf	binbuf;
24 Biobuf	boutbuf;
25 Biobuf	*bin = &binbuf;		/* user cmd input */
26 Biobuf	*bout = &boutbuf;	/* output */
27 Biobuf	*bdict;			/* dictionary */
28 Biobuf	*bindex;		/* index file */
29 long	indextop;		/* index offset at end of file */
30 int	lastcmd;		/* last executed command */
31 Addr	*dot;			/* "current" address */
32 Dict	*dict;			/* current dictionary */
33 int	linelen;
34 int	breaklen = 60;
35 int	outinhibit;
36 int	debug;
37 
38 void	execcmd(int);
39 int	getpref(char*, Rune*);
40 Entry	getentry(int);
41 int	getfield(Rune*);
42 long	locate(Rune*);
43 int	parseaddr(char*, char**);
44 int	parsecmd(char*);
45 int	search(char*, int);
46 long	seeknextline(Biobuf*, long);
47 void	setdotnext(void);
48 void	setdotprev(void);
49 void	sortaddr(Addr*);
50 void	usage(void);
51 
52 enum {
53 	Plen=300,	/* max length of a search pattern */
54 	Fieldlen=200,	/* max length of an index field */
55 	Aslots=10,	/* initial number of slots in an address */
56 };
57 
58 void
main(int argc,char ** argv)59 main(int argc, char **argv)
60 {
61 	int i, cmd, kflag;
62 	char *line, *p;
63 
64 	Binit(&binbuf, 0, OREAD);
65 	Binit(&boutbuf, 1, OWRITE);
66 	kflag = 0;
67 	line = 0;
68 	dict = 0;
69 	for(i=0; dicts[i].name; i++){
70 		if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){
71 			dict = &dicts[i];
72 			break;
73 		}
74 	}
75 	ARGBEGIN {
76 		case 'd':
77 			p = ARGF();
78 			dict = 0;
79 			if(p) {
80 				for(i=0; dicts[i].name; i++)
81 					if(strcmp(p, dicts[i].name)==0) {
82 						dict = &dicts[i];
83 						break;
84 					}
85 			}
86 			if(!dict)
87 				usage();
88 			break;
89 		case 'c':
90 			line = ARGF();
91 			if(!line)
92 				usage();
93 			break;
94 		case 'k':
95 			kflag++;
96 			break;
97 		case 'D':
98 			debug++;
99 			break;
100 		default:
101 			usage();
102 	ARGEND }
103 	if(dict == 0){
104 		err("no dictionaries present on this system");
105 		exits("nodict");
106 	}
107 
108 	if(kflag) {
109 		(*dict->printkey)();
110 		exits(0);
111 	}
112 	if(argc > 1)
113 		usage();
114 	else if(argc == 1) {
115 		if(line)
116 			usage();
117 		p = argv[0];
118 		line = malloc(strlen(p)+5);
119 		sprint(line, "/%s/P\n", p);
120 	}
121 	bdict = Bopen(dict->path, OREAD);
122 	if(!bdict) {
123 		err("can't open dictionary %s", dict->path);
124 		exits("nodict");
125 	}
126 	bindex = Bopen(dict->indexpath, OREAD);
127 	if(!bindex) {
128 		err("can't open index %s", dict->indexpath);
129 		exits("noindex");
130 	}
131 	indextop = Bseek(bindex, 0L, 2);
132 
133 	dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
134 	dot->n = 0;
135 	dot->cur = 0;
136 	dot->maxn = Aslots;
137 	lastcmd = 0;
138 
139 	if(line) {
140 		cmd = parsecmd(line);
141 		if(cmd)
142 			execcmd(cmd);
143 	} else {
144 		for(;;) {
145 			Bprint(bout, "*");
146 			Bflush(bout);
147 			line = Brdline(bin, '\n');
148 			linelen = 0;
149 			if(!line)
150 				break;
151 			cmd = parsecmd(line);
152 			if(cmd) {
153 				execcmd(cmd);
154 				lastcmd = cmd;
155 			}
156 		}
157 	}
158 	exits(0);
159 }
160 
161 void
usage(void)162 usage(void)
163 {
164 	int i;
165 	char *a, *b;
166 
167 	Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
168 	Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
169 	for(i = 0; dicts[i].name; i++){
170 		a = b = "";
171 		if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){
172 			a = "[";
173 			b = "]";
174 		}
175 		Bprint(bout, "   %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
176 	}
177 	exits("usage");
178 }
179 
180 int
parsecmd(char * line)181 parsecmd(char *line)
182 {
183 	char *e;
184 	int cmd, ans;
185 
186 	if(parseaddr(line, &e) >= 0)
187 		line = e;
188 	else
189 		return 0;
190 	cmd = *line;
191 	ans = cmd;
192 	if(isupper(cmd))
193 		cmd = tolower(cmd);
194 	if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
195 	     cmd == '\n')) {
196 		err("unknown command %c", cmd);
197 		return 0;
198 	}
199 	if(cmd == '\n')
200 		switch(lastcmd) {
201 			case 0:	ans = 'H'; break;
202 			case 'H':	ans = 'p'; break;
203 			default :	ans = lastcmd; break;
204 		}
205 	else if(line[1] != '\n' && line[1] != 0)
206 		err("extra stuff after command %c ignored", cmd);
207 	return ans;
208 }
209 
210 void
execcmd(int cmd)211 execcmd(int cmd)
212 {
213 	Entry e;
214 	int cur, doall;
215 
216 	if(isupper(cmd)) {
217 		doall = 1;
218 		cmd = tolower(cmd);
219 		cur = 0;
220 	} else {
221 		doall = 0;
222 		cur = dot->cur;
223 	}
224 
225 	if(debug && doall && cmd == 'a')
226 		Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
227 	for(;;){
228 		if(cur >= dot->n)
229 			break;
230 		if(doall) {
231 			Bprint(bout, "%d\t", cur+1);
232 			linelen += 4 + (cur >= 10);
233 		}
234 		switch(cmd) {
235 		case 'a':
236 			Bprint(bout, "#%lud\n", dot->doff[cur]);
237 			break;
238 		case 'h':
239 		case 'p':
240 		case 'r':
241 			e = getentry(cur);
242 			(*dict->printentry)(e, cmd);
243 			break;
244 		}
245 		cur++;
246 		if(doall) {
247 			if(cmd == 'p' || cmd == 'r') {
248 				Bputc(bout, '\n');
249 				linelen = 0;
250 			}
251 		} else
252 			break;
253 	}
254 	if(cur >= dot->n)
255 		cur = 0;
256 	dot->cur = cur;
257 }
258 
259 /*
260  * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
261  * Answer goes in dot.
262  * Return -1 if address starts, but get error.
263  * Return 0 if no address.
264  */
265 int
parseaddr(char * line,char ** eptr)266 parseaddr(char *line, char **eptr)
267 {
268 	int delim, plen;
269 	ulong v;
270 	char *e;
271 	char pat[Plen];
272 
273 	if(*line == '/' || *line == '!') {
274 		/* anchored regular expression match; '!' means no folding */
275 		if(*line == '/') {
276 			delim = '/';
277 			e = strpbrk(line+1, "/\n");
278 		} else {
279 			delim = '!';
280 			e = strpbrk(line+1, "!\n");
281 		}
282 		plen = e-line-1;
283 		if(plen >= Plen-3) {
284 			err("pattern too big");
285 			return -1;
286 		}
287 		pat[0] = '^';
288 		memcpy(pat+1, line+1, plen);
289 		pat[plen+1] = '$';
290 		pat[plen+2] = 0;
291 		if(*e == '\n')
292 			line = e;
293 		else
294 			line = e+1;
295 		if(!search(pat, delim == '/')) {
296 			err("pattern not found");
297 			return -1;
298 		}
299 	} else if(*line == '#') {
300 		/* absolute byte offset into dictionary */
301 		line++;
302 		if(!isdigit(*line))
303 			return -1;
304 		v = strtoul(line, &e, 10);
305 		line = e;
306 		dot->doff[0] = v;
307 		dot->n = 1;
308 		dot->cur = 0;
309 	} else if(isdigit(*line)) {
310 		v = strtoul(line, &e, 10);
311 		line = e;
312 		if(v < 1 || v > dot->n)
313 			err(".%d not in range [1,%d], ignored",
314 				v, dot->n);
315 		else
316 			dot->cur = v-1;
317 	} else if(*line == '.') {
318 		line++;
319 	} else {
320 		*eptr = line;
321 		return 0;
322 	}
323 	while(*line == '+' || *line == '-') {
324 		if(*line == '+')
325 			setdotnext();
326 		else
327 			setdotprev();
328 		line++;
329 	}
330 	*eptr = line;
331 	return 1;
332 }
333 
334 /*
335  * Index file is sorted by folded field1.
336  * Method: find pre, a folded prefix of r.e. pat,
337  * and then low = offset to beginning of
338  * line in index file where first match of prefix occurs.
339  * Then go through index until prefix no longer matches,
340  * adding each line that matches real pattern to dot.
341  * Finally, sort dot offsets (uniquing).
342  * We know pat len < Plen, and that it is surrounded by ^..$
343  */
344 int
search(char * pat,int dofold)345 search(char *pat, int dofold)
346 {
347 	int needre, prelen, match, n;
348 	Reprog *re;
349 	long ioff, v;
350 	Rune pre[Plen];
351 	Rune lit[Plen];
352 	Rune entry[Fieldlen];
353 	char fpat[Plen];
354 
355 	prelen = getpref(pat+1, pre);
356 	if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
357 		runescpy(lit, pre);
358 		if(dofold)
359 			fold(lit);
360 		needre = 0;
361 		SET(re);
362 	} else {
363 		needre = 1;
364 		if(dofold) {
365 			foldre(fpat, pat);
366 			re = regcomp(fpat);
367 		} else
368 			re = regcomp(pat);
369 	}
370 	fold(pre);
371 	ioff = locate(pre);
372 	if(ioff < 0)
373 		return 0;
374 	dot->n = 0;
375 	Bseek(bindex, ioff, 0);
376 	for(;;) {
377 		if(!getfield(entry))
378 			break;
379 		if(dofold)
380 			fold(entry);
381 		if(needre)
382 			match = rregexec(re, entry, 0, 0);
383 		else
384 			match = (acomp(lit, entry) == 0);
385 		if(match) {
386 			if(!getfield(entry))
387 				break;
388 			v = runetol(entry);
389 			if(dot->n >= dot->maxn) {
390 				n = 2*dot->maxn;
391 				dot = realloc(dot,
392 					sizeof(Addr)+(n-1)*sizeof(long));
393 				if(!dot) {
394 					err("out of memory");
395 					exits("nomem");
396 				}
397 				dot->maxn = n;
398 			}
399 			dot->doff[dot->n++] = v;
400 		} else {
401 			if(!dofold)
402 				fold(entry);
403 			if(*pre) {
404 				n = acomp(pre, entry);
405 				if(n < -1 || (!needre && n < 0))
406 					break;
407 			}
408 			/* get to next index entry */
409 			if(!getfield(entry))
410 				break;
411 		}
412 	}
413 	sortaddr(dot);
414 	dot->cur = 0;
415 	return dot->n;
416 }
417 
418 /*
419  * Return offset in index file of first line whose folded
420  * first field has pre as a prefix.  -1 if none found.
421  */
422 long
locate(Rune * pre)423 locate(Rune *pre)
424 {
425 	long top, bot, mid;
426 	Rune entry[Fieldlen];
427 
428 	if(*pre == 0)
429 		return 0;
430 	bot = 0;
431 	top = indextop;
432 	if(debug>1)
433 		fprint(2, "locate looking for prefix %S\n", pre);
434 	for(;;) {
435 		/*
436 		 * Loop invariant: foldkey(bot) < pre <= foldkey(top)
437 		 * and bot < top, and bot,top point at beginning of lines
438 		 */
439 		mid = (top+bot) / 2;
440 		mid = seeknextline(bindex, mid);
441 		if(debug > 1)
442 			fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
443 				bot, (top+bot) / 2, mid, top);
444 		if(mid == top || !getfield(entry))
445 			break;
446 		if(debug > 1)
447 			fprint(2, "key=%S\n", entry);
448 		/*
449 		 * here mid is strictly between bot and top
450 		 */
451 		fold(entry);
452 		if(acomp(pre, entry) <= 0)
453 			top = mid;
454 		else
455 			bot = mid;
456 	}
457 	/*
458 	 * bot < top, but they don't necessarily point at successive lines
459 	 * Use linear search from bot to find first line that pre is a
460 	 * prefix of
461 	 */
462 	while((bot = seeknextline(bindex, bot)) <= top) {
463 		if(!getfield(entry))
464 			return -1;
465 		if(debug > 1)
466 			fprint(2, "key=%S\n", entry);
467 		fold(entry);
468 		switch(acomp(pre, entry)) {
469 		case -2:
470 			return -1;
471 		case -1:
472 		case 0:
473 			return bot;
474 		case 1:
475 		case 2:
476 			continue;
477 		}
478 	}
479 	return -1;
480 
481 }
482 
483 /*
484  * Get prefix of non re-metacharacters, runified, into pre,
485  * and return length
486  */
487 int
getpref(char * pat,Rune * pre)488 getpref(char *pat, Rune *pre)
489 {
490 	int n, r;
491 	char *p;
492 
493 	p = pat;
494 	while(*p) {
495 		n = chartorune(pre, p);
496 		r = *pre;
497 		switch(r) {
498 		case L'.': case L'*': case L'+': case L'?':
499 		case L'[': case L']': case L'(': case ')':
500 		case L'|': case L'^': case L'$':
501 			*pre = 0;
502 			return p-pat;
503 		case L'\\':
504 			p += n;
505 			p += chartorune(++pre, p);
506 			pre++;
507 			break;
508 		default:
509 			p += n;
510 			pre++;
511 		}
512 	}
513 	return p-pat;
514 }
515 
516 long
seeknextline(Biobuf * b,long off)517 seeknextline(Biobuf *b, long off)
518 {
519 	long c;
520 
521 	Bseek(b, off, 0);
522 	do {
523 		c = Bgetrune(b);
524 	} while(c>=0 && c!='\n');
525 	return Boffset(b);
526 }
527 
528 /*
529  * Get next field out of index file (either tab- or nl- terminated)
530  * Answer in *rp, assumed to be Fieldlen long.
531  * Return 0 if read error first.
532  */
533 int
getfield(Rune * rp)534 getfield(Rune *rp)
535 {
536 	long c;
537 	int n;
538 
539 	for(n=Fieldlen; n-- > 0; ) {
540 		if ((c = Bgetrune(bindex)) < 0)
541 			return 0;
542 		if(c == '\t' || c == '\n') {
543 			*rp = L'\0';
544 			return 1;
545 		}
546 		*rp++ = c;
547 	}
548 	err("word too long");
549 	return 0;
550 }
551 
552 /*
553  * A compare longs function suitable for qsort
554  */
555 static int
longcmp(void * av,void * bv)556 longcmp(void *av, void *bv)
557 {
558 	long v;
559 	long *a, *b;
560 
561 	a = av;
562 	b = bv;
563 
564 	v = *a - *b;
565 	if(v < 0)
566 		return -1;
567 	else if(v == 0)
568 		return 0;
569 	else
570 		return 1;
571 }
572 
573 void
sortaddr(Addr * a)574 sortaddr(Addr *a)
575 {
576 	int i, j;
577 	long v;
578 
579 	if(a->n <= 1)
580 		return;
581 
582 	qsort(a->doff, a->n, sizeof(long), longcmp);
583 
584 	/* remove duplicates */
585 	for(i=0, j=0; j < a->n; j++) {
586 		v = a->doff[j];
587 		if(i > 0 && v == a->doff[i-1])
588 			continue;
589 		a->doff[i++] = v;
590 	}
591 	a->n = i;
592 }
593 
594 Entry
getentry(int i)595 getentry(int i)
596 {
597 	long b, e, n;
598 	static Entry ans;
599 	static int anslen = 0;
600 
601 	b = dot->doff[i];
602 	e = (*dict->nextoff)(b+1);
603 	ans.doff = b;
604 	if(e < 0) {
605 		err("couldn't seek to entry");
606 		ans.start = 0;
607 		ans.end = 0;
608 	} else {
609 		n = e-b;
610 		if(n+1 > anslen) {
611 			ans.start = realloc(ans.start, n+1);
612 			if(!ans.start) {
613 				err("out of memory");
614 				exits("nomem");
615 			}
616 			anslen = n+1;
617 		}
618 		Bseek(bdict, b, 0);
619 		n = Bread(bdict, ans.start, n);
620 		ans.end = ans.start + n;
621 		*ans.end = 0;
622 	}
623 	return ans;
624 }
625 
626 void
setdotnext(void)627 setdotnext(void)
628 {
629 	long b;
630 
631 	b = (*dict->nextoff)(dot->doff[dot->cur]+1);
632 	if(b < 0) {
633 		err("couldn't find a next entry");
634 		return;
635 	}
636 	dot->doff[0] = b;
637 	dot->n = 1;
638 	dot->cur = 0;
639 }
640 
641 void
setdotprev(void)642 setdotprev(void)
643 {
644 	int tryback;
645 	long here, last, p;
646 
647 	if(dot->cur < 0 || dot->cur >= dot->n)
648 		return;
649 	tryback = 2000;
650 	here = dot->doff[dot->cur];
651 	last = 0;
652 	while(last == 0) {
653 		p = here - tryback;
654 		if(p < 0)
655 			p = 0;
656 		for(;;) {
657 			p = (*dict->nextoff)(p+1);
658 			if(p < 0)
659 				return; /* shouldn't happen */
660 			if(p >= here)
661 				break;
662 			last = p;
663 		}
664 		if(!last) {
665 			if(here - tryback < 0) {
666 				err("can't find a previous entry");
667 				return;
668 			}
669 			tryback = 2*tryback;
670 		}
671 	}
672 	dot->doff[0] = last;
673 	dot->n = 1;
674 	dot->cur = 0;
675 }
676