xref: /plan9/sys/src/cmd/dict/utils.c (revision eaba85aa6b158bdf68fdb77f770e3ba0899a8b5e)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include "dict.h"
5 
6 Dict dicts[] = {
7 	{"oed",		"Oxford English Dictionary, 2nd Ed.",
8 	 "/lib/dict/oed2",	"/lib/dict/oed2index",
9 	 oednextoff,	oedprintentry,		oedprintkey},
10 	{"ahd",		"American Heritage Dictionary, 2nd College Ed.",
11 	 "/lib/ahd/DICT.DB",	"/lib/ahd/index",
12 	 ahdnextoff,	ahdprintentry,		ahdprintkey},
13 	{"pgw",		"Project Gutenberg Webster Dictionary",
14 	 "/lib/dict/pgw",	"/lib/dict/pgwindex",
15 	 pgwnextoff,	pgwprintentry,		pgwprintkey},
16 	{"thesaurus",	"Collins Thesaurus",
17 	 "/lib/dict/thesaurus",	"/lib/dict/thesindex",
18 	 thesnextoff,	thesprintentry,	thesprintkey},
19 	{"roget",		"Project Gutenberg Roget's Thesaurus",
20 	 "/lib/dict/roget", "/lib/dict/rogetindex",
21 	 rogetnextoff,	rogetprintentry,	rogetprintkey},
22 
23 	{"ce",		"Gendai Chinese->English",
24 	 "/lib/dict/world/sansdata/sandic24.dat",
25 	 "/lib/dict/world/sansdata/ceindex",
26 	 worldnextoff,	worldprintentry,	worldprintkey},
27 	{"ceh",		"Gendai Chinese->English (Hanzi index)",
28 	 "/lib/dict/world/sansdata/sandic24.dat",
29 	 "/lib/dict/world/sansdata/cehindex",
30 	 worldnextoff,	worldprintentry,	worldprintkey},
31 	{"ec",		"Gendai English->Chinese",
32 	 "/lib/dict/world/sansdata/sandic24.dat",
33 	 "/lib/dict/world/sansdata/ecindex",
34 	 worldnextoff,	worldprintentry,	worldprintkey},
35 
36 	{"dae",		"Gyldendal Danish->English",
37 	 "/lib/dict/world/gylddata/sandic30.dat",
38 	 "/lib/dict/world/gylddata/daeindex",
39 	 worldnextoff,	worldprintentry,	worldprintkey},
40 	{"eda",		"Gyldendal English->Danish",
41 	 "/lib/dict/world/gylddata/sandic29.dat",
42 	 "/lib/dict/world/gylddata/edaindex",
43 	 worldnextoff,	worldprintentry,	worldprintkey},
44 
45 	{"due",		"Wolters-Noordhoff Dutch->English",
46 	 "/lib/dict/world/woltdata/sandic07.dat",
47 	 "/lib/dict/world/woltdata/deindex",
48 	 worldnextoff,	worldprintentry,	worldprintkey},
49 	{"edu",		"Wolters-Noordhoff English->Dutch",
50 	 "/lib/dict/world/woltdata/sandic06.dat",
51 	 "/lib/dict/world/woltdata/edindex",
52 	 worldnextoff,	worldprintentry,	worldprintkey},
53 
54 	{"fie",		"WSOY Finnish->English",
55 	 "/lib/dict/world/werndata/sandic32.dat",
56 	 "/lib/dict/world/werndata/fieindex",
57 	 worldnextoff,	worldprintentry,	worldprintkey},
58 	{"efi",		"WSOY English->Finnish",
59 	 "/lib/dict/world/werndata/sandic31.dat",
60 	 "/lib/dict/world/werndata/efiindex",
61 	 worldnextoff,	worldprintentry,	worldprintkey},
62 
63 	{"fe",		"Collins French->English",
64 	 "/lib/dict/fe",	"/lib/dict/feindex",
65 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
66 	{"ef",		"Collins English->French",
67 	 "/lib/dict/ef",	"/lib/dict/efindex",
68 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
69 
70 	{"ge",		"Collins German->English",
71 	 "/lib/dict/ge",	"/lib/dict/geindex",
72 	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
73 	{"eg",		"Collins English->German",
74 	 "/lib/dict/eg",	"/lib/dict/egindex",
75 	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
76 
77 	{"ie",		"Collins Italian->English",
78 	 "/lib/dict/ie",	"/lib/dict/ieindex",
79 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
80 	{"ei",		"Collins English->Italian",
81 	 "/lib/dict/ei",	"/lib/dict/eiindex",
82 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
83 
84 	{"je",		"Sanshusha Japanese->English",
85 	 "/lib/dict/world/sansdata/sandic18.dat",
86 	 "/lib/dict/world/sansdata/jeindex",
87 	 worldnextoff,	worldprintentry,	worldprintkey},
88 	{"jek",		"Sanshusha Japanese->English (Kanji index)",
89 	 "/lib/dict/world/sansdata/sandic18.dat",
90 	 "/lib/dict/world/sansdata/jekindex",
91 	 worldnextoff,	worldprintentry,	worldprintkey},
92 	{"ej",		"Sanshusha English->Japanese",
93 	 "/lib/dict/world/sansdata/sandic18.dat",
94 	 "/lib/dict/world/sansdata/ejindex",
95 	 worldnextoff,	worldprintentry,	worldprintkey},
96 
97 	{"tjeg",	"Sanshusha technical Japanese->English,German",
98 	 "/lib/dict/world/sansdata/sandic16.dat",
99 	 "/lib/dict/world/sansdata/tjegindex",
100 	 worldnextoff,	worldprintentry,	worldprintkey},
101 	{"tjegk",	"Sanshusha technical Japanese->English,German (Kanji index)",
102 	 "/lib/dict/world/sansdata/sandic16.dat",
103 	 "/lib/dict/world/sansdata/tjegkindex",
104 	 worldnextoff,	worldprintentry,	worldprintkey},
105 	{"tegj",	"Sanshusha technical English->German,Japanese",
106 	 "/lib/dict/world/sansdata/sandic16.dat",
107 	 "/lib/dict/world/sansdata/tegjindex",
108 	 worldnextoff,	worldprintentry,	worldprintkey},
109 	{"tgje",	"Sanshusha technical German->Japanese,English",
110 	 "/lib/dict/world/sansdata/sandic16.dat",
111 	 "/lib/dict/world/sansdata/tgjeindex",
112 	 worldnextoff,	worldprintentry,	worldprintkey},
113 
114 	{"ne",		"Kunnskapforlaget Norwegian->English",
115 	 "/lib/dict/world/kunndata/sandic28.dat",
116 	 "/lib/dict/world/kunndata/neindex",
117 	 worldnextoff,	worldprintentry,	worldprintkey},
118 	{"en",		"Kunnskapforlaget English->Norwegian",
119 	 "/lib/dict/world/kunndata/sandic27.dat",
120 	 "/lib/dict/world/kunndata/enindex",
121 	 worldnextoff,	worldprintentry,	worldprintkey},
122 
123 	{"re",		"Leon Ungier Russian->English",
124 	 "/lib/dict/re",	"/lib/dict/reindex",
125 	 simplenextoff,	simpleprintentry,	simpleprintkey},
126 	{"er",		"Leon Ungier English->Russian",
127 	 "/lib/dict/re",	"/lib/dict/erindex",
128 	 simplenextoff,	simpleprintentry,	simpleprintkey},
129 
130 	{"se",		"Collins Spanish->English",
131 	 "/lib/dict/se",	"/lib/dict/seindex",
132 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
133 	{"es",		"Collins English->Spanish",
134 	 "/lib/dict/es",	"/lib/dict/esindex",
135 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
136 
137 	{"swe",		"Esselte Studium Swedish->English",
138 	 "/lib/dict/world/essedata/sandic34.dat",
139 	 "/lib/dict/world/essedata/sweindex",
140 	 worldnextoff,	worldprintentry,	worldprintkey},
141 	{"esw",		"Esselte Studium English->Swedish",
142 	 "/lib/dict/world/essedata/sandic33.dat",
143 	 "/lib/dict/world/essedata/eswindex",
144 	 worldnextoff,	worldprintentry,	worldprintkey},
145 
146 	{"movie",	"Movies -- by title",
147 	 "/lib/movie/data",	"/lib/dict/movtindex",
148 	 movienextoff,	movieprintentry,	movieprintkey},
149 	{"moviea",	"Movies -- by actor",
150 	 "/lib/movie/data",	"/lib/dict/movaindex",
151 	 movienextoff,	movieprintentry,	movieprintkey},
152 	{"movied",	"Movies -- by director",
153 	 "/lib/movie/data",	"/lib/dict/movdindex",
154 	 movienextoff,	movieprintentry,	movieprintkey},
155 
156 	{"slang",	"English Slang",
157 	 "/lib/dict/slang",	"/lib/dict/slangindex",
158 	 slangnextoff,	slangprintentry,	slangprintkey},
159 
160 	{"robert",	"Robert Électronique",
161 	 "/lib/dict/robert/_pointers",	"/lib/dict/robert/_index",
162 	 robertnextoff,	robertindexentry,	robertprintkey},
163 	{"robertv",	"Robert Électronique - formes des verbes",
164 	 "/lib/dict/robert/flex.rob",	"/lib/dict/robert/_flexindex",
165 	 robertnextflex,	robertflexentry,	robertprintkey},
166 
167 	{0, 0, 0, 0, 0}
168 };
169 
170 typedef struct Lig Lig;
171 struct Lig {
172 	Rune	start;		/* accent rune */
173 	Rune	*pairs;		/* <char,accented version> pairs */
174 };
175 
176 static Lig ligtab[Nligs] = {
177 [LACU-LIGS]	{L'´',	L"AÁaáCĆcćEÉeégģIÍiíıíLĹlĺNŃnńOÓoóRŔrŕSŚsśUÚuúYÝyýZŹzź"},
178 [LGRV-LIGS]	{L'ˋ',	L"AÀaàEÈeèIÌiìıìOÒoòUÙuù"},
179 [LUML-LIGS]	{L'¨',	L"AÄaäEËeëIÏiïOÖoöUÜuüYŸyÿ"},
180 [LCED-LIGS]	{L'¸',	L"CÇcçGĢKĶkķLĻlļNŅnņRŖrŗSŞsşTŢtţ"},
181 [LTIL-LIGS]	{L'˜',	L"AÃaãIĨiĩıĩNÑnñOÕoõUŨuũ"},
182 [LBRV-LIGS]	{L'˘',	L"AĂaăEĔeĕGĞgğIĬiĭıĭOŎoŏUŬuŭ"},
183 [LRNG-LIGS]	{L'˚',	L"AÅaåUŮuů"},
184 [LDOT-LIGS]	{L'˙',	L"CĊcċEĖeėGĠgġIİLĿlŀZŻzż"},
185 [LDTB-LIGS]	{L'.',	L""},
186 [LFRN-LIGS]	{L'⌢',	L"AÂaâCĈcĉEÊeêGĜgĝHĤhĥIÎiîıîJĴjĵOÔoôSŜsŝUÛuûWŴwŵYŶyŷ"},
187 [LFRB-LIGS]	{L'̯',	L""},
188 [LOGO-LIGS]	{L'˛',	L"AĄaąEĘeęIĮiįıįUŲuų"},
189 [LMAC-LIGS]	{L'¯',	L"AĀaāEĒeēIĪiīıīOŌoōUŪuū"},
190 [LHCK-LIGS]	{L'ˇ',	L"CČcčDĎdďEĚeěLĽlľNŇnňRŘrřSŠsšTŤtťZŽzž"},
191 [LASP-LIGS]	{L'ʽ',	L""},
192 [LLEN-LIGS]	{L'ʼ',	L""},
193 [LBRB-LIGS]	{L'̮',	L""}
194 };
195 
196 Rune *multitab[Nmulti] = {
197 [MAAS-MULTI]	L"ʽα",
198 [MALN-MULTI]	L"ʼα",
199 [MAND-MULTI]	L"and",
200 [MAOQ-MULTI]	L"a/q",
201 [MBRA-MULTI]	L"<|",
202 [MDD-MULTI]	L"..",
203 [MDDD-MULTI]	L"...",
204 [MEAS-MULTI]	L"ʽε",
205 [MELN-MULTI]	L"ʼε",
206 [MEMM-MULTI]	L"——",
207 [MHAS-MULTI]	L"ʽη",
208 [MHLN-MULTI]	L"ʼη",
209 [MIAS-MULTI]	L"ʽι",
210 [MILN-MULTI]	L"ʼι",
211 [MLCT-MULTI]	L"ct",
212 [MLFF-MULTI]	L"ff",
213 [MLFFI-MULTI]	L"ffi",
214 [MLFFL-MULTI]	L"ffl",
215 [MLFL-MULTI]	L"fl",
216 [MLFI-MULTI]	L"fi",
217 [MLLS-MULTI]	L"ɫɫ",
218 [MLST-MULTI]	L"st",
219 [MOAS-MULTI]	L"ʽο",
220 [MOLN-MULTI]	L"ʼο",
221 [MOR-MULTI]	L"or",
222 [MRAS-MULTI]	L"ʽρ",
223 [MRLN-MULTI]	L"ʼρ",
224 [MTT-MULTI]	L"~~",
225 [MUAS-MULTI]	L"ʽυ",
226 [MULN-MULTI]	L"ʼυ",
227 [MWAS-MULTI]	L"ʽω",
228 [MWLN-MULTI]	L"ʼω",
229 [MOE-MULTI]	L"oe",
230 [MES-MULTI]	L"  ",
231 };
232 
233 static Rune 	*ttabstack[20];
234 static int	ntt;
235 
236 /*
237  * tab is an array of n Assoc's, sorted by key.
238  * Look for key in tab, and return corresponding val
239  * or -1 if not there
240  */
241 long
lookassoc(Assoc * tab,int n,char * key)242 lookassoc(Assoc *tab, int n, char *key)
243 {
244 	Assoc *q;
245 	long i, low, high;
246 	int r;
247 
248 	for(low = -1, high = n; high > low+1; ){
249 		i = (high+low)/2;
250 		q = &tab[i];
251 		if((r=strcmp(key, q->key))<0)
252 			high = i;
253 		else if(r == 0)
254 			return q->val;
255 		else
256 			low=i;
257 	}
258 	return -1;
259 }
260 
261 long
looknassoc(Nassoc * tab,int n,long key)262 looknassoc(Nassoc *tab, int n, long key)
263 {
264 	Nassoc *q;
265 	long i, low, high;
266 
267 	for(low = -1, high = n; high > low+1; ){
268 		i = (high+low)/2;
269 		q = &tab[i];
270 		if(key < q->key)
271 			high = i;
272 		else if(key == q->key)
273 			return q->val;
274 		else
275 			low=i;
276 	}
277 	return -1;
278 }
279 
280 void
err(char * fmt,...)281 err(char *fmt, ...)
282 {
283 	char buf[1000];
284 	va_list v;
285 
286 	va_start(v, fmt);
287 	vsnprint(buf, sizeof(buf), fmt, v);
288 	va_end(v);
289 	fprint(2, "%s: %s\n", argv0, buf);
290 }
291 
292 /*
293  * Write the rune r to bout, keeping track of line length
294  * and breaking the lines (at blanks) when they get too long
295  */
296 void
outrune(long r)297 outrune(long r)
298 {
299 	if(outinhibit)
300 		return;
301 	if(++linelen > breaklen && r == L' ') {
302 		Bputc(bout, '\n');
303 		linelen = 0;
304 	} else
305 		Bputrune(bout, r);
306 }
307 
308 void
outrunes(Rune * rp)309 outrunes(Rune *rp)
310 {
311 	Rune r;
312 
313 	while((r = *rp++) != 0)
314 		outrune(r);
315 }
316 
317 /* like outrune, but when arg is know to be a char */
318 void
outchar(int c)319 outchar(int c)
320 {
321 	if(outinhibit)
322 		return;
323 	if(++linelen > breaklen && c == ' ') {
324 		c ='\n';
325 		linelen = 0;
326 	}
327 	Bputc(bout, c);
328 }
329 
/*
 * Pass each char of the zero-terminated string s to outchar.
 */
void
outchars(char *s)
{
	for(; *s != 0; s++)
		outchar(*s);
}
338 
/*
 * Formatted output through outchars: the arguments are formatted
 * according to fmt into a 1000-byte buffer, which is then written
 * a char at a time so that line breaking still applies.
 */
void
outprint(char *fmt, ...)
{
	va_list args;
	char text[1000];

	va_start(args, fmt);
	vsnprint(text, sizeof text, fmt, args);
	va_end(args);
	outchars(text);
}
350 
/*
 * Output the chars from b up to (not including) e via outchar.
 * Newlines are treated as blanks, and a run of blanks is
 * squeezed down to a single one.
 */
void
outpiece(char *b, char *e)
{
	int ch, prev;

	for(prev = 0; b < e; b++){
		ch = *b;
		if(ch == '\n')
			ch = ' ';
		/* drop a blank that directly follows another blank */
		if(ch != ' ' || prev != ' ')
			outchar(ch);
		prev = ch;
	}
}
366 
367 /*
368  * Go to new line if not already there; indent if ind != 0.
369  * If ind > 1, leave a blank line too.
370  * Slight hack: assume if current line is only one or two
371  * characters long, then they were spaces.
372  */
373 void
outnl(int ind)374 outnl(int ind)
375 {
376 	if(outinhibit)
377 		return;
378 	if(ind) {
379 		if(ind > 1) {
380 			if(linelen > 2)
381 				Bputc(bout, '\n');
382 			Bprint(bout, "\n  ");
383 		} else if(linelen == 0)
384 			Bprint(bout, "  ");
385 		else if(linelen == 1)
386 			Bputc(bout, ' ');
387 		else if(linelen != 2)
388 			Bprint(bout, "\n  ");
389 		linelen = 2;
390 	} else {
391 		if(linelen) {
392 			Bputc(bout, '\n');
393 			linelen = 0;
394 		}
395 	}
396 }
397 
398 /*
399  * Fold the runes in null-terminated rp.
400  * Use the sort(1) definition of folding (uppercase to lowercase,
401  * accented characters to corresponding unaccented chars)
402  */
403 void
fold(Rune * rp)404 fold(Rune *rp)
405 {
406 	Rune r;
407 
408 	while((r = *rp) != 0) {
409 		r = tobaserune(r);
410 		if(isupperrune(r))
411 			r = tolowerrune(r);
412 		*rp++ = r;
413 	}
414 }
415 
416 /*
417  * Like fold, but put folded result into new
418  * (assumed to have enough space).
419  * old is a regular expression, but we know that
420  * metacharacters aren't affected
421  */
422 void
foldre(char * new,char * old)423 foldre(char *new, char *old)
424 {
425 	Rune r;
426 
427 	while(*old) {
428 		old += chartorune(&r, old);
429 		r = tobaserune(r);
430 		if(isupperrune(r))
431 			r = tolowerrune(r);
432 		new += runetochar(new, &r);
433 	}
434 	*new = 0;
435 }
436 
437 /*
438  *	acomp(s, t) returns:
439  *		-2 if s strictly precedes t
440  *		-1 if s is a prefix of t
441  *		0 if s is the same as t
442  *		1 if t is a prefix of s
443  *		2 if t strictly precedes s
444  */
445 
446 int
acomp(Rune * s,Rune * t)447 acomp(Rune *s, Rune *t)
448 {
449 	int cs, ct;
450 
451 	for(;;) {
452 		cs = *s;
453 		ct = *t;
454 		if(cs != ct)
455 			break;
456 		if(cs == 0)
457 			return 0;
458 		s++;
459 		t++;
460 	}
461 	if(cs == 0)
462 		return -1;
463 	if(ct == 0)
464 		return 1;
465 	if(cs < ct)
466 		return -2;
467 	return 2;
468 }
469 
470 /*
471  * Copy null terminated Runes from 'from' to 'to'.
472  */
473 void
runescpy(Rune * to,Rune * from)474 runescpy(Rune *to, Rune *from)
475 {
476 	while((*to++ = *from++) != 0)
477 		continue;
478 }
479 
480 /*
481  * Conversion of unsigned number to long, no overflow detection
482  */
483 long
runetol(Rune * r)484 runetol(Rune *r)
485 {
486 	int c;
487 	long n;
488 
489 	n = 0;
490 	for(;; r++){
491 		c = *r;
492 		if(L'0'<=c && c<=L'9')
493 			c -= '0';
494 		else
495 			break;
496 		n = n*10 + c;
497 	}
498 	return n;
499 }
500 
501 /*
502  * See if there is a rune corresponding to the accented
503  * version of r with accent acc (acc in [LIGS..LIGE-1]),
504  * and return it if so, else return NONE.
505  */
506 Rune
liglookup(Rune acc,Rune r)507 liglookup(Rune acc, Rune r)
508 {
509 	Rune *p;
510 
511 	if(acc < LIGS || acc >= LIGE)
512 		return NONE;
513 	for(p = ligtab[acc-LIGS].pairs; *p; p += 2)
514 		if(*p == r)
515 			return *(p+1);
516 	return NONE;
517 }
518 
519 /*
520  * Maintain a translation table stack (a translation table
521  * is an array of Runes indexed by bytes or 7-bit bytes).
522  * If starting is true, push the curtab onto the stack
523  * and return newtab; else pop the top of the stack and
524  * return it.
525  * If curtab is 0, initialize the stack and return.
526  */
527 Rune *
changett(Rune * curtab,Rune * newtab,int starting)528 changett(Rune *curtab, Rune *newtab, int starting)
529 {
530 	if(curtab == 0) {
531 		ntt = 0;
532 		return 0;
533 	}
534 	if(starting) {
535 		if(ntt >= asize(ttabstack)) {
536 			if(debug)
537 				err("translation stack overflow");
538 			return curtab;
539 		}
540 		ttabstack[ntt++] = curtab;
541 		return newtab;
542 	} else {
543 		if(ntt == 0) {
544 			if(debug)
545 				err("translation stack underflow");
546 			return curtab;
547 		}
548 		return ttabstack[--ntt];
549 	}
550 }
551