xref: /plan9-contrib/sys/src/cmd/dict/utils.c (revision d46c239f8612929b7dbade67d0d071633df3a15d)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include "dict.h"
5 
/*
 * Table of the dictionaries this program knows about.
 * Each entry gives, in order: a short name, a descriptive title,
 * two file paths (the second of each pair is named as an index
 * file), and three per-format functions (named *nextoff,
 * *printentry and *printkey).  The Dict field names themselves
 * are declared in dict.h and are not visible here.
 * Several entries share one data file and differ only in the
 * index file (for example je, jek and ej).
 * The table ends with an all-zero entry.
 */
6 Dict dicts[] = {
7 	{"oed",		"Oxford English Dictionary, 2nd Ed.",
8 	 "/lib/dict/oed2",	"/lib/dict/oed2index",
9 	 oednextoff,	oedprintentry,		oedprintkey},
10 	{"ahd",		"American Heritage Dictionary, 2nd College Ed.",
11 	 "/lib/ahd/DICT.DB",	"/lib/ahd/index",
12 	 ahdnextoff,	ahdprintentry,		ahdprintkey},
13 	{"thesaurus",	"Collins Thesaurus",
14 	 "/lib/dict/thesaurus",	"/lib/dict/thesindex",
15 	 thesnextoff,	thesprintentry,	thesprintkey},
16 
17 	{"ce",		"Gendai Chinese->English",
18 	 "/lib/dict/world/sansdata/sandic24.dat",
19 	 "/lib/dict/world/sansdata/ceindex",
20 	 worldnextoff,	worldprintentry,	worldprintkey},
21 	{"ceh",		"Gendai Chinese->English (Hanzi index)",
22 	 "/lib/dict/world/sansdata/sandic24.dat",
23 	 "/lib/dict/world/sansdata/cehindex",
24 	 worldnextoff,	worldprintentry,	worldprintkey},
25 	{"ec",		"Gendai English->Chinese",
26 	 "/lib/dict/world/sansdata/sandic24.dat",
27 	 "/lib/dict/world/sansdata/ecindex",
28 	 worldnextoff,	worldprintentry,	worldprintkey},
29 
30 	{"dae",		"Gyldendal Danish->English",
31 	 "/lib/dict/world/gylddata/sandic30.dat",
32 	 "/lib/dict/world/gylddata/daeindex",
33 	 worldnextoff,	worldprintentry,	worldprintkey},
34 	{"eda",		"Gyldendal English->Danish",
35 	 "/lib/dict/world/gylddata/sandic29.dat",
36 	 "/lib/dict/world/gylddata/edaindex",
37 	 worldnextoff,	worldprintentry,	worldprintkey},
38 
39 	{"due",		"Wolters-Noordhoff Dutch->English",
40 	 "/lib/dict/world/woltdata/sandic07.dat",
41 	 "/lib/dict/world/woltdata/deindex",
42 	 worldnextoff,	worldprintentry,	worldprintkey},
43 	{"edu",		"Wolters-Noordhoff English->Dutch",
44 	 "/lib/dict/world/woltdata/sandic06.dat",
45 	 "/lib/dict/world/woltdata/edindex",
46 	 worldnextoff,	worldprintentry,	worldprintkey},
47 
48 	{"fie",		"WSOY Finnish->English",
49 	 "/lib/dict/world/werndata/sandic32.dat",
50 	 "/lib/dict/world/werndata/fieindex",
51 	 worldnextoff,	worldprintentry,	worldprintkey},
52 	{"efi",		"WSOY English->Finnish",
53 	 "/lib/dict/world/werndata/sandic31.dat",
54 	 "/lib/dict/world/werndata/efiindex",
55 	 worldnextoff,	worldprintentry,	worldprintkey},
56 
57 	{"fe",		"Collins French->English",
58 	 "/lib/dict/fe",	"/lib/dict/feindex",
59 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
60 	{"ef",		"Collins English->French",
61 	 "/lib/dict/ef",	"/lib/dict/efindex",
62 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
63 
64 	{"ge",		"Collins German->English",
65 	 "/lib/dict/ge",	"/lib/dict/geindex",
66 	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
67 	{"eg",		"Collins English->German",
68 	 "/lib/dict/eg",	"/lib/dict/egindex",
69 	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
70 
71 	{"ie",		"Collins Italian->English",
72 	 "/lib/dict/ie",	"/lib/dict/ieindex",
73 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
74 	{"ei",		"Collins English->Italian",
75 	 "/lib/dict/ei",	"/lib/dict/eiindex",
76 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
77 
78 	{"je",		"Sanshusha Japanese->English",
79 	 "/lib/dict/world/sansdata/sandic18.dat",
80 	 "/lib/dict/world/sansdata/jeindex",
81 	 worldnextoff,	worldprintentry,	worldprintkey},
82 	{"jek",		"Sanshusha Japanese->English (Kanji index)",
83 	 "/lib/dict/world/sansdata/sandic18.dat",
84 	 "/lib/dict/world/sansdata/jekindex",
85 	 worldnextoff,	worldprintentry,	worldprintkey},
86 	{"ej",		"Sanshusha English->Japanese",
87 	 "/lib/dict/world/sansdata/sandic18.dat",
88 	 "/lib/dict/world/sansdata/ejindex",
89 	 worldnextoff,	worldprintentry,	worldprintkey},
90 
91 	{"tjeg",	"Sanshusha technical Japanese->English,German",
92 	 "/lib/dict/world/sansdata/sandic16.dat",
93 	 "/lib/dict/world/sansdata/tjegindex",
94 	 worldnextoff,	worldprintentry,	worldprintkey},
95 	{"tjegk",	"Sanshusha technical Japanese->English,German (Kanji index)",
96 	 "/lib/dict/world/sansdata/sandic16.dat",
97 	 "/lib/dict/world/sansdata/tjegkindex",
98 	 worldnextoff,	worldprintentry,	worldprintkey},
99 	{"tegj",	"Sanshusha technical English->German,Japanese",
100 	 "/lib/dict/world/sansdata/sandic16.dat",
101 	 "/lib/dict/world/sansdata/tegjindex",
102 	 worldnextoff,	worldprintentry,	worldprintkey},
103 	{"tgje",	"Sanshusha technical German->Japanese,English",
104 	 "/lib/dict/world/sansdata/sandic16.dat",
105 	 "/lib/dict/world/sansdata/tgjeindex",
106 	 worldnextoff,	worldprintentry,	worldprintkey},
107 
108 	{"ne",		"Kunnskapforlaget Norwegian->English",
109 	 "/lib/dict/world/kunndata/sandic28.dat",
110 	 "/lib/dict/world/kunndata/neindex",
111 	 worldnextoff,	worldprintentry,	worldprintkey},
112 	{"en",		"Kunnskapforlaget English->Norwegian",
113 	 "/lib/dict/world/kunndata/sandic27.dat",
114 	 "/lib/dict/world/kunndata/enindex",
115 	 worldnextoff,	worldprintentry,	worldprintkey},
116 
117 	{"re",		"Leon Ungier Russian->English",
118 	 "/lib/dict/re",	"/lib/dict/reindex",
119 	 simplenextoff,	simpleprintentry,	simpleprintkey},
120 	{"er",		"Leon Ungier English->Russian",
121 	 "/lib/dict/re",	"/lib/dict/erindex",
122 	 simplenextoff,	simpleprintentry,	simpleprintkey},
123 
124 	{"se",		"Collins Spanish->English",
125 	 "/lib/dict/se",	"/lib/dict/seindex",
126 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
127 	{"es",		"Collins English->Spanish",
128 	 "/lib/dict/es",	"/lib/dict/esindex",
129 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
130 
131 	{"swe",		"Esselte Studium Swedish->English",
132 	 "/lib/dict/world/essedata/sandic34.dat",
133 	 "/lib/dict/world/essedata/sweindex",
134 	 worldnextoff,	worldprintentry,	worldprintkey},
135 	{"esw",		"Esselte Studium English->Swedish",
136 	 "/lib/dict/world/essedata/sandic33.dat",
137 	 "/lib/dict/world/essedata/eswindex",
138 	 worldnextoff,	worldprintentry,	worldprintkey},
139 
140 	{"movie",	"Movies -- by title",
141 	 "/lib/movie/data",	"/lib/dict/movtindex",
142 	 movienextoff,	movieprintentry,	movieprintkey},
143 	{"moviea",	"Movies -- by actor",
144 	 "/lib/movie/data",	"/lib/dict/movaindex",
145 	 movienextoff,	movieprintentry,	movieprintkey},
146 	{"movied",	"Movies -- by director",
147 	 "/lib/movie/data",	"/lib/dict/movdindex",
148 	 movienextoff,	movieprintentry,	movieprintkey},
149 
150 	{"slang",	"English Slang",
151 	 "/lib/dict/slang",	"/lib/dict/slangindex",
152 	 slangnextoff,	slangprintentry,	slangprintkey},
153 
154 	{"robert",	"Robert Électronique",
155 	 "/lib/dict/robert/_pointers",	"/lib/dict/robert/_index",
156 	 robertnextoff,	robertindexentry,	robertprintkey},
157 	{"robertv",	"Robert Électronique - formes des verbes",
158 	 "/lib/dict/robert/flex.rob",	"/lib/dict/robert/_flexindex",
159 	 robertnextflex,	robertflexentry,	robertprintkey},
160 
161 	{0, 0, 0, 0, 0}
162 };
163 
/*
 * One accent and the characters it combines with.
 * pairs is a 0-terminated Rune string read two runes at a time
 * (see liglookup): a base character followed by its accented form.
 */
164 typedef struct Lig Lig;
165 struct Lig {
166 	Rune	start;		/* accent rune */
167 	Rune	*pairs;		/* <char,accented version> pairs */
168 };
169 
/*
 * Accent table, indexed by accent code minus LIGS; searched by
 * liglookup.  An entry with an empty pairs string has no
 * precomposed forms, so liglookup returns NONE for that accent.
 */
170 static Lig ligtab[Nligs] = {
171 [LACU-LIGS]	{L'´',	L"AÁaáCĆcćEÉeégģIÍiíıíLĹlĺNŃnńOÓoóRŔrŕSŚsśUÚuúYÝyýZŹzź"},
172 [LGRV-LIGS]	{L'ˋ',	L"AÀaàEÈeèIÌiìıìOÒoòUÙuù"},
173 [LUML-LIGS]	{L'¨',	L"AÄaäEËeëIÏiïOÖoöUÜuüYŸyÿ"},
174 [LCED-LIGS]	{L'¸',	L"CÇcçGĢKĶkķLĻlļNŅnņRŖrŗSŞsşTŢtţ"},
175 [LTIL-LIGS]	{L'˜',	L"AÃaãIĨiĩıĩNÑnñOÕoõUŨuũ"},
176 [LBRV-LIGS]	{L'˘',	L"AĂaăEĔeĕGĞgğIĬiĭıĭOŎoŏUŬuŭ"},
177 [LRNG-LIGS]	{L'˚',	L"AÅaåUŮuů"},
178 [LDOT-LIGS]	{L'˙',	L"CĊcċEĖeėGĠgġIİLĿlŀZŻzż"},
179 [LDTB-LIGS]	{L'.',	L""},
180 [LFRN-LIGS]	{L'⌢',	L"AÂaâCĈcĉEÊeêGĜgĝHĤhĥIÎiîıîJĴjĵOÔoôSŜsŝUÛuûWŴwŵYŶyŷ"},
181 [LFRB-LIGS]	{L'̯',	L""},
182 [LOGO-LIGS]	{L'˛',	L"AĄaąEĘeęIĮiįıįUŲuų"},
183 [LMAC-LIGS]	{L'¯',	L"AĀaāEĒeēIĪiīıīOŌoōUŪuū"},
184 [LHCK-LIGS]	{L'ˇ',	L"CČcčDĎdďEĚeěLĽlľNŇnňRŘrřSŠsšTŤtťZŽzž"},
185 [LASP-LIGS]	{L'ʽ',	L""},
186 [LLEN-LIGS]	{L'ʼ',	L""},
187 [LBRB-LIGS]	{L'̮',	L""}
188 };
189 
/*
 * Multi-rune expansions, indexed by code minus MULTI.
 * Each entry is a 0-terminated Rune string.
 * NOTE(review): the consumers of this table are outside this
 * file; presumably a code in the MULTI range is printed as the
 * corresponding string -- confirm against the callers.
 */
190 Rune *multitab[Nmulti] = {
191 [MAAS-MULTI]	L"ʽα",
192 [MALN-MULTI]	L"ʼα",
193 [MAND-MULTI]	L"and",
194 [MAOQ-MULTI]	L"a/q",
195 [MBRA-MULTI]	L"<|",
196 [MDD-MULTI]	L"..",
197 [MDDD-MULTI]	L"...",
198 [MEAS-MULTI]	L"ʽε",
199 [MELN-MULTI]	L"ʼε",
200 [MEMM-MULTI]	L"——",
201 [MHAS-MULTI]	L"ʽη",
202 [MHLN-MULTI]	L"ʼη",
203 [MIAS-MULTI]	L"ʽι",
204 [MILN-MULTI]	L"ʼι",
205 [MLCT-MULTI]	L"ct",
206 [MLFF-MULTI]	L"ff",
207 [MLFFI-MULTI]	L"ffi",
208 [MLFFL-MULTI]	L"ffl",
209 [MLFL-MULTI]	L"fl",
210 [MLFI-MULTI]	L"fi",
211 [MLLS-MULTI]	L"ɫɫ",
212 [MLST-MULTI]	L"st",
213 [MOAS-MULTI]	L"ʽο",
214 [MOLN-MULTI]	L"ʼο",
215 [MOR-MULTI]	L"or",
216 [MRAS-MULTI]	L"ʽρ",
217 [MRLN-MULTI]	L"ʼρ",
218 [MTT-MULTI]	L"~~",
219 [MUAS-MULTI]	L"ʽυ",
220 [MULN-MULTI]	L"ʼυ",
221 [MWAS-MULTI]	L"ʽω",
222 [MWLN-MULTI]	L"ʼω",
223 [MOE-MULTI]	L"oe",
224 [MES-MULTI]	L"  ",
225 };
226 
/*
 * Rune classification helpers used by the folding code.
 * risupper and rtolower handle only ASCII A-Z; rislatin1 tests
 * for the range 0xC0-0xFF covered by latin_fold_tab.
 * The range tests evaluate their argument twice.
 */
#define	risupper(r)	((r) >= L'A' && (r) <= L'Z')
#define	rislatin1(r)	((r) >= 0xC0 && (r) <= 0xFF)
#define	rtolower(r)	((r)+('a'-'A'))
230 
/*
 * Folding table for runes 0xC0..0xFF, indexed by rune - 0xc0.
 * Used by the folding code (fold, foldre); a 0 entry means the
 * rune has no ASCII equivalent and is left unchanged.
 */
231 static Rune latin_fold_tab[] =
232 {
233 /*	Table to fold latin 1 characters to ASCII equivalents
234 			based at Rune value 0xc0
235 
236 	 À    Á    Â    Ã    Ä    Å    Æ    Ç
237 	 È    É    Ê    Ë    Ì    Í    Î    Ï
238 	 Ð    Ñ    Ò    Ó    Ô    Õ    Ö    ×
239 	 Ø    Ù    Ú    Û    Ü    Ý    Þ    ß
240 	 à    á    â    ã    ä    å    æ    ç
241 	 è    é    ê    ë    ì    í    î    ï
242 	 ð    ñ    ò    ó    ô    õ    ö    ÷
243 	 ø    ù    ú    û    ü    ý    þ    ÿ
244 */
245 	'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
246 	'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
247 	'd', 'n', 'o', 'o', 'o', 'o', 'o',  0 ,
248 	'o', 'u', 'u', 'u', 'u', 'y',  0 ,  0 ,
249 	'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
250 	'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
251 	'd', 'n', 'o', 'o', 'o', 'o', 'o',  0 ,
252 	'o', 'u', 'u', 'u', 'u', 'y',  0 , 'y',
253 };
254 
/*
 * Stack of saved translation tables, maintained by changett.
 * ntt is the number of entries currently on the stack.
 */
255 static Rune 	*ttabstack[20];
256 static int	ntt;
257 
258 /*
259  * tab is an array of n Assoc's, sorted by key.
260  * Look for key in tab, and return corresponding val
261  * or -1 if not there
262  */
263 long
264 lookassoc(Assoc *tab, int n, char *key)
265 {
266 	Assoc *q;
267 	long i, low, high;
268 	int r;
269 
270 	for(low = -1, high = n; high > low+1; ){
271 		i = (high+low)/2;
272 		q = &tab[i];
273 		if((r=strcmp(key, q->key))<0)
274 			high = i;
275 		else if(r == 0)
276 			return q->val;
277 		else
278 			low=i;
279 	}
280 	return -1;
281 }
282 
283 long
284 looknassoc(Nassoc *tab, int n, long key)
285 {
286 	Nassoc *q;
287 	long i, low, high;
288 
289 	for(low = -1, high = n; high > low+1; ){
290 		i = (high+low)/2;
291 		q = &tab[i];
292 		if(key < q->key)
293 			high = i;
294 		else if(key == q->key)
295 			return q->val;
296 		else
297 			low=i;
298 	}
299 	return -1;
300 }
301 
302 void
303 err(char *fmt, ...)
304 {
305 	char buf[1000];
306 	va_list v;
307 
308 	va_start(v, fmt);
309 	vsnprint(buf, sizeof(buf), fmt, v);
310 	va_end(v);
311 	fprint(2, "%s: %s\n", argv0, buf);
312 }
313 
314 /*
315  * Write the rune r to bout, keeping track of line length
316  * and breaking the lines (at blanks) when they get too long
317  */
318 void
319 outrune(long r)
320 {
321 	if(outinhibit)
322 		return;
323 	if(++linelen > breaklen && r == L' ') {
324 		Bputc(bout, '\n');
325 		linelen = 0;
326 	} else
327 		Bputrune(bout, r);
328 }
329 
330 void
331 outrunes(Rune *rp)
332 {
333 	Rune r;
334 
335 	while((r = *rp++) != 0)
336 		outrune(r);
337 }
338 
339 /* like outrune, but when arg is know to be a char */
340 void
341 outchar(int c)
342 {
343 	if(outinhibit)
344 		return;
345 	if(++linelen > breaklen && c == ' ') {
346 		c ='\n';
347 		linelen = 0;
348 	}
349 	Bputc(bout, c);
350 }
351 
/*
 * Emit each byte of the NUL-terminated string s via outchar.
 */
void
outchars(char *s)
{
	for(; *s != 0; s++)
		outchar(*s);
}
360 
/*
 * Print-style formatted output through outchars, so the text
 * takes part in line-length tracking and breaking.
 * The formatted text is limited to a 1000-byte buffer.
 */
void
outprint(char *fmt, ...)
{
	va_list args;
	char text[1000];

	va_start(args, fmt);
	vsnprint(text, sizeof text, fmt, args);
	va_end(args);

	outchars(text);
}
372 
/*
 * Emit the bytes in [b, e) via outchar, turning newlines into
 * blanks and squeezing each run of blanks down to one.
 */
void
outpiece(char *b, char *e)
{
	int ch, prev;

	for(prev = 0; b < e; b++){
		ch = *b;
		if(ch == '\n')
			ch = ' ';
		/* skip a blank that directly follows another blank */
		if(ch != ' ' || prev != ' ')
			outchar(ch);
		prev = ch;
	}
}
388 
389 /*
390  * Go to new line if not already there; indent if ind != 0.
391  * If ind > 1, leave a blank line too.
392  * Slight hack: assume if current line is only one or two
393  * characters long, then they were spaces.
394  */
395 void
396 outnl(int ind)
397 {
398 	if(outinhibit)
399 		return;
400 	if(ind) {
401 		if(ind > 1) {
402 			if(linelen > 2)
403 				Bputc(bout, '\n');
404 			Bprint(bout, "\n  ");
405 		} else if(linelen == 0)
406 			Bprint(bout, "  ");
407 		else if(linelen == 1)
408 			Bputc(bout, ' ');
409 		else if(linelen != 2)
410 			Bprint(bout, "\n  ");
411 		linelen = 2;
412 	} else {
413 		if(linelen) {
414 			Bputc(bout, '\n');
415 			linelen = 0;
416 		}
417 	}
418 }
419 
420 /*
421  * Fold the runes in null-terminated rp.
422  * Use the sort(1) definition of folding (uppercase to lowercase,
423  * latin1-accented characters to corresponding unaccented chars)
424  */
425 void
426 fold(Rune *rp)
427 {
428 	Rune r;
429 
430 	while((r = *rp) != 0) {
431 		if (rislatin1(r) && latin_fold_tab[r-0xc0])
432 				r = latin_fold_tab[r-0xc0];
433 		if(risupper(r))
434 			r = rtolower(r);
435 		*rp++ = r;
436 	}
437 }
438 
439 /*
440  * Like fold, but put folded result into new
441  * (assumed to have enough space).
442  * old is a regular expression, but we know that
443  * metacharacters aren't affected
444  */
445 void
446 foldre(char *new, char *old)
447 {
448 	Rune r;
449 
450 	while(*old) {
451 		old += chartorune(&r, old);
452 		if (rislatin1(r) && latin_fold_tab[r-0xc0])
453 				r = latin_fold_tab[r-0xc0];
454 		if(risupper(r))
455 			r = rtolower(r);
456 		new += runetochar(new, &r);
457 	}
458 	*new = 0;
459 }
460 
461 /*
462  *	acomp(s, t) returns:
463  *		-2 if s strictly precedes t
464  *		-1 if s is a prefix of t
465  *		0 if s is the same as t
466  *		1 if t is a prefix of s
467  *		2 if t strictly precedes s
468  */
469 
470 int
471 acomp(Rune *s, Rune *t)
472 {
473 	int cs, ct;
474 
475 	for(;;) {
476 		cs = *s;
477 		ct = *t;
478 		if(cs != ct)
479 			break;
480 		if(cs == 0)
481 			return 0;
482 		s++;
483 		t++;
484 	}
485 	if(cs == 0)
486 		return -1;
487 	if(ct == 0)
488 		return 1;
489 	if(cs < ct)
490 		return -2;
491 	return 2;
492 }
493 
494 /*
495  * Copy null terminated Runes from 'from' to 'to'.
496  */
497 void
498 runescpy(Rune *to, Rune *from)
499 {
500 	while((*to++ = *from++) != 0)
501 		continue;
502 }
503 
504 /*
505  * Conversion of unsigned number to long, no overflow detection
506  */
507 long
508 runetol(Rune *r)
509 {
510 	int c;
511 	long n;
512 
513 	n = 0;
514 	for(;; r++){
515 		c = *r;
516 		if(L'0'<=c && c<=L'9')
517 			c -= '0';
518 		else
519 			break;
520 		n = n*10 + c;
521 	}
522 	return n;
523 }
524 
525 /*
526  * See if there is a rune corresponding to the accented
527  * version of r with accent acc (acc in [LIGS..LIGE-1]),
528  * and return it if so, else return NONE.
529  */
530 Rune
531 liglookup(Rune acc, Rune r)
532 {
533 	Rune *p;
534 
535 	if(acc < LIGS || acc >= LIGE)
536 		return NONE;
537 	for(p = ligtab[acc-LIGS].pairs; *p; p += 2)
538 		if(*p == r)
539 			return *(p+1);
540 	return NONE;
541 }
542 
543 /*
544  * Maintain a translation table stack (a translation table
545  * is an array of Runes indexed by bytes or 7-bit bytes).
546  * If starting is true, push the curtab onto the stack
547  * and return newtab; else pop the top of the stack and
548  * return it.
549  * If curtab is 0, initialize the stack and return.
550  */
551 Rune *
552 changett(Rune *curtab, Rune *newtab, int starting)
553 {
554 	if(curtab == 0) {
555 		ntt = 0;
556 		return 0;
557 	}
558 	if(starting) {
559 		if(ntt >= asize(ttabstack)) {
560 			if(debug)
561 				err("translation stack overflow");
562 			return curtab;
563 		}
564 		ttabstack[ntt++] = curtab;
565 		return newtab;
566 	} else {
567 		if(ntt == 0) {
568 			if(debug)
569 				err("translation stack underflow");
570 			return curtab;
571 		}
572 		return ttabstack[--ntt];
573 	}
574 }
575