xref: /plan9/sys/src/cmd/dict/utils.c (revision ff8c3af2f44d95267f67219afa20ba82ff6cf7e4)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include "dict.h"
5 
/*
 * Table of the dictionaries this program knows about.
 * Each entry lists, in order: a short name, a descriptive title,
 * two file paths (the second is always named as an index file),
 * and three handler functions (*nextoff, *printentry, *printkey).
 * NOTE(review): the precise meaning of each member comes from the
 * Dict declaration in dict.h, which is not visible here -- confirm there.
 * Several entries share one data file and differ only in the index
 * (ce/ceh/ec, je/jek/ej, the tjeg family, re/er, movie/moviea/movied).
 * The table is terminated by an all-zero entry.
 */
6 Dict dicts[] = {
7 	{"oed",		"Oxford English Dictionary, 2nd Ed.",
8 	 "/lib/dict/oed2",	"/lib/dict/oed2index",
9 	 oednextoff,	oedprintentry,		oedprintkey},
10 	{"ahd",		"American Heritage Dictionary, 2nd College Ed.",
11 	 "/lib/ahd/DICT.DB",	"/lib/ahd/index",
12 	 ahdnextoff,	ahdprintentry,		ahdprintkey},
13 	{"pgw",		"Project Gutenberg Webster Dictionary",
14 	 "/lib/dict/pgw",	"/lib/dict/pgwindex",
15 	 pgwnextoff,	pgwprintentry,		pgwprintkey},
16 	{"thesaurus",	"Collins Thesaurus",
17 	 "/lib/dict/thesaurus",	"/lib/dict/thesindex",
18 	 thesnextoff,	thesprintentry,	thesprintkey},
19 
20 	{"ce",		"Gendai Chinese->English",
21 	 "/lib/dict/world/sansdata/sandic24.dat",
22 	 "/lib/dict/world/sansdata/ceindex",
23 	 worldnextoff,	worldprintentry,	worldprintkey},
24 	{"ceh",		"Gendai Chinese->English (Hanzi index)",
25 	 "/lib/dict/world/sansdata/sandic24.dat",
26 	 "/lib/dict/world/sansdata/cehindex",
27 	 worldnextoff,	worldprintentry,	worldprintkey},
28 	{"ec",		"Gendai English->Chinese",
29 	 "/lib/dict/world/sansdata/sandic24.dat",
30 	 "/lib/dict/world/sansdata/ecindex",
31 	 worldnextoff,	worldprintentry,	worldprintkey},
32 
33 	{"dae",		"Gyldendal Danish->English",
34 	 "/lib/dict/world/gylddata/sandic30.dat",
35 	 "/lib/dict/world/gylddata/daeindex",
36 	 worldnextoff,	worldprintentry,	worldprintkey},
37 	{"eda",		"Gyldendal English->Danish",
38 	 "/lib/dict/world/gylddata/sandic29.dat",
39 	 "/lib/dict/world/gylddata/edaindex",
40 	 worldnextoff,	worldprintentry,	worldprintkey},
41 
42 	{"due",		"Wolters-Noordhoff Dutch->English",
43 	 "/lib/dict/world/woltdata/sandic07.dat",
44 	 "/lib/dict/world/woltdata/deindex",
45 	 worldnextoff,	worldprintentry,	worldprintkey},
46 	{"edu",		"Wolters-Noordhoff English->Dutch",
47 	 "/lib/dict/world/woltdata/sandic06.dat",
48 	 "/lib/dict/world/woltdata/edindex",
49 	 worldnextoff,	worldprintentry,	worldprintkey},
50 
51 	{"fie",		"WSOY Finnish->English",
52 	 "/lib/dict/world/werndata/sandic32.dat",
53 	 "/lib/dict/world/werndata/fieindex",
54 	 worldnextoff,	worldprintentry,	worldprintkey},
55 	{"efi",		"WSOY English->Finnish",
56 	 "/lib/dict/world/werndata/sandic31.dat",
57 	 "/lib/dict/world/werndata/efiindex",
58 	 worldnextoff,	worldprintentry,	worldprintkey},
59 
60 	{"fe",		"Collins French->English",
61 	 "/lib/dict/fe",	"/lib/dict/feindex",
62 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
63 	{"ef",		"Collins English->French",
64 	 "/lib/dict/ef",	"/lib/dict/efindex",
65 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
66 
67 	{"ge",		"Collins German->English",
68 	 "/lib/dict/ge",	"/lib/dict/geindex",
69 	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
70 	{"eg",		"Collins English->German",
71 	 "/lib/dict/eg",	"/lib/dict/egindex",
72 	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
73 
74 	{"ie",		"Collins Italian->English",
75 	 "/lib/dict/ie",	"/lib/dict/ieindex",
76 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
77 	{"ei",		"Collins English->Italian",
78 	 "/lib/dict/ei",	"/lib/dict/eiindex",
79 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
80 
81 	{"je",		"Sanshusha Japanese->English",
82 	 "/lib/dict/world/sansdata/sandic18.dat",
83 	 "/lib/dict/world/sansdata/jeindex",
84 	 worldnextoff,	worldprintentry,	worldprintkey},
85 	{"jek",		"Sanshusha Japanese->English (Kanji index)",
86 	 "/lib/dict/world/sansdata/sandic18.dat",
87 	 "/lib/dict/world/sansdata/jekindex",
88 	 worldnextoff,	worldprintentry,	worldprintkey},
89 	{"ej",		"Sanshusha English->Japanese",
90 	 "/lib/dict/world/sansdata/sandic18.dat",
91 	 "/lib/dict/world/sansdata/ejindex",
92 	 worldnextoff,	worldprintentry,	worldprintkey},
93 
94 	{"tjeg",	"Sanshusha technical Japanese->English,German",
95 	 "/lib/dict/world/sansdata/sandic16.dat",
96 	 "/lib/dict/world/sansdata/tjegindex",
97 	 worldnextoff,	worldprintentry,	worldprintkey},
98 	{"tjegk",	"Sanshusha technical Japanese->English,German (Kanji index)",
99 	 "/lib/dict/world/sansdata/sandic16.dat",
100 	 "/lib/dict/world/sansdata/tjegkindex",
101 	 worldnextoff,	worldprintentry,	worldprintkey},
102 	{"tegj",	"Sanshusha technical English->German,Japanese",
103 	 "/lib/dict/world/sansdata/sandic16.dat",
104 	 "/lib/dict/world/sansdata/tegjindex",
105 	 worldnextoff,	worldprintentry,	worldprintkey},
106 	{"tgje",	"Sanshusha technical German->Japanese,English",
107 	 "/lib/dict/world/sansdata/sandic16.dat",
108 	 "/lib/dict/world/sansdata/tgjeindex",
109 	 worldnextoff,	worldprintentry,	worldprintkey},
110 
111 	{"ne",		"Kunnskapforlaget Norwegian->English",
112 	 "/lib/dict/world/kunndata/sandic28.dat",
113 	 "/lib/dict/world/kunndata/neindex",
114 	 worldnextoff,	worldprintentry,	worldprintkey},
115 	{"en",		"Kunnskapforlaget English->Norwegian",
116 	 "/lib/dict/world/kunndata/sandic27.dat",
117 	 "/lib/dict/world/kunndata/enindex",
118 	 worldnextoff,	worldprintentry,	worldprintkey},
119 
120 	{"re",		"Leon Ungier Russian->English",
121 	 "/lib/dict/re",	"/lib/dict/reindex",
122 	 simplenextoff,	simpleprintentry,	simpleprintkey},
123 	{"er",		"Leon Ungier English->Russian",
124 	 "/lib/dict/re",	"/lib/dict/erindex",
125 	 simplenextoff,	simpleprintentry,	simpleprintkey},
126 
127 	{"se",		"Collins Spanish->English",
128 	 "/lib/dict/se",	"/lib/dict/seindex",
129 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
130 	{"es",		"Collins English->Spanish",
131 	 "/lib/dict/es",	"/lib/dict/esindex",
132 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
133 
134 	{"swe",		"Esselte Studium Swedish->English",
135 	 "/lib/dict/world/essedata/sandic34.dat",
136 	 "/lib/dict/world/essedata/sweindex",
137 	 worldnextoff,	worldprintentry,	worldprintkey},
138 	{"esw",		"Esselte Studium English->Swedish",
139 	 "/lib/dict/world/essedata/sandic33.dat",
140 	 "/lib/dict/world/essedata/eswindex",
141 	 worldnextoff,	worldprintentry,	worldprintkey},
142 
143 	{"movie",	"Movies -- by title",
144 	 "/lib/movie/data",	"/lib/dict/movtindex",
145 	 movienextoff,	movieprintentry,	movieprintkey},
146 	{"moviea",	"Movies -- by actor",
147 	 "/lib/movie/data",	"/lib/dict/movaindex",
148 	 movienextoff,	movieprintentry,	movieprintkey},
149 	{"movied",	"Movies -- by director",
150 	 "/lib/movie/data",	"/lib/dict/movdindex",
151 	 movienextoff,	movieprintentry,	movieprintkey},
152 
153 	{"slang",	"English Slang",
154 	 "/lib/dict/slang",	"/lib/dict/slangindex",
155 	 slangnextoff,	slangprintentry,	slangprintkey},
156 
157 	{"robert",	"Robert Électronique",
158 	 "/lib/dict/robert/_pointers",	"/lib/dict/robert/_index",
159 	 robertnextoff,	robertindexentry,	robertprintkey},
160 	{"robertv",	"Robert Électronique - formes des verbes",
161 	 "/lib/dict/robert/flex.rob",	"/lib/dict/robert/_flexindex",
162 	 robertnextflex,	robertflexentry,	robertprintkey},
163 
164 	{0, 0, 0, 0, 0}	/* end-of-table sentinel; members not listed are implicitly zero */
165 };
166 
/*
 * One accent and the precomposed characters it can form.
 * pairs is scanned two runes at a time by liglookup(), which
 * stops at the first zero rune.
 */
167 typedef struct Lig Lig;
168 struct Lig {
169 	Rune	start;		/* accent rune */
170 	Rune	*pairs;		/* <char,accented version> pairs */
171 };
172 
/*
 * Accent table, indexed by accent code minus LIGS
 * (liglookup() reads it as ligtab[acc-LIGS]).
 * Each pairs string is a flat list of <base rune, accented rune>
 * couples; an empty string means no accented forms are listed
 * for that accent, so every lookup on it yields NONE.
 */
173 static Lig ligtab[Nligs] = {
174 [LACU-LIGS]	{L'´',	L"AÁaáCĆcćEÉeégģIÍiíıíLĹlĺNŃnńOÓoóRŔrŕSŚsśUÚuúYÝyýZŹzź"},
175 [LGRV-LIGS]	{L'ˋ',	L"AÀaàEÈeèIÌiìıìOÒoòUÙuù"},
176 [LUML-LIGS]	{L'¨',	L"AÄaäEËeëIÏiïOÖoöUÜuüYŸyÿ"},
177 [LCED-LIGS]	{L'¸',	L"CÇcçGĢKĶkķLĻlļNŅnņRŖrŗSŞsşTŢtţ"},
178 [LTIL-LIGS]	{L'˜',	L"AÃaãIĨiĩıĩNÑnñOÕoõUŨuũ"},
179 [LBRV-LIGS]	{L'˘',	L"AĂaăEĔeĕGĞgğIĬiĭıĭOŎoŏUŬuŭ"},
180 [LRNG-LIGS]	{L'˚',	L"AÅaåUŮuů"},
181 [LDOT-LIGS]	{L'˙',	L"CĊcċEĖeėGĠgġIİLĿlŀZŻzż"},
182 [LDTB-LIGS]	{L'.',	L""},
183 [LFRN-LIGS]	{L'⌢',	L"AÂaâCĈcĉEÊeêGĜgĝHĤhĥIÎiîıîJĴjĵOÔoôSŜsŝUÛuûWŴwŵYŶyŷ"},
184 [LFRB-LIGS]	{L'̯',	L""},
185 [LOGO-LIGS]	{L'˛',	L"AĄaąEĘeęIĮiįıįUŲuų"},
186 [LMAC-LIGS]	{L'¯',	L"AĀaāEĒeēIĪiīıīOŌoōUŪuū"},
187 [LHCK-LIGS]	{L'ˇ',	L"CČcčDĎdďEĚeěLĽlľNŇnňRŘrřSŠsšTŤtťZŽzž"},
188 [LASP-LIGS]	{L'ʽ',	L""},
189 [LLEN-LIGS]	{L'ʼ',	L""},
190 [LBRB-LIGS]	{L'̮',	L""}
191 };
192 
/*
 * Rune strings for codes that expand to more than one rune,
 * indexed by code minus MULTI.
 * NOTE(review): nothing in this part of the file reads the table;
 * presumably the per-dictionary printers do -- confirm against callers.
 */
193 Rune *multitab[Nmulti] = {
194 [MAAS-MULTI]	L"ʽα",
195 [MALN-MULTI]	L"ʼα",
196 [MAND-MULTI]	L"and",
197 [MAOQ-MULTI]	L"a/q",
198 [MBRA-MULTI]	L"<|",
199 [MDD-MULTI]	L"..",
200 [MDDD-MULTI]	L"...",
201 [MEAS-MULTI]	L"ʽε",
202 [MELN-MULTI]	L"ʼε",
203 [MEMM-MULTI]	L"——",
204 [MHAS-MULTI]	L"ʽη",
205 [MHLN-MULTI]	L"ʼη",
206 [MIAS-MULTI]	L"ʽι",
207 [MILN-MULTI]	L"ʼι",
208 [MLCT-MULTI]	L"ct",
209 [MLFF-MULTI]	L"ff",
210 [MLFFI-MULTI]	L"ffi",
211 [MLFFL-MULTI]	L"ffl",
212 [MLFL-MULTI]	L"fl",
213 [MLFI-MULTI]	L"fi",
214 [MLLS-MULTI]	L"ɫɫ",
215 [MLST-MULTI]	L"st",
216 [MOAS-MULTI]	L"ʽο",
217 [MOLN-MULTI]	L"ʼο",
218 [MOR-MULTI]	L"or",
219 [MRAS-MULTI]	L"ʽρ",
220 [MRLN-MULTI]	L"ʼρ",
221 [MTT-MULTI]	L"~~",
222 [MUAS-MULTI]	L"ʽυ",
223 [MULN-MULTI]	L"ʼυ",
224 [MWAS-MULTI]	L"ʽω",
225 [MWLN-MULTI]	L"ʼω",
226 [MOE-MULTI]	L"oe",
227 [MES-MULTI]	L"  ",
228 };
229 
/*
 * Rune classification helpers used by fold() and foldre().
 * Each macro may evaluate its argument more than once, so pass
 * only side-effect-free expressions.
 */
/* true when r is one of the ASCII capitals A..Z */
#define	risupper(r)	((r) >= L'A' && (r) <= L'Z')
/* true when r is in 0xC0..0xFF, the range latin_fold_tab covers */
#define	rislatin1(r)	((r) >= 0xC0 && (r) <= 0xFF)
/* shift an ASCII capital to its lowercase form; caller checks risupper first */
#define	rtolower(r)	((r)+('a'-'A'))
233 
/*
 * Consulted by fold() and foldre() as latin_fold_tab[r-0xc0] for
 * runes in 0xC0..0xFF.  A zero entry means "no replacement": the
 * rune is left as it is.
 */
234 static Rune latin_fold_tab[] =
235 {
236 /*	Table to fold latin 1 characters to ASCII equivalents
237 			based at Rune value 0xc0
238 
239 	 À    Á    Â    Ã    Ä    Å    Æ    Ç
240 	 È    É    Ê    Ë    Ì    Í    Î    Ï
241 	 Ð    Ñ    Ò    Ó    Ô    Õ    Ö    ×
242 	 Ø    Ù    Ú    Û    Ü    Ý    Þ    ß
243 	 à    á    â    ã    ä    å    æ    ç
244 	 è    é    ê    ë    ì    í    î    ï
245 	 ð    ñ    ò    ó    ô    õ    ö    ÷
246 	 ø    ù    ú    û    ü    ý    þ    ÿ
247 */
248 	'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
249 	'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
250 	'd', 'n', 'o', 'o', 'o', 'o', 'o',  0 ,
251 	'o', 'u', 'u', 'u', 'u', 'y',  0 ,  0 ,
252 	'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
253 	'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
254 	'd', 'n', 'o', 'o', 'o', 'o', 'o',  0 ,
255 	'o', 'u', 'u', 'u', 'u', 'y',  0 , 'y',
256 };
257 
/* State for changett(): saved translation tables and how many are stacked. */
258 static Rune 	*ttabstack[20];	/* saved tables; changett() refuses to push beyond this size */
259 static int	ntt;		/* number of entries currently in ttabstack */
260 
261 /*
262  * tab is an array of n Assoc's, sorted by key.
263  * Look for key in tab, and return corresponding val
264  * or -1 if not there
265  */
266 long
267 lookassoc(Assoc *tab, int n, char *key)
268 {
269 	Assoc *q;
270 	long i, low, high;
271 	int r;
272 
273 	for(low = -1, high = n; high > low+1; ){
274 		i = (high+low)/2;
275 		q = &tab[i];
276 		if((r=strcmp(key, q->key))<0)
277 			high = i;
278 		else if(r == 0)
279 			return q->val;
280 		else
281 			low=i;
282 	}
283 	return -1;
284 }
285 
286 long
287 looknassoc(Nassoc *tab, int n, long key)
288 {
289 	Nassoc *q;
290 	long i, low, high;
291 
292 	for(low = -1, high = n; high > low+1; ){
293 		i = (high+low)/2;
294 		q = &tab[i];
295 		if(key < q->key)
296 			high = i;
297 		else if(key == q->key)
298 			return q->val;
299 		else
300 			low=i;
301 	}
302 	return -1;
303 }
304 
305 void
306 err(char *fmt, ...)
307 {
308 	char buf[1000];
309 	va_list v;
310 
311 	va_start(v, fmt);
312 	vsnprint(buf, sizeof(buf), fmt, v);
313 	va_end(v);
314 	fprint(2, "%s: %s\n", argv0, buf);
315 }
316 
317 /*
318  * Write the rune r to bout, keeping track of line length
319  * and breaking the lines (at blanks) when they get too long
320  */
321 void
322 outrune(long r)
323 {
324 	if(outinhibit)
325 		return;
326 	if(++linelen > breaklen && r == L' ') {
327 		Bputc(bout, '\n');
328 		linelen = 0;
329 	} else
330 		Bputrune(bout, r);
331 }
332 
333 void
334 outrunes(Rune *rp)
335 {
336 	Rune r;
337 
338 	while((r = *rp++) != 0)
339 		outrune(r);
340 }
341 
342 /* like outrune, but when arg is know to be a char */
343 void
344 outchar(int c)
345 {
346 	if(outinhibit)
347 		return;
348 	if(++linelen > breaklen && c == ' ') {
349 		c ='\n';
350 		linelen = 0;
351 	}
352 	Bputc(bout, c);
353 }
354 
/*
 * Pass each byte of the NUL-terminated string s to outchar.
 */
void
outchars(char *s)
{
	for(; *s != 0; s++)
		outchar(*s);
}
363 
/*
 * Print-style formatted output routed through outchars, so it
 * gets the same line-length accounting as the other out* calls.
 * The formatted text is limited to what fits in a 1000-byte buffer.
 */
void
outprint(char *fmt, ...)
{
	va_list args;
	char text[1000];

	va_start(args, fmt);
	vsnprint(text, sizeof text, fmt, args);
	va_end(args);
	outchars(text);
}
375 
/*
 * Output the bytes in [b, e) through outchar.
 * Newlines are turned into blanks, and a blank that directly
 * follows another blank (after that conversion) is skipped.
 */
void
outpiece(char *b, char *e)
{
	int c, prev;

	for(prev = 0; b < e; b++){
		c = *b;
		if(c == '\n')
			c = ' ';
		if(c != ' ' || prev != ' ')
			outchar(c);
		prev = c;
	}
}
391 
392 /*
393  * Go to new line if not already there; indent if ind != 0.
394  * If ind > 1, leave a blank line too.
395  * Slight hack: assume if current line is only one or two
396  * characters long, then they were spaces.
397  */
398 void
399 outnl(int ind)
400 {
401 	if(outinhibit)
402 		return;
403 	if(ind) {
404 		if(ind > 1) {
405 			if(linelen > 2)
406 				Bputc(bout, '\n');
407 			Bprint(bout, "\n  ");
408 		} else if(linelen == 0)
409 			Bprint(bout, "  ");
410 		else if(linelen == 1)
411 			Bputc(bout, ' ');
412 		else if(linelen != 2)
413 			Bprint(bout, "\n  ");
414 		linelen = 2;
415 	} else {
416 		if(linelen) {
417 			Bputc(bout, '\n');
418 			linelen = 0;
419 		}
420 	}
421 }
422 
423 /*
424  * Fold the runes in null-terminated rp.
425  * Use the sort(1) definition of folding (uppercase to lowercase,
426  * latin1-accented characters to corresponding unaccented chars)
427  */
428 void
429 fold(Rune *rp)
430 {
431 	Rune r;
432 
433 	while((r = *rp) != 0) {
434 		if (rislatin1(r) && latin_fold_tab[r-0xc0])
435 				r = latin_fold_tab[r-0xc0];
436 		if(risupper(r))
437 			r = rtolower(r);
438 		*rp++ = r;
439 	}
440 }
441 
442 /*
443  * Like fold, but put folded result into new
444  * (assumed to have enough space).
445  * old is a regular expression, but we know that
446  * metacharacters aren't affected
447  */
448 void
449 foldre(char *new, char *old)
450 {
451 	Rune r;
452 
453 	while(*old) {
454 		old += chartorune(&r, old);
455 		if (rislatin1(r) && latin_fold_tab[r-0xc0])
456 				r = latin_fold_tab[r-0xc0];
457 		if(risupper(r))
458 			r = rtolower(r);
459 		new += runetochar(new, &r);
460 	}
461 	*new = 0;
462 }
463 
464 /*
465  *	acomp(s, t) returns:
466  *		-2 if s strictly precedes t
467  *		-1 if s is a prefix of t
468  *		0 if s is the same as t
469  *		1 if t is a prefix of s
470  *		2 if t strictly precedes s
471  */
472 
473 int
474 acomp(Rune *s, Rune *t)
475 {
476 	int cs, ct;
477 
478 	for(;;) {
479 		cs = *s;
480 		ct = *t;
481 		if(cs != ct)
482 			break;
483 		if(cs == 0)
484 			return 0;
485 		s++;
486 		t++;
487 	}
488 	if(cs == 0)
489 		return -1;
490 	if(ct == 0)
491 		return 1;
492 	if(cs < ct)
493 		return -2;
494 	return 2;
495 }
496 
497 /*
498  * Copy null terminated Runes from 'from' to 'to'.
499  */
500 void
501 runescpy(Rune *to, Rune *from)
502 {
503 	while((*to++ = *from++) != 0)
504 		continue;
505 }
506 
507 /*
508  * Conversion of unsigned number to long, no overflow detection
509  */
510 long
511 runetol(Rune *r)
512 {
513 	int c;
514 	long n;
515 
516 	n = 0;
517 	for(;; r++){
518 		c = *r;
519 		if(L'0'<=c && c<=L'9')
520 			c -= '0';
521 		else
522 			break;
523 		n = n*10 + c;
524 	}
525 	return n;
526 }
527 
528 /*
529  * See if there is a rune corresponding to the accented
530  * version of r with accent acc (acc in [LIGS..LIGE-1]),
531  * and return it if so, else return NONE.
532  */
533 Rune
534 liglookup(Rune acc, Rune r)
535 {
536 	Rune *p;
537 
538 	if(acc < LIGS || acc >= LIGE)
539 		return NONE;
540 	for(p = ligtab[acc-LIGS].pairs; *p; p += 2)
541 		if(*p == r)
542 			return *(p+1);
543 	return NONE;
544 }
545 
546 /*
547  * Maintain a translation table stack (a translation table
548  * is an array of Runes indexed by bytes or 7-bit bytes).
549  * If starting is true, push the curtab onto the stack
550  * and return newtab; else pop the top of the stack and
551  * return it.
552  * If curtab is 0, initialize the stack and return.
553  */
554 Rune *
555 changett(Rune *curtab, Rune *newtab, int starting)
556 {
557 	if(curtab == 0) {
558 		ntt = 0;
559 		return 0;
560 	}
561 	if(starting) {
562 		if(ntt >= asize(ttabstack)) {
563 			if(debug)
564 				err("translation stack overflow");
565 			return curtab;
566 		}
567 		ttabstack[ntt++] = curtab;
568 		return newtab;
569 	} else {
570 		if(ntt == 0) {
571 			if(debug)
572 				err("translation stack underflow");
573 			return curtab;
574 		}
575 		return ttabstack[--ntt];
576 	}
577 }
578