xref: /plan9-contrib/sys/src/cmd/dict/utils.c (revision 4d44ba9b9ee4246ddbd96c7fcaf0918ab92ab35a)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include "dict.h"
5 
/*
 * Table of the dictionaries this program knows about.
 * Each entry gives a short name, a descriptive title, the path of
 * the data file, the path of the index file, and the three
 * dictionary-specific routines (next-offset, print-entry, print-key).
 * NOTE(review): the field order is assumed to match the Dict
 * declaration in dict.h -- confirm there.
 * The table ends with an entry whose leading fields are zero; the
 * fields not listed in that initializer are zeroed by the compiler.
 */
Dict dicts[] = {
	{"oed",		"Oxford English Dictionary, 2nd Ed.",
	 "/lib/dict/oed2",	"/lib/dict/oed2index",
	 oednextoff,	oedprintentry,		oedprintkey},
	{"ahd",		"American Heritage Dictionary, 2nd College Ed.",
	 "/lib/ahd/DICT.DB",	"/lib/ahd/index",
	 ahdnextoff,	ahdprintentry,		ahdprintkey},
	{"pgw",		"Project Gutenberg Webster Dictionary",
	 "/lib/dict/pgw",	"/lib/dict/pgwindex",
	 pgwnextoff,	pgwprintentry,		pgwprintkey},
	{"thesaurus",	"Collins Thesaurus",
	 "/lib/dict/thesaurus",	"/lib/dict/thesindex",
	 thesnextoff,	thesprintentry,	thesprintkey},
	{"roget",		"Project Gutenberg Roget's Thesaurus",
	 "/lib/dict/roget", "/lib/dict/rogetindex",
	 rogetnextoff,	rogetprintentry,	rogetprintkey},

	{"ce",		"Gendai Chinese->English",
	 "/lib/dict/world/sansdata/sandic24.dat",
	 "/lib/dict/world/sansdata/ceindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"ceh",		"Gendai Chinese->English (Hanzi index)",
	 "/lib/dict/world/sansdata/sandic24.dat",
	 "/lib/dict/world/sansdata/cehindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"ec",		"Gendai English->Chinese",
	 "/lib/dict/world/sansdata/sandic24.dat",
	 "/lib/dict/world/sansdata/ecindex",
	 worldnextoff,	worldprintentry,	worldprintkey},

	{"dae",		"Gyldendal Danish->English",
	 "/lib/dict/world/gylddata/sandic30.dat",
	 "/lib/dict/world/gylddata/daeindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"eda",		"Gyldendal English->Danish",
	 "/lib/dict/world/gylddata/sandic29.dat",
	 "/lib/dict/world/gylddata/edaindex",
	 worldnextoff,	worldprintentry,	worldprintkey},

	{"due",		"Wolters-Noordhoff Dutch->English",
	 "/lib/dict/world/woltdata/sandic07.dat",
	 "/lib/dict/world/woltdata/deindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"edu",		"Wolters-Noordhoff English->Dutch",
	 "/lib/dict/world/woltdata/sandic06.dat",
	 "/lib/dict/world/woltdata/edindex",
	 worldnextoff,	worldprintentry,	worldprintkey},

	{"fie",		"WSOY Finnish->English",
	 "/lib/dict/world/werndata/sandic32.dat",
	 "/lib/dict/world/werndata/fieindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"efi",		"WSOY English->Finnish",
	 "/lib/dict/world/werndata/sandic31.dat",
	 "/lib/dict/world/werndata/efiindex",
	 worldnextoff,	worldprintentry,	worldprintkey},

	{"fe",		"Collins French->English",
	 "/lib/dict/fe",	"/lib/dict/feindex",
	 pcollnextoff,	pcollprintentry,	pcollprintkey},
	{"ef",		"Collins English->French",
	 "/lib/dict/ef",	"/lib/dict/efindex",
	 pcollnextoff,	pcollprintentry,	pcollprintkey},

	{"ge",		"Collins German->English",
	 "/lib/dict/ge",	"/lib/dict/geindex",
	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
	{"eg",		"Collins English->German",
	 "/lib/dict/eg",	"/lib/dict/egindex",
	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},

	{"ie",		"Collins Italian->English",
	 "/lib/dict/ie",	"/lib/dict/ieindex",
	 pcollnextoff,	pcollprintentry,	pcollprintkey},
	{"ei",		"Collins English->Italian",
	 "/lib/dict/ei",	"/lib/dict/eiindex",
	 pcollnextoff,	pcollprintentry,	pcollprintkey},

	{"je",		"Sanshusha Japanese->English",
	 "/lib/dict/world/sansdata/sandic18.dat",
	 "/lib/dict/world/sansdata/jeindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"jek",		"Sanshusha Japanese->English (Kanji index)",
	 "/lib/dict/world/sansdata/sandic18.dat",
	 "/lib/dict/world/sansdata/jekindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"ej",		"Sanshusha English->Japanese",
	 "/lib/dict/world/sansdata/sandic18.dat",
	 "/lib/dict/world/sansdata/ejindex",
	 worldnextoff,	worldprintentry,	worldprintkey},

	{"tjeg",	"Sanshusha technical Japanese->English,German",
	 "/lib/dict/world/sansdata/sandic16.dat",
	 "/lib/dict/world/sansdata/tjegindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"tjegk",	"Sanshusha technical Japanese->English,German (Kanji index)",
	 "/lib/dict/world/sansdata/sandic16.dat",
	 "/lib/dict/world/sansdata/tjegkindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"tegj",	"Sanshusha technical English->German,Japanese",
	 "/lib/dict/world/sansdata/sandic16.dat",
	 "/lib/dict/world/sansdata/tegjindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"tgje",	"Sanshusha technical German->Japanese,English",
	 "/lib/dict/world/sansdata/sandic16.dat",
	 "/lib/dict/world/sansdata/tgjeindex",
	 worldnextoff,	worldprintentry,	worldprintkey},

	{"ne",		"Kunnskapforlaget Norwegian->English",
	 "/lib/dict/world/kunndata/sandic28.dat",
	 "/lib/dict/world/kunndata/neindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"en",		"Kunnskapforlaget English->Norwegian",
	 "/lib/dict/world/kunndata/sandic27.dat",
	 "/lib/dict/world/kunndata/enindex",
	 worldnextoff,	worldprintentry,	worldprintkey},

	{"re",		"Leon Ungier Russian->English",
	 "/lib/dict/re",	"/lib/dict/reindex",
	 simplenextoff,	simpleprintentry,	simpleprintkey},
	{"er",		"Leon Ungier English->Russian",
	 "/lib/dict/re",	"/lib/dict/erindex",
	 simplenextoff,	simpleprintentry,	simpleprintkey},

	{"se",		"Collins Spanish->English",
	 "/lib/dict/se",	"/lib/dict/seindex",
	 pcollnextoff,	pcollprintentry,	pcollprintkey},
	{"es",		"Collins English->Spanish",
	 "/lib/dict/es",	"/lib/dict/esindex",
	 pcollnextoff,	pcollprintentry,	pcollprintkey},

	{"swe",		"Esselte Studium Swedish->English",
	 "/lib/dict/world/essedata/sandic34.dat",
	 "/lib/dict/world/essedata/sweindex",
	 worldnextoff,	worldprintentry,	worldprintkey},
	{"esw",		"Esselte Studium English->Swedish",
	 "/lib/dict/world/essedata/sandic33.dat",
	 "/lib/dict/world/essedata/eswindex",
	 worldnextoff,	worldprintentry,	worldprintkey},

	{"movie",	"Movies -- by title",
	 "/lib/movie/data",	"/lib/dict/movtindex",
	 movienextoff,	movieprintentry,	movieprintkey},
	{"moviea",	"Movies -- by actor",
	 "/lib/movie/data",	"/lib/dict/movaindex",
	 movienextoff,	movieprintentry,	movieprintkey},
	{"movied",	"Movies -- by director",
	 "/lib/movie/data",	"/lib/dict/movdindex",
	 movienextoff,	movieprintentry,	movieprintkey},

	{"slang",	"English Slang",
	 "/lib/dict/slang",	"/lib/dict/slangindex",
	 slangnextoff,	slangprintentry,	slangprintkey},

	{"robert",	"Robert Électronique",
	 "/lib/dict/robert/_pointers",	"/lib/dict/robert/_index",
	 robertnextoff,	robertindexentry,	robertprintkey},
	{"robertv",	"Robert Électronique - formes des verbes",
	 "/lib/dict/robert/flex.rob",	"/lib/dict/robert/_flexindex",
	 robertnextflex,	robertflexentry,	robertprintkey},

	/* end-of-table marker */
	{0, 0, 0, 0, 0}
};
169 
170 typedef struct Lig Lig;
171 struct Lig {
172 	Rune	start;		/* accent rune */
173 	Rune	*pairs;		/* <char,accented version> pairs */
174 };
175 
/*
 * Accent table, indexed by accent code minus LIGS.
 * Each row holds the rune for the accent and a string of
 * <unaccented, accented> rune pairs; liglookup walks the pairs
 * two runes at a time, so every string must have even length.
 * An empty string means no accented forms are listed for that accent.
 */
static Lig ligtab[Nligs] = {
[LACU-LIGS]	{L'´',	L"AÁaáCĆcćEÉeégģIÍiíıíLĹlĺNŃnńOÓoóRŔrŕSŚsśUÚuúYÝyýZŹzź"},
[LGRV-LIGS]	{L'ˋ',	L"AÀaàEÈeèIÌiìıìOÒoòUÙuù"},
[LUML-LIGS]	{L'¨',	L"AÄaäEËeëIÏiïOÖoöUÜuüYŸyÿ"},
[LCED-LIGS]	{L'¸',	L"CÇcçGĢKĶkķLĻlļNŅnņRŖrŗSŞsşTŢtţ"},
[LTIL-LIGS]	{L'˜',	L"AÃaãIĨiĩıĩNÑnñOÕoõUŨuũ"},
[LBRV-LIGS]	{L'˘',	L"AĂaăEĔeĕGĞgğIĬiĭıĭOŎoŏUŬuŭ"},
[LRNG-LIGS]	{L'˚',	L"AÅaåUŮuů"},
[LDOT-LIGS]	{L'˙',	L"CĊcċEĖeėGĠgġIİLĿlŀZŻzż"},
[LDTB-LIGS]	{L'.',	L""},
[LFRN-LIGS]	{L'⌢',	L"AÂaâCĈcĉEÊeêGĜgĝHĤhĥIÎiîıîJĴjĵOÔoôSŜsŝUÛuûWŴwŵYŶyŷ"},
[LFRB-LIGS]	{L'̯',	L""},
[LOGO-LIGS]	{L'˛',	L"AĄaąEĘeęIĮiįıįUŲuų"},
[LMAC-LIGS]	{L'¯',	L"AĀaāEĒeēIĪiīıīOŌoōUŪuū"},
[LHCK-LIGS]	{L'ˇ',	L"CČcčDĎdďEĚeěLĽlľNŇnňRŘrřSŠsšTŤtťZŽzž"},
[LASP-LIGS]	{L'ʽ',	L""},
[LLEN-LIGS]	{L'ʼ',	L""},
[LBRB-LIGS]	{L'̮',	L""}
};
195 
/*
 * Table of multi-rune strings, indexed by code minus MULTI.
 * NOTE(review): presumably each string is the expansion printed
 * for the corresponding multi-character code -- confirm against
 * the callers, which are not in this file.
 */
Rune *multitab[Nmulti] = {
[MAAS-MULTI]	L"ʽα",
[MALN-MULTI]	L"ʼα",
[MAND-MULTI]	L"and",
[MAOQ-MULTI]	L"a/q",
[MBRA-MULTI]	L"<|",
[MDD-MULTI]	L"..",
[MDDD-MULTI]	L"...",
[MEAS-MULTI]	L"ʽε",
[MELN-MULTI]	L"ʼε",
[MEMM-MULTI]	L"——",
[MHAS-MULTI]	L"ʽη",
[MHLN-MULTI]	L"ʼη",
[MIAS-MULTI]	L"ʽι",
[MILN-MULTI]	L"ʼι",
[MLCT-MULTI]	L"ct",
[MLFF-MULTI]	L"ff",
[MLFFI-MULTI]	L"ffi",
[MLFFL-MULTI]	L"ffl",
[MLFL-MULTI]	L"fl",
[MLFI-MULTI]	L"fi",
[MLLS-MULTI]	L"ɫɫ",
[MLST-MULTI]	L"st",
[MOAS-MULTI]	L"ʽο",
[MOLN-MULTI]	L"ʼο",
[MOR-MULTI]	L"or",
[MRAS-MULTI]	L"ʽρ",
[MRLN-MULTI]	L"ʼρ",
[MTT-MULTI]	L"~~",
[MUAS-MULTI]	L"ʽυ",
[MULN-MULTI]	L"ʼυ",
[MWAS-MULTI]	L"ʽω",
[MWLN-MULTI]	L"ʼω",
[MOE-MULTI]	L"oe",
[MES-MULTI]	L"  ",
};
232 
/*
 * Rune classification helpers used when folding.
 * risupper: true for ASCII 'A'..'Z'.
 * rislatin1: true for the Latin-1 letter range 0xC0..0xFF.
 * rtolower: shift an ASCII upper-case letter to lower case;
 * only meaningful when risupper(r) holds.
 * All three evaluate their argument more than once or rely on
 * plain arithmetic, so pass side-effect-free expressions.
 */
#define	risupper(r)	((r) >= L'A' && (r) <= L'Z')
#define	rislatin1(r)	((r) >= 0xC0 && (r) <= 0xFF)
#define	rtolower(r)	((r)+('a'-'A'))
236 
/*
 * Folding table consulted by fold and foldre for runes in the
 * range 0xC0..0xFF; index it with (rune - 0xC0).
 * A zero entry means the rune is left unchanged.
 */
static Rune latin_fold_tab[] =
{
/*	Table to fold latin 1 characters to ASCII equivalents
			based at Rune value 0xc0

	 À    Á    Â    Ã    Ä    Å    Æ    Ç
	 È    É    Ê    Ë    Ì    Í    Î    Ï
	 Ð    Ñ    Ò    Ó    Ô    Õ    Ö    ×
	 Ø    Ù    Ú    Û    Ü    Ý    Þ    ß
	 à    á    â    ã    ä    å    æ    ç
	 è    é    ê    ë    ì    í    î    ï
	 ð    ñ    ò    ó    ô    õ    ö    ÷
	 ø    ù    ú    û    ü    ý    þ    ÿ
*/
	'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
	'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
	'd', 'n', 'o', 'o', 'o', 'o', 'o',  0 ,
	'o', 'u', 'u', 'u', 'u', 'y',  0 ,  0 ,
	'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
	'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
	'd', 'n', 'o', 'o', 'o', 'o', 'o',  0 ,
	'o', 'u', 'u', 'u', 'u', 'y',  0 , 'y',
};
260 
261 static Rune 	*ttabstack[20];
262 static int	ntt;
263 
264 /*
265  * tab is an array of n Assoc's, sorted by key.
266  * Look for key in tab, and return corresponding val
267  * or -1 if not there
268  */
269 long
270 lookassoc(Assoc *tab, int n, char *key)
271 {
272 	Assoc *q;
273 	long i, low, high;
274 	int r;
275 
276 	for(low = -1, high = n; high > low+1; ){
277 		i = (high+low)/2;
278 		q = &tab[i];
279 		if((r=strcmp(key, q->key))<0)
280 			high = i;
281 		else if(r == 0)
282 			return q->val;
283 		else
284 			low=i;
285 	}
286 	return -1;
287 }
288 
289 long
290 looknassoc(Nassoc *tab, int n, long key)
291 {
292 	Nassoc *q;
293 	long i, low, high;
294 
295 	for(low = -1, high = n; high > low+1; ){
296 		i = (high+low)/2;
297 		q = &tab[i];
298 		if(key < q->key)
299 			high = i;
300 		else if(key == q->key)
301 			return q->val;
302 		else
303 			low=i;
304 	}
305 	return -1;
306 }
307 
308 void
309 err(char *fmt, ...)
310 {
311 	char buf[1000];
312 	va_list v;
313 
314 	va_start(v, fmt);
315 	vsnprint(buf, sizeof(buf), fmt, v);
316 	va_end(v);
317 	fprint(2, "%s: %s\n", argv0, buf);
318 }
319 
320 /*
321  * Write the rune r to bout, keeping track of line length
322  * and breaking the lines (at blanks) when they get too long
323  */
324 void
325 outrune(long r)
326 {
327 	if(outinhibit)
328 		return;
329 	if(++linelen > breaklen && r == L' ') {
330 		Bputc(bout, '\n');
331 		linelen = 0;
332 	} else
333 		Bputrune(bout, r);
334 }
335 
336 void
337 outrunes(Rune *rp)
338 {
339 	Rune r;
340 
341 	while((r = *rp++) != 0)
342 		outrune(r);
343 }
344 
345 /* like outrune, but when arg is know to be a char */
346 void
347 outchar(int c)
348 {
349 	if(outinhibit)
350 		return;
351 	if(++linelen > breaklen && c == ' ') {
352 		c ='\n';
353 		linelen = 0;
354 	}
355 	Bputc(bout, c);
356 }
357 
/*
 * Emit each byte of the null-terminated string s via outchar.
 */
void
outchars(char *s)
{
	char *p;

	for(p = s; *p != 0; p++)
		outchar(*p);
}
366 
/*
 * Print-style formatted output: the text is formatted into a
 * fixed 1000-byte buffer and then emitted through outchars, so
 * it takes part in line-length tracking and line breaking.
 */
void
outprint(char *fmt, ...)
{
	va_list ap;
	char text[1000];

	va_start(ap, fmt);
	vsnprint(text, sizeof text, fmt, ap);
	va_end(ap);
	outchars(text);
}
378 
/*
 * Emit the bytes in [b, e) via outchar, treating each newline
 * as a blank and collapsing runs of blanks into a single one.
 */
void
outpiece(char *b, char *e)
{
	char *p;
	int cur, prev;

	prev = 0;
	for(p = b; p < e; p++){
		cur = *p;
		if(cur == '\n')
			cur = ' ';
		/* skip a blank that directly follows another blank */
		if(cur != ' ' || prev != ' ')
			outchar(cur);
		prev = cur;
	}
}
394 
395 /*
396  * Go to new line if not already there; indent if ind != 0.
397  * If ind > 1, leave a blank line too.
398  * Slight hack: assume if current line is only one or two
399  * characters long, then they were spaces.
400  */
401 void
402 outnl(int ind)
403 {
404 	if(outinhibit)
405 		return;
406 	if(ind) {
407 		if(ind > 1) {
408 			if(linelen > 2)
409 				Bputc(bout, '\n');
410 			Bprint(bout, "\n  ");
411 		} else if(linelen == 0)
412 			Bprint(bout, "  ");
413 		else if(linelen == 1)
414 			Bputc(bout, ' ');
415 		else if(linelen != 2)
416 			Bprint(bout, "\n  ");
417 		linelen = 2;
418 	} else {
419 		if(linelen) {
420 			Bputc(bout, '\n');
421 			linelen = 0;
422 		}
423 	}
424 }
425 
426 /*
427  * Fold the runes in null-terminated rp.
428  * Use the sort(1) definition of folding (uppercase to lowercase,
429  * latin1-accented characters to corresponding unaccented chars)
430  */
431 void
432 fold(Rune *rp)
433 {
434 	Rune r;
435 
436 	while((r = *rp) != 0) {
437 		if (rislatin1(r) && latin_fold_tab[r-0xc0])
438 				r = latin_fold_tab[r-0xc0];
439 		if(risupper(r))
440 			r = rtolower(r);
441 		*rp++ = r;
442 	}
443 }
444 
445 /*
446  * Like fold, but put folded result into new
447  * (assumed to have enough space).
448  * old is a regular expression, but we know that
449  * metacharacters aren't affected
450  */
451 void
452 foldre(char *new, char *old)
453 {
454 	Rune r;
455 
456 	while(*old) {
457 		old += chartorune(&r, old);
458 		if (rislatin1(r) && latin_fold_tab[r-0xc0])
459 				r = latin_fold_tab[r-0xc0];
460 		if(risupper(r))
461 			r = rtolower(r);
462 		new += runetochar(new, &r);
463 	}
464 	*new = 0;
465 }
466 
467 /*
468  *	acomp(s, t) returns:
469  *		-2 if s strictly precedes t
470  *		-1 if s is a prefix of t
471  *		0 if s is the same as t
472  *		1 if t is a prefix of s
473  *		2 if t strictly precedes s
474  */
475 
476 int
477 acomp(Rune *s, Rune *t)
478 {
479 	int cs, ct;
480 
481 	for(;;) {
482 		cs = *s;
483 		ct = *t;
484 		if(cs != ct)
485 			break;
486 		if(cs == 0)
487 			return 0;
488 		s++;
489 		t++;
490 	}
491 	if(cs == 0)
492 		return -1;
493 	if(ct == 0)
494 		return 1;
495 	if(cs < ct)
496 		return -2;
497 	return 2;
498 }
499 
500 /*
501  * Copy null terminated Runes from 'from' to 'to'.
502  */
503 void
504 runescpy(Rune *to, Rune *from)
505 {
506 	while((*to++ = *from++) != 0)
507 		continue;
508 }
509 
510 /*
511  * Conversion of unsigned number to long, no overflow detection
512  */
513 long
514 runetol(Rune *r)
515 {
516 	int c;
517 	long n;
518 
519 	n = 0;
520 	for(;; r++){
521 		c = *r;
522 		if(L'0'<=c && c<=L'9')
523 			c -= '0';
524 		else
525 			break;
526 		n = n*10 + c;
527 	}
528 	return n;
529 }
530 
531 /*
532  * See if there is a rune corresponding to the accented
533  * version of r with accent acc (acc in [LIGS..LIGE-1]),
534  * and return it if so, else return NONE.
535  */
536 Rune
537 liglookup(Rune acc, Rune r)
538 {
539 	Rune *p;
540 
541 	if(acc < LIGS || acc >= LIGE)
542 		return NONE;
543 	for(p = ligtab[acc-LIGS].pairs; *p; p += 2)
544 		if(*p == r)
545 			return *(p+1);
546 	return NONE;
547 }
548 
549 /*
550  * Maintain a translation table stack (a translation table
551  * is an array of Runes indexed by bytes or 7-bit bytes).
552  * If starting is true, push the curtab onto the stack
553  * and return newtab; else pop the top of the stack and
554  * return it.
555  * If curtab is 0, initialize the stack and return.
556  */
557 Rune *
558 changett(Rune *curtab, Rune *newtab, int starting)
559 {
560 	if(curtab == 0) {
561 		ntt = 0;
562 		return 0;
563 	}
564 	if(starting) {
565 		if(ntt >= asize(ttabstack)) {
566 			if(debug)
567 				err("translation stack overflow");
568 			return curtab;
569 		}
570 		ttabstack[ntt++] = curtab;
571 		return newtab;
572 	} else {
573 		if(ntt == 0) {
574 			if(debug)
575 				err("translation stack underflow");
576 			return curtab;
577 		}
578 		return ttabstack[--ntt];
579 	}
580 }
581