xref: /plan9/sys/src/cmd/dict/utils.c (revision 219b2ee8daee37f4aad58d63f21287faa8e4ffdc)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <stdarg.h>
5 #include "dict.h"
6 
/*
 * Table of the dictionaries known to the program.
 * Each entry gives, in order: a short name, a descriptive title,
 * two file paths, and three routines.
 * NOTE(review): judging by the values, the paths are the data file
 * and its index file, and the routines are the next-offset,
 * print-entry and print-key handlers; the Dict declaration is in
 * dict.h -- confirm there.
 * Several entries share one data file and differ only in the index
 * file (e.g. ce/ceh, je/jek, movie/moviea/movied).
 * The table ends with an all-zero entry.
 */
7 Dict dicts[] = {
8 	{"oed",		"Oxford English Dictionary, 2nd Ed.",
9 	 "/lib/dict/oed2",	"/lib/dict/oed2index",
10 	 oednextoff,	oedprintentry,		oedprintkey},
11 	{"ahd",		"American Heritage Dictionary, 2nd College Ed.",
12 	 "/lib/ahd/DICT.DB",	"/lib/ahd/index",
13 	 ahdnextoff,	ahdprintentry,		ahdprintkey},
14 	{"thesaurus",	"Collins Thesaurus",
15 	 "/lib/dict/thesaurus",	"/lib/dict/thesindex",
16 	 thesnextoff,	thesprintentry,	thesprintkey},
17 
18 	{"ce",		"Gendai Chinese->English",
19 	 "/lib/dict/world/sansdata/sandic24.dat",
20 	 "/lib/dict/world/sansdata/ceindex",
21 	 worldnextoff,	worldprintentry,	worldprintkey},
22 	{"ceh",		"Gendai Chinese->English (Hanzi index)",
23 	 "/lib/dict/world/sansdata/sandic24.dat",
24 	 "/lib/dict/world/sansdata/cehindex",
25 	 worldnextoff,	worldprintentry,	worldprintkey},
26 	{"ec",		"Gendai English->Chinese",
27 	 "/lib/dict/world/sansdata/sandic24.dat",
28 	 "/lib/dict/world/sansdata/ecindex",
29 	 worldnextoff,	worldprintentry,	worldprintkey},
30 
31 	{"dae",		"Gyldendal Danish->English",
32 	 "/lib/dict/world/gylddata/sandic30.dat",
33 	 "/lib/dict/world/gylddata/daeindex",
34 	 worldnextoff,	worldprintentry,	worldprintkey},
35 	{"eda",		"Gyldendal English->Danish",
36 	 "/lib/dict/world/gylddata/sandic29.dat",
37 	 "/lib/dict/world/gylddata/edaindex",
38 	 worldnextoff,	worldprintentry,	worldprintkey},
39 
40 	{"due",		"Wolters-Noordhoff Dutch->English",
41 	 "/lib/dict/world/woltdata/sandic07.dat",
42 	 "/lib/dict/world/woltdata/deindex",
43 	 worldnextoff,	worldprintentry,	worldprintkey},
44 	{"edu",		"Wolters-Noordhoff English->Dutch",
45 	 "/lib/dict/world/woltdata/sandic06.dat",
46 	 "/lib/dict/world/woltdata/edindex",
47 	 worldnextoff,	worldprintentry,	worldprintkey},
48 
49 	{"fie",		"WSOY Finnish->English",
50 	 "/lib/dict/world/werndata/sandic32.dat",
51 	 "/lib/dict/world/werndata/fieindex",
52 	 worldnextoff,	worldprintentry,	worldprintkey},
53 	{"efi",		"WSOY English->Finnish",
54 	 "/lib/dict/world/werndata/sandic31.dat",
55 	 "/lib/dict/world/werndata/efiindex",
56 	 worldnextoff,	worldprintentry,	worldprintkey},
57 
58 	{"fe",		"Collins French->English",
59 	 "/lib/dict/fe",	"/lib/dict/feindex",
60 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
61 	{"ef",		"Collins English->French",
62 	 "/lib/dict/ef",	"/lib/dict/efindex",
63 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
64 
65 	{"ge",		"Collins German->English",
66 	 "/lib/dict/ge",	"/lib/dict/geindex",
67 	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
68 	{"eg",		"Collins English->German",
69 	 "/lib/dict/eg",	"/lib/dict/egindex",
70 	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
71 
72 	{"ie",		"Collins Italian->English",
73 	 "/lib/dict/ie",	"/lib/dict/ieindex",
74 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
75 	{"ei",		"Collins English->Italian",
76 	 "/lib/dict/ei",	"/lib/dict/eiindex",
77 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
78 
79 	{"je",		"Sanshusha Japanese->English",
80 	 "/lib/dict/world/sansdata/sandic18.dat",
81 	 "/lib/dict/world/sansdata/jeindex",
82 	 worldnextoff,	worldprintentry,	worldprintkey},
83 	{"jek",		"Sanshusha Japanese->English (Kanji index)",
84 	 "/lib/dict/world/sansdata/sandic18.dat",
85 	 "/lib/dict/world/sansdata/jekindex",
86 	 worldnextoff,	worldprintentry,	worldprintkey},
87 	{"ej",		"Sanshusha English->Japanese",
88 	 "/lib/dict/world/sansdata/sandic18.dat",
89 	 "/lib/dict/world/sansdata/ejindex",
90 	 worldnextoff,	worldprintentry,	worldprintkey},
91 
92 	{"tjeg",	"Sanshusha technical Japanese->English,German",
93 	 "/lib/dict/world/sansdata/sandic16.dat",
94 	 "/lib/dict/world/sansdata/tjegindex",
95 	 worldnextoff,	worldprintentry,	worldprintkey},
96 	{"tjegk",	"Sanshusha technical Japanese->English,German (Kanji index)",
97 	 "/lib/dict/world/sansdata/sandic16.dat",
98 	 "/lib/dict/world/sansdata/tjegkindex",
99 	 worldnextoff,	worldprintentry,	worldprintkey},
100 	{"tegj",	"Sanshusha technical English->German,Japanese",
101 	 "/lib/dict/world/sansdata/sandic16.dat",
102 	 "/lib/dict/world/sansdata/tegjindex",
103 	 worldnextoff,	worldprintentry,	worldprintkey},
104 	{"tgje",	"Sanshusha technical German->Japanese,English",
105 	 "/lib/dict/world/sansdata/sandic16.dat",
106 	 "/lib/dict/world/sansdata/tgjeindex",
107 	 worldnextoff,	worldprintentry,	worldprintkey},
108 
109 	{"ne",		"Kunnskapforlaget Norwegian->English",
110 	 "/lib/dict/world/kunndata/sandic28.dat",
111 	 "/lib/dict/world/kunndata/neindex",
112 	 worldnextoff,	worldprintentry,	worldprintkey},
113 	{"en",		"Kunnskapforlaget English->Norwegian",
114 	 "/lib/dict/world/kunndata/sandic27.dat",
115 	 "/lib/dict/world/kunndata/enindex",
116 	 worldnextoff,	worldprintentry,	worldprintkey},
117 
118 	{"re",		"Leon Ungier Russian->English",
119 	 "/lib/dict/re",	"/lib/dict/reindex",
120 	 simplenextoff,	simpleprintentry,	simpleprintkey},
121 	{"er",		"Leon Ungier English->Russian",
122 	 "/lib/dict/re",	"/lib/dict/erindex",
123 	 simplenextoff,	simpleprintentry,	simpleprintkey},
124 
125 	{"se",		"Collins Spanish->English",
126 	 "/lib/dict/se",	"/lib/dict/seindex",
127 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
128 	{"es",		"Collins English->Spanish",
129 	 "/lib/dict/es",	"/lib/dict/esindex",
130 	 pcollnextoff,	pcollprintentry,	pcollprintkey},
131 
132 	{"swe",		"Esselte Studium Swedish->English",
133 	 "/lib/dict/world/essedata/sandic34.dat",
134 	 "/lib/dict/world/essedata/sweindex",
135 	 worldnextoff,	worldprintentry,	worldprintkey},
136 	{"esw",		"Esselte Studium English->Swedish",
137 	 "/lib/dict/world/essedata/sandic33.dat",
138 	 "/lib/dict/world/essedata/eswindex",
139 	 worldnextoff,	worldprintentry,	worldprintkey},
140 
141 	{"movie",	"Movies -- by title",
142 	 "/lib/movie/data",	"/lib/dict/movtindex",
143 	 movienextoff,	movieprintentry,	movieprintkey},
144 	{"moviea",	"Movies -- by actor",
145 	 "/lib/movie/data",	"/lib/dict/movaindex",
146 	 movienextoff,	movieprintentry,	movieprintkey},
147 	{"movied",	"Movies -- by director",
148 	 "/lib/movie/data",	"/lib/dict/movdindex",
149 	 movienextoff,	movieprintentry,	movieprintkey},
150 
151 	{"slang",	"English Slang",
152 	 "/lib/dict/slang",	"/lib/dict/slangindex",
153 	 slangnextoff,	slangprintentry,	slangprintkey},
154 
155 	{"robert",	"Robert Électronique",
156 	 "/lib/dict/robert/_pointers",	"/lib/dict/robert/_index",
157 	 robertnextoff,	robertindexentry,	robertprintkey},
158 	{"robertv",	"Robert Électronique - formes des verbes",
159 	 "/lib/dict/robert/flex.rob",	"/lib/dict/robert/_flexindex",
160 	 robertnextflex,	robertflexentry,	robertprintkey},
161 
162 	{0, 0, 0, 0, 0}	/* all-zero entry marks the end of the table */
163 };
164 
/*
 * One accent and the precomposed characters available for it.
 * ligtab below holds one Lig per accent; liglookup searches pairs.
 */
165 typedef struct Lig Lig;
166 struct Lig {
167 	Rune	start;		/* the accent rune itself */
168 	Rune	*pairs;		/* 0-terminated string of <char,accented version> pairs */
169 };
170 
/*
 * Accent table, indexed by (accent code - LIGS); see liglookup.
 * Each pairs string alternates a base character with its accented
 * form.  An empty string means no accented forms are listed for
 * that accent, so liglookup finds nothing for it.
 */
171 static Lig ligtab[Nligs] = {
172 [LACU-LIGS]	{L'´',	L"AÁaáCĆcćEÉeégģIÍiíıíLĹlĺNŃnńOÓoóRŔrŕSŚsśUÚuúYÝyýZŹzź"},
173 [LGRV-LIGS]	{L'ˋ',	L"AÀaàEÈeèIÌiìıìOÒoòUÙuù"},
174 [LUML-LIGS]	{L'¨',	L"AÄaäEËeëIÏiïOÖoöUÜuüYŸyÿ"},
175 [LCED-LIGS]	{L'¸',	L"CÇcçGĢKĶkķLĻlļNŅnņRŖrŗSŞsşTŢtţ"},
176 [LTIL-LIGS]	{L'˜',	L"AÃaãIĨiĩıĩNÑnñOÕoõUŨuũ"},
177 [LBRV-LIGS]	{L'˘',	L"AĂaăEĔeĕGĞgğIĬiĭıĭOŎoŏUŬuŭ"},
178 [LRNG-LIGS]	{L'˚',	L"AÅaåUŮuů"},
179 [LDOT-LIGS]	{L'˙',	L"CĊcċEĖeėGĠgġIİLĿlŀZŻzż"},
180 [LDTB-LIGS]	{L'.',	L""},
181 [LFRN-LIGS]	{L'⌢',	L"AÂaâCĈcĉEÊeêGĜgĝHĤhĥIÎiîıîJĴjĵOÔoôSŜsŝUÛuûWŴwŵYŶyŷ"},
182 [LFRB-LIGS]	{L'̯',	L""},
183 [LOGO-LIGS]	{L'˛',	L"AĄaąEĘeęIĮiįıįUŲuų"},
184 [LMAC-LIGS]	{L'¯',	L"AĀaāEĒeēIĪiīıīOŌoōUŪuū"},
185 [LHCK-LIGS]	{L'ˇ',	L"CČcčDĎdďEĚeěLĽlľNŇnňRŘrřSŠsšTŤtťZŽzž"},
186 [LASP-LIGS]	{L'ʽ',	L""},
187 [LLEN-LIGS]	{L'ʼ',	L""},
188 [LBRB-LIGS]	{L'̮',	L""}
189 };
190 
/*
 * Rune strings indexed by (code - MULTI).
 * NOTE(review): presumably the multi-rune expansions that the
 * dictionary printers emit for the M* codes declared in dict.h;
 * nothing in this file reads the table -- confirm against callers.
 */
191 Rune *multitab[Nmulti] = {
192 [MAAS-MULTI]	L"ʽα",
193 [MALN-MULTI]	L"ʼα",
194 [MAND-MULTI]	L"and",
195 [MAOQ-MULTI]	L"a/q",
196 [MBRA-MULTI]	L"<|",
197 [MDD-MULTI]	L"..",
198 [MDDD-MULTI]	L"...",
199 [MEAS-MULTI]	L"ʽε",
200 [MELN-MULTI]	L"ʼε",
201 [MEMM-MULTI]	L"——",
202 [MHAS-MULTI]	L"ʽη",
203 [MHLN-MULTI]	L"ʼη",
204 [MIAS-MULTI]	L"ʽι",
205 [MILN-MULTI]	L"ʼι",
206 [MLCT-MULTI]	L"ct",
207 [MLFF-MULTI]	L"ff",
208 [MLFFI-MULTI]	L"ffi",
209 [MLFFL-MULTI]	L"ffl",
210 [MLFL-MULTI]	L"fl",
211 [MLFI-MULTI]	L"fi",
212 [MLLS-MULTI]	L"ɫɫ",
213 [MLST-MULTI]	L"st",
214 [MOAS-MULTI]	L"ʽο",
215 [MOLN-MULTI]	L"ʼο",
216 [MOR-MULTI]	L"or",
217 [MRAS-MULTI]	L"ʽρ",
218 [MRLN-MULTI]	L"ʼρ",
219 [MTT-MULTI]	L"~~",
220 [MUAS-MULTI]	L"ʽυ",
221 [MULN-MULTI]	L"ʼυ",
222 [MWAS-MULTI]	L"ʽω",
223 [MWLN-MULTI]	L"ʼω",
224 [MOE-MULTI]	L"oe",
225 [MES-MULTI]	L"  ",
226 };
227 
/*
 * Helpers for fold and foldre:
 *	risupper(r)	true when r is an ASCII capital 'A'..'Z'
 *	rislatin1(r)	true when r is in 0xC0..0xFF, the range
 *			covered by latin_fold_tab
 *	rtolower(r)	ASCII lower-case form of r; only meaningful
 *			when risupper(r) holds
 * Each macro may evaluate its argument more than once.
 */
228 #define	risupper(r)	(L'A' <= (r) && (r) <= L'Z')
229 #define	rislatin1(r)	(0xC0 <= (r) && (r) <= 0xFF)
230 #define	rtolower(r)	((r)-'A'+'a')
231 
/*
 * Folding table used by fold and foldre, indexed by (rune - 0xc0).
 * A 0 entry means the character has no fold and is left unchanged.
 */
232 static Rune latin_fold_tab[] =
233 {
234 /*	Table to fold latin 1 characters to ASCII equivalents
235 			based at Rune value 0xc0
236 
237 	 À    Á    Â    Ã    Ä    Å    Æ    Ç
238 	 È    É    Ê    Ë    Ì    Í    Î    Ï
239 	 Ð    Ñ    Ò    Ó    Ô    Õ    Ö    ×
240 	 Ø    Ù    Ú    Û    Ü    Ý    Þ    ß
241 	 à    á    â    ã    ä    å    æ    ç
242 	 è    é    ê    ë    ì    í    î    ï
243 	 ð    ñ    ò    ó    ô    õ    ö    ÷
244 	 ø    ù    ú    û    ü    ý    þ    ÿ
245 */
246 	'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
247 	'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
248 	'd', 'n', 'o', 'o', 'o', 'o', 'o',  0 ,
249 	'o', 'u', 'u', 'u', 'u', 'y',  0 ,  0 ,
250 	'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
251 	'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
252 	'd', 'n', 'o', 'o', 'o', 'o', 'o',  0 ,
253 	'o', 'u', 'u', 'u', 'u', 'y',  0 , 'y',
254 };
255 
/*
 * State for changett: ttabstack holds the saved translation
 * tables and ntt is the number of entries currently in use.
 */
256 static Rune 	*ttabstack[20];
257 static int	ntt;
258 
259 /*
260  * tab is an array of n Assoc's, sorted by key.
261  * Look for key in tab, and return corresponding val
262  * or -1 if not there
263  */
264 long
265 lookassoc(Assoc *tab, int n, char *key)
266 {
267 	Assoc *q;
268 	long i, low, high;
269 	int r;
270 
271 	for(low = -1, high = n; high > low+1; ){
272 		i = (high+low)/2;
273 		q = &tab[i];
274 		if((r=strcmp(key, q->key))<0)
275 			high = i;
276 		else if(r == 0)
277 			return q->val;
278 		else
279 			low=i;
280 	}
281 	return -1;
282 }
283 
284 long
285 looknassoc(Nassoc *tab, int n, long key)
286 {
287 	Nassoc *q;
288 	long i, low, high;
289 
290 	for(low = -1, high = n; high > low+1; ){
291 		i = (high+low)/2;
292 		q = &tab[i];
293 		if(key < q->key)
294 			high = i;
295 		else if(key == q->key)
296 			return q->val;
297 		else
298 			low=i;
299 	}
300 	return -1;
301 }
302 
303 void
304 err(char *fmt, ...)
305 {
306 	char buf[1000];
307 	va_list v;
308 
309 	va_start(v, fmt);
310 	doprint(buf, &buf[1000], fmt, v);
311 	va_end(v);
312 	fprint(2, "%s: %s\n", argv0, buf);
313 }
314 
315 /*
316  * Write the rune r to bout, keeping track of line length
317  * and breaking the lines (at blanks) when they get too long
318  */
319 void
320 outrune(long r)
321 {
322 	if(outinhibit)
323 		return;
324 	if(++linelen > breaklen && r == L' ') {
325 		Bputc(bout, '\n');
326 		linelen = 0;
327 	} else
328 		Bputrune(bout, r);
329 }
330 
331 void
332 outrunes(Rune *rp)
333 {
334 	Rune r;
335 
336 	while((r = *rp++) != 0)
337 		outrune(r);
338 }
339 
340 /* like outrune, but when arg is know to be a char */
341 void
342 outchar(int c)
343 {
344 	if(outinhibit)
345 		return;
346 	if(++linelen > breaklen && c == ' ') {
347 		c ='\n';
348 		linelen = 0;
349 	}
350 	BPUTC(bout, c);
351 }
352 
/*
 * Pass each char of the NUL-terminated string s to outchar.
 */
void
outchars(char *s)
{
	for(; *s != 0; s++)
		outchar(*s);
}
361 
/*
 * Print-style output through outchars, so line breaking applies.
 * The text is formatted into a 1000-byte buffer first.
 */
void
outprint(char *fmt, ...)
{
	char text[1000];
	va_list ap;

	va_start(ap, fmt);
	doprint(text, text + sizeof text, fmt, ap);
	va_end(ap);
	outchars(text);
}
373 
/*
 * Output the chars in [b, e) through outchar, turning each
 * newline into a blank and squeezing runs of blanks to one.
 */
void
outpiece(char *b, char *e)
{
	int cur, prev;

	for(prev = 0; b < e; b++, prev = cur){
		cur = *b;
		if(cur == '\n')
			cur = ' ';
		/* drop a blank that directly follows another blank */
		if(cur != ' ' || prev != ' ')
			outchar(cur);
	}
}
389 
390 /*
391  * Go to new line if not already there; indent if ind != 0.
392  * If ind > 1, leave a blank line too.
393  * Slight hack: assume if current line is only one or two
394  * characters long, then they were spaces.
395  */
396 void
397 outnl(int ind)
398 {
399 	if(outinhibit)
400 		return;
401 	if(ind) {
402 		if(ind > 1) {
403 			if(linelen > 2)
404 				Bputc(bout, '\n');
405 			Bprint(bout, "\n  ");
406 		} else if(linelen == 0)
407 			Bprint(bout, "  ");
408 		else if(linelen == 1)
409 			Bputc(bout, ' ');
410 		else if(linelen != 2)
411 			Bprint(bout, "\n  ");
412 		linelen = 2;
413 	} else {
414 		if(linelen) {
415 			Bputc(bout, '\n');
416 			linelen = 0;
417 		}
418 	}
419 }
420 
421 /*
422  * Fold the runes in null-terminated rp.
423  * Use the sort(1) definition of folding (uppercase to lowercase,
424  * latin1-accented characters to corresponding unaccented chars)
425  */
426 void
427 fold(Rune *rp)
428 {
429 	Rune r;
430 
431 	while((r = *rp) != 0) {
432 		if (rislatin1(r) && latin_fold_tab[r-0xc0])
433 				r = latin_fold_tab[r-0xc0];
434 		if(risupper(r))
435 			r = rtolower(r);
436 		*rp++ = r;
437 	}
438 }
439 
440 /*
441  * Like fold, but put folded result into new
442  * (assumed to have enough space).
443  * old is a regular expression, but we know that
444  * metacharacters aren't affected
445  */
446 void
447 foldre(char *new, char *old)
448 {
449 	Rune r;
450 
451 	while(*old) {
452 		old += chartorune(&r, old);
453 		if (rislatin1(r) && latin_fold_tab[r-0xc0])
454 				r = latin_fold_tab[r-0xc0];
455 		if(risupper(r))
456 			r = rtolower(r);
457 		new += runetochar(new, &r);
458 	}
459 	*new = 0;
460 }
461 
462 /*
463  *	acomp(s, t) returns:
464  *		-2 if s strictly precedes t
465  *		-1 if s is a prefix of t
466  *		0 if s is the same as t
467  *		1 if t is a prefix of s
468  *		2 if t strictly precedes s
469  */
470 
471 int
472 acomp(Rune *s, Rune *t)
473 {
474 	int cs, ct;
475 
476 	for(;;) {
477 		cs = *s;
478 		ct = *t;
479 		if(cs != ct)
480 			break;
481 		if(cs == 0)
482 			return 0;
483 		s++;
484 		t++;
485 	}
486 	if(cs == 0)
487 		return -1;
488 	if(ct == 0)
489 		return 1;
490 	if(cs < ct)
491 		return -2;
492 	return 2;
493 }
494 
495 /*
496  * Copy null terminated Runes from 'from' to 'to'.
497  */
498 void
499 runescpy(Rune *to, Rune *from)
500 {
501 	while((*to++ = *from++) != 0)
502 		continue;
503 }
504 
505 /*
506  * Conversion of unsigned number to long, no overflow detection
507  */
508 long
509 runetol(Rune *r)
510 {
511 	int c;
512 	long n;
513 
514 	n = 0;
515 	for(;; r++){
516 		c = *r;
517 		if(L'0'<=c && c<=L'9')
518 			c -= '0';
519 		else
520 			break;
521 		n = n*10 + c;
522 	}
523 	return n;
524 }
525 
526 /*
527  * See if there is a rune corresponding to the accented
528  * version of r with accent acc (acc in [LIGS..LIGE-1]),
529  * and return it if so, else return NONE.
530  */
531 Rune
532 liglookup(Rune acc, Rune r)
533 {
534 	Rune *p;
535 
536 	if(acc < LIGS || acc >= LIGE)
537 		return NONE;
538 	for(p = ligtab[acc-LIGS].pairs; *p; p += 2)
539 		if(*p == r)
540 			return *(p+1);
541 	return NONE;
542 }
543 
544 /*
545  * Maintain a translation table stack (a translation table
546  * is an array of Runes indexed by bytes or 7-bit bytes).
547  * If starting is true, push the curtab onto the stack
548  * and return newtab; else pop the top of the stack and
549  * return it.
550  * If curtab is 0, initialize the stack and return.
551  */
552 Rune *
553 changett(Rune *curtab, Rune *newtab, int starting)
554 {
555 	if(curtab == 0) {
556 		ntt = 0;
557 		return 0;
558 	}
559 	if(starting) {
560 		if(ntt >= asize(ttabstack)) {
561 			if(debug)
562 				err("translation stack overflow");
563 			return curtab;
564 		}
565 		ttabstack[ntt++] = curtab;
566 		return newtab;
567 	} else {
568 		if(ntt == 0) {
569 			if(debug)
570 				err("translation stack underflow");
571 			return curtab;
572 		}
573 		return ttabstack[--ntt];
574 	}
575 }
576