xref: /plan9/sys/src/cmd/spell/sprog.c (revision b85a83648eec38fe82b6f00adfd7828ceec5ee8d)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include "code.h"
6 
/*
 * fig leaves for possibly signed char quantities:
 * every macro masks its argument with 0xff before handing it to a
 * ctype routine or using it as an index into voweltab.
 */
#define ISUPPER(c)	isupper((c)&0xff)
#define ISLOWER(c)	islower((c)&0xff)
#define	ISALPHA(c)	isalpha((c)&0xff)
#define	ISDIGIT(c)	isdigit((c)&0xff)
#define ISVOWEL(c)	voweltab[(c)&0xff]
/* expands c more than once: do not pass an expression with side effects */
#define Tolower(c)	(ISUPPER(c)? (c)-'A'+'a': (c))
/* combine two character codes into one int: a shifted left 8 bits, or'ed with b */
#define pair(a,b)	(((a)<<8) | (b))
#define DLEV		2	/* trysuff adds this to lev on every call */
#define DSIZ		40	/* deriv[] slots are only written while lev < DSIZ */

typedef	long	Bits;	/* set of flag bits (NOUN, ADJ, STOP, ...) */
/* the bits of f that are also set in h; non-zero if any are */
#define	Set(h, f)	((long)(h) & (f))
20 
/*
 * suffix routines; all share the signature stored in Suftab.p1/p2:
 *	(ep, d, a, lev, flag)
 */
Bits 	nop(char*, char*, char*, int, int);
Bits 	strip(char*, char*, char*, int, int);
Bits 	ize(char*, char*, char*, int, int);
Bits 	i_to_y(char*, char*, char*, int, int);
Bits 	ily(char*, char*, char*, int, int);
Bits 	subst(char*, char*, char*, int, int);
Bits 	CCe(char*, char*, char*, int, int);
Bits 	tion(char*, char*, char*, int, int);
Bits 	an(char*, char*, char*, int, int);
Bits 	s(char*, char*, char*, int, int);
Bits 	es(char*, char*, char*, int, int);
Bits 	bility(char*, char*, char*, int, int);
Bits 	y_to_e(char*, char*, char*, int, int);
Bits 	VCe(char*, char*, char*, int, int);

/* prefix/suffix search drivers and dictionary lookup */
Bits 	trypref(char*, char*, int, int);
Bits	tryword(char*, char*, int, int);
Bits 	trysuff(char*, int, int);
Bits	dict(char*, char*);
void	typeprint(Bits);
void	pcomma(char*);

/* helpers */
void	ise(void);
int	ordinal(void);
char*	skipv(char*);
int	inun(char*, Bits);
char*	ztos(char*);
void	readdict(char*);
49 
/*
 * one prefix: its spelling and a flag word.
 * trypref tests flag&IN to decide whether to consult inun().
 */
typedef	struct	Ptab	Ptab;
struct	Ptab
{
	char*	s;	/* prefix text; a 0 pointer ends a table */
	int	flag;	/* 0 or IN */
};

/*
 * one suffix rule, used by trysuff.
 * p1 is called as (*p1)(ep-n1, d1, a1, lev+1, flag|STOP);
 * if it returns 0 and p2 is set, p2 is called the same way
 * with n2, d2, a2 (and lev instead of lev+1).
 */
typedef	struct	Suftab	Suftab;
struct	Suftab
{
	char	*suf;		/* suffix spelled backwards; 0 ends a table */
	Bits	(*p1)(char*, char*, char*, int, int);	/* first routine to try */
	int	n1;		/* subtracted from ep before calling p1 */
	char	*d1;		/* message passed as d ("-x+y" forms) */
	char	*a1;		/* message passed as a ("+y" forms) */
	int	flag;		/* or'ed with STOP and passed to p1/p2 as flag */
	int	affixable;	/* rule is abandoned unless this shares a bit with the caller's flag */
	Bits	(*p2)(char*, char*, char*, int, int);	/* optional second routine */
	int	n2;		/* as n1, for p2 */
	char	*d2;		/* as d1, for p2 */
	char	*a2;		/* as a1, for p2 */
};
72 
73 Suftab	staba[] = {
74 	{"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
75 	0
76 };
77 
78 Suftab	stabc[] =
79 {
80 	{"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
81 	{"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
82 	{"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
83 	{"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
84 	{"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
85 	{"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
86 	{"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
87 	{"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
88 	{"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
89 	0
90 };
91 Suftab	stabd[] =
92 {
93 	{"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
94 	{"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
95 	0
96 };
97 Suftab	stabe[] =
98 {
99 	/*
100 	 * V_affix for comment ->commence->commentment??
101 	 */
102 	{"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
103 	{"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
104 	{"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
105 	{"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
106 	{"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
107 	{"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
108 	{"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
109 	0
110 };
111 Suftab	stabg[] =
112 {
113 	{"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
114 	{"gnikam",strip,6,"","+making",NOUN,NOUN},
115 	{"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
116 	{"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
117 	0
118 };
119 Suftab	stabl[] =
120 {
121 	{"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
122 	{"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
123 	{"latnem",strip,2,"","+al",N_AFFIX,ADJ},
124 	{"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
125 	{"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
126 	0
127 };
128 Suftab	stabm[] =
129 {
130 		/* congregational + ism */
131 	{"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
132 	{"margo",subst,-1,"-ph+m","",NOUN,NOUN},
133 	0
134 };
135 Suftab	stabn[] =
136 {
137 	{"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
138 	{"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
139 	{"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
140 	{"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
141 	{"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
142 	{"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
143 	{"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
144 	{"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
145 	{"nem",strip,3,"","+man",MAN,PROP_COLLECT},
146 	{"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
147 	0
148 };
149 Suftab	stabp[] =
150 {
151 	{"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
152 	0
153 };
154 Suftab	stabr[] =
155 {
156 	{"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
157 	{"reyhparg",nop,0,"","",0,NOUN},
158 	{"reyl",nop,0,"","",0,NOUN},
159 	{"rekam",strip,5,"","+maker",NOUN,NOUN},
160 	{"repeek",strip,6,"","+keeper",NOUN,NOUN},
161 	{"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ,	i_to_y,2,"-y+ier","+er"},
162 	{"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
163 	{"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
164 	{"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
165 	0
166 };
167 Suftab	stabs[] =
168 {
169 	{"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
170 	{"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
171 	{"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH ,	es,2,"-y+ies","+es"},
172 	{"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
173 	{"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH  },
174 	0
175 };
176 Suftab	stabt[] =
177 {
178 	{"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
179 	{"tse",strip,2,"","+st",EST,DONT_TOUCH,	i_to_y,3,"-y+iest","+est" },
180 	{"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
181 	{"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
182 	0
183 };
184 Suftab	staby[] =
185 {
186 	{"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
187 	{"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
188 	{"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
189 	{"ytisuo",nop,0,"","",NOUN},
190 	{"ytilb",nop,0,"","",0,NOUN},
191 	{"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
192 	{"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
193 	{"ylc",nop,0,"","",0},
194 	{"ylelb",nop,0,"","",0},
195 	{"ylelp",nop,0,"","",0},
196 	{"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
197 	{"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
198 	{"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
199 	0
200 };
201 Suftab	stabz[] =
202 {
203 	0
204 };
205 Suftab*	suftab[] =
206 {
207 	staba,
208 	stabz,
209 	stabc,
210 	stabd,
211 	stabe,
212 	stabz,
213 	stabg,
214 	stabz,
215 	stabz,
216 	stabz,
217 	stabz,
218 	stabl,
219 	stabm,
220 	stabn,
221 	stabz,
222 	stabp,
223 	stabz,
224 	stabr,
225 	stabs,
226 	stabt,
227 	stabz,
228 	stabz,
229 	stabz,
230 	stabz,
231 	staby,
232 	stabz,
233 };
234 
235 Ptab	ptaba[] =
236 {
237 	"anti", 0,
238 	"auto", 0,
239 	0
240 };
241 Ptab	ptabb[] =
242 {
243 	"bio", 0,
244 	0
245 };
246 Ptab	ptabc[] =
247 {
248 	"counter", 0,
249 	0
250 };
251 Ptab	ptabd[] =
252 {
253 	"dis", 0,
254 	0
255 };
256 Ptab	ptabe[] =
257 {
258 	"electro", 0,
259 	0
260 };
261 Ptab	ptabf[] =
262 {
263 	"femto", 0,
264 	0
265 };
266 Ptab	ptabg[] =
267 {
268 	"geo", 0,
269 	"giga", 0,
270 	0
271 };
272 Ptab	ptabh[] =
273 {
274 	"hyper", 0,
275 	0
276 };
277 Ptab	ptabi[] =
278 {
279 	"immuno", 0,
280 	"im", IN,
281 	"intra", 0,
282 	"inter", 0,
283 	"in", IN,
284 	"ir", IN,
285 	"iso", 0,
286 	0
287 };
288 Ptab	ptabj[] =
289 {
290 	0
291 };
292 Ptab	ptabk[] =
293 {
294 	"kilo", 0,
295 	0
296 };
297 Ptab	ptabl[] =
298 {
299 	0
300 };
301 Ptab	ptabm[] =
302 {
303 	"magneto", 0,
304 	"mega", 0,
305 	"meta", 0,
306 	"micro", 0,
307 	"mid", 0,
308 	"milli", 0,
309 	"mini", 0,
310 	"mis", 0,
311 	"mono", 0,
312 	"multi", 0,
313 	0
314 };
315 Ptab	ptabn[] =
316 {
317 	"nano", 0,
318 	"neuro", 0,
319 	"non", 0,
320 	0
321 };
322 Ptab	ptabo[] =
323 {
324 	"out", 0,
325 	"over", 0,
326 	0
327 };
328 Ptab	ptabp[] =
329 {
330 	"para", 0,
331 	"photo", 0,
332 	"pico", 0,
333 	"poly", 0,
334 	"pre", 0,
335 	"pseudo", 0,
336 	"psycho", 0,
337 	0
338 };
339 Ptab	ptabq[] =
340 {
341 	"quasi", 0,
342 	0
343 };
344 Ptab	ptabr[] =
345 {
346 	"radio", 0,
347 	"re", 0,
348 	0
349 };
350 Ptab	ptabs[] =
351 {
352 	"semi", 0,
353 	"stereo", 0,
354 	"sub", 0,
355 	"super", 0,
356 	0
357 };
358 Ptab	ptabt[] =
359 {
360 	"tele", 0,
361 	"tera", 0,
362 	"thermo", 0,
363 	0
364 };
365 Ptab	ptabu[] =
366 {
367 	"ultra", 0,
368 	"under", 0,	/*must precede un*/
369 	"un", IN,
370 	0
371 };
372 Ptab	ptabv[] =
373 {
374 	0
375 };
376 Ptab	ptabw[] =
377 {
378 	0
379 };
380 Ptab	ptabx[] =
381 {
382 	0
383 };
384 Ptab	ptaby[] =
385 {
386 	0
387 };
388 Ptab	ptabz[] =
389 {
390 	0
391 };
392 
393 Ptab*	preftab[] =
394 {
395 	ptaba,
396 	ptabb,
397 	ptabc,
398 	ptabd,
399 	ptabe,
400 	ptabf,
401 	ptabg,
402 	ptabh,
403 	ptabi,
404 	ptabj,
405 	ptabk,
406 	ptabl,
407 	ptabm,
408 	ptabn,
409 	ptabo,
410 	ptabp,
411 	ptabq,
412 	ptabr,
413 	ptabs,
414 	ptabt,
415 	ptabu,
416 	ptabv,
417 	ptabw,
418 	ptabx,
419 	ptaby,
420 	ptabz,
421 };
422 
/*
 * one step of a derivation: the message to print for it and what
 * kind of step it is.  trypref adds PREF to type once per prefix
 * stripped, so type/PREF counts prefixes (see tryword).
 */
typedef struct {
	char *mesg;
	enum { NONE, SUFF, PREF} type;
} Deriv;

int	aflag;		/* -a: input lines carry two ':'-terminated fields before the word */
int	cflag;		/* -c or -C: print one character per word */
int	fflag;		/* -f seen: keeps a later -b from switching to brfile */
int	vflag;		/* -v or -C: collect and report derivations */
int	xflag;		/* -x: trace dictionary probes on fd 2 */
int 	nflag;		/* NOTE(review): no use visible in this file */
char	word[500];	/* working copy of the current word; edited in place by the suffix routines */
char*	original;	/* the word as read from input */
Deriv	emptyderiv;	/* all-zero value used to clear deriv[] slots */
Deriv	deriv[DSIZ+3];	/* derivation steps, indexed by lev */
char	affix[DSIZ*10];	/* 10 is longest affix message */
int	prefcount;	/* set by tryword when vflag is on */
int 	suffcount;	/* set by tryword when vflag is on */
char*	acmeid;		/* with -a: start of the input line, printed before the word */
char	space[300000];	/* must be as large as "words"+"space" in pcode run */
Bits	encode[2048];	/* must be as long as "codes" in pcode run */
int	nencode;	/* number of entries read into encode[] */
char	voweltab[256];	/* non-zero for aeiouy in either case; filled in by main */
char*	spacep[128*128+1];	/* pointer to words starting with 'xx' */
Biobuf	bin;		/* standard input */
Biobuf	bout;		/* standard output */

char*	codefile = "/sys/lib/amspell";	/* default dictionary */
char*	brfile = "/sys/lib/brspell";	/* dictionary selected by -b */
char*	Usage = "usage";		/* exit status for argument errors */
453 
454 void
main(int argc,char * argv[])455 main(int argc, char *argv[])
456 {
457 	char *ep, *cp;
458 	char *dp;
459 	int j, i, c;
460 	int low;
461 	Bits h;
462 
463 	Binit(&bin, 0, OREAD);
464 	Binit(&bout, 1, OWRITE);
465 	for(i=0; c = "aeiouyAEIOUY"[i]; i++)
466 		voweltab[c] = 1;
467 	while(argc > 1) {
468 		if(argv[1][0] != '-')
469 			break;
470 		for(i=1; c = argv[1][i]; i++)
471 		switch(c) {
472 		default:
473 			fprint(2, "usage: spell [-bcCvx] [-f file]\n");
474 			exits(Usage);
475 
476 		case 'a':
477 			aflag++;
478 			continue;
479 
480 		case 'b':
481 			ise();
482 			if(!fflag)
483 				codefile = brfile;
484 			continue;
485 
486 		case 'C':		/* for "correct" */
487 			vflag++;
488 		case 'c':		/* for ocr */
489 			cflag++;
490 			continue;
491 
492 		case 'v':
493 			vflag++;
494 			continue;
495 
496 		case 'x':
497 			xflag++;
498 			continue;
499 
500 		case 'f':
501 			if(argc <= 2) {
502 				fprint(2, "spell: -f requires another argument\n");
503 				exits(Usage);
504 			}
505 			argv++;
506 			argc--;
507 			codefile = argv[1];
508 			fflag++;
509 			goto brk;
510 		}
511 	brk:
512 		argv++;
513 		argc--;
514 	}
515 	readdict(codefile);
516 	if(argc > 1) {
517 		fprint(2, "usage: spell [-bcCvx] [-f file]\n");
518 		exits(Usage);
519 	}
520 	if(aflag)
521 		cflag = vflag = 0;
522 
523 	for(;;) {
524 		affix[0] = 0;
525 		original = Brdline(&bin, '\n');
526 		if(original == 0)
527 			exits(0);
528 		original[Blinelen(&bin)-1] = 0;
529 		low = 0;
530 
531 		if(aflag) {
532 			acmeid = original;
533 			while(*original != ':')
534 				if(*original++ == 0)
535 					exits(0);
536 			while(*++original != ':')
537 				if(*original == 0)
538 					exits(0);
539 			*original++ = 0;
540 		}
541 		for(ep=word,dp=original; j = *dp; ep++,dp++) {
542 			if(ISLOWER(j))
543 				low++;
544 			if(ep >= word+sizeof(word)-1)
545 				break;
546 			*ep = j;
547 		}
548 		*ep = 0;
549 
550 		if(ISDIGIT(word[0]) && ordinal())
551 			continue;
552 
553 		h = 0;
554 		if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
555 			for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
556 				*dp = Tolower(*cp);
557 		if(!h)
558 		for(;;) {	/* at most twice */
559 			if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
560 				break;
561 			if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
562 				break;
563 			if(!ISUPPER(word[0]))
564 				break;
565 			cp = original;
566 			dp = word;
567 			while(*dp = *cp++) {
568 					if(!low)
569 						*dp = Tolower(*dp);
570 				dp++;
571 			}
572 			word[0] = Tolower(word[0]);
573 		}
574 
575 		if(cflag) {
576 			if(!h || Set(h,STOP))
577 				print("-");
578 			else if(!vflag)
579 				print("+");
580 			else
581 				print("%c",'0' + (suffcount>0) +
582 				   (prefcount>4? 8: 2*prefcount));
583 		} else if(!h || Set(h,STOP)) {
584 			if(aflag)
585 				Bprint(&bout, "%s:%s\n", acmeid, original);
586 			else
587 				Bprint(&bout, "%s\n", original);
588 		} else if(affix[0] != 0 && affix[0] != '.')
589 			print("%s\t%s\n", affix, original);
590 	}
591 	/* not reached */
592 }
593 
594 /*	strip exactly one suffix and do
595  *	indicated routine(s), which may recursively
596  *	strip suffixes
597  */
598 Bits
trysuff(char * ep,int lev,int flag)599 trysuff(char* ep, int lev, int flag)
600 {
601 	Suftab *t;
602 	char *cp, *sp;
603 	Bits h = 0;
604 	int initchar = ep[-1];
605 
606 	flag &= ~MONO;
607 	lev += DLEV;
608 	if(lev < DSIZ) {
609 		deriv[lev]  = emptyderiv;
610 		deriv[lev-1] = emptyderiv;
611 	}
612 	if(!ISLOWER(initchar))
613 		return h;
614 	for(t=suftab[initchar-'a']; sp=t->suf; t++) {
615 		cp = ep;
616 		while(*sp)
617 			if(*--cp != *sp++)
618 				goto next;
619 		for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
620 			;
621 		if(sp < word)
622 			continue;
623 		if(!(t->affixable & flag))
624 			return 0;
625 		h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
626 		if(!h && t->p2!=0) {
627 			if(lev < DSIZ) {
628 				deriv[lev] = emptyderiv;
629 				deriv[lev+1] = emptyderiv;
630 			}
631 			h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
632 		}
633 		break;
634 	next:;
635 	}
636 	return h;
637 }
638 
639 Bits
nop(char * ep,char * d,char * a,int lev,int flag)640 nop(char* ep, char* d, char* a, int lev, int flag)
641 {
642 	USED(ep, d, a, lev, flag);
643 	return 0;
644 }
645 
646 Bits
cstrip(char * ep,char * d,char * a,int lev,int flag)647 cstrip(char* ep, char* d, char* a, int lev, int flag)
648 {
649 	int temp = ep[0];
650 
651 	if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
652 		switch(pair(ep[-1],ep[0])) {
653 		case pair('a', 'a'):
654 		case pair('a', 'e'):
655 		case pair('a', 'i'):
656 		case pair('e', 'a'):
657 		case pair('e', 'e'):
658 		case pair('e', 'i'):
659 		case pair('i', 'i'):
660 		case pair('o', 'a'):
661 			return 0;
662 		}
663 	} else
664 	if(temp==ep[-1]&&temp==ep[-2])
665 		return 0;
666 	return strip(ep,d,a,lev,flag);
667 }
668 
669 Bits
strip(char * ep,char * d,char * a,int lev,int flag)670 strip(char* ep, char* d, char* a, int lev, int flag)
671 {
672 	Bits h = trypref(ep, a, lev, flag);
673 
674 	USED(d);
675 	if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
676 		h = 0;
677 	if(h)
678 		return h;
679 	if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
680 		h = trypref(ep-1,a,lev,flag|MONO);
681 		if(h)
682 			return h;
683 	}
684 	return trysuff(ep,lev,flag);
685 }
686 
687 Bits
s(char * ep,char * d,char * a,int lev,int flag)688 s(char* ep, char* d, char* a, int lev, int flag)
689 {
690 	if(lev > DLEV+1)
691 		return 0;
692 	if(*ep=='s') {
693 		switch(ep[-1]) {
694 		case 'y':
695 			if(ISVOWEL(ep[-2])||ISUPPER(*word))
696 				break;	/*says Kennedys*/
697 		case 'x':
698 		case 'z':
699 		case 's':
700 			return 0;
701 		case 'h':
702 			switch(ep[-2]) {
703 			case 'c':
704 			case 's':
705 				return 0;
706 			}
707 		}
708 	}
709 	return strip(ep,d,a,lev,flag);
710 }
711 
712 Bits
an(char * ep,char * d,char * a,int lev,int flag)713 an(char* ep, char* d, char* a, int lev, int flag)
714 {
715 	USED(d);
716 	if(!ISUPPER(*word))	/*must be proper name*/
717 		return 0;
718 	return trypref(ep,a,lev,flag);
719 }
720 
721 Bits
ize(char * ep,char * d,char * a,int lev,int flag)722 ize(char* ep, char* d, char* a, int lev, int flag)
723 {
724 	int temp = ep[-1];
725 	Bits h;
726 
727 	USED(a);
728 	ep[-1] = 'e';
729 	h = strip(ep,"",d,lev,flag);
730 	ep[-1] = temp;
731 	return h;
732 }
733 
734 Bits
y_to_e(char * ep,char * d,char * a,int lev,int flag)735 y_to_e(char* ep, char* d, char* a, int lev, int flag)
736 {
737 	Bits h;
738 	int  temp;
739 
740 	USED(a);
741 	switch(ep[-1]) {
742 	case 'a':
743 	case 'e':
744 	case 'i':
745 		return 0;
746 	}
747 	temp = *ep;
748 	*ep++ = 'e';
749 	h = strip(ep,"",d,lev,flag);
750 	ep[-1] = temp;
751 	return h;
752 }
753 
754 Bits
ily(char * ep,char * d,char * a,int lev,int flag)755 ily(char* ep, char* d, char* a, int lev, int flag)
756 {
757 	int temp = ep[0];
758 	char *cp = ep;
759 
760 	if(temp==ep[-1]&&temp==ep[-2])		/* sillly */
761 		return 0;
762 	if(*--cp=='y' && !ISVOWEL(*--cp))	/* happyly */
763 		while(cp>word)
764 			if(ISVOWEL(*--cp))	/* shyness */
765 				return 0;
766 	if(ep[-1]=='i')
767 		return i_to_y(ep,d,a,lev,flag);
768 	return cstrip(ep,d,a,lev,flag);
769 }
770 
771 Bits
bility(char * ep,char * d,char * a,int lev,int flag)772 bility(char* ep, char* d, char* a, int lev, int flag)
773 {
774 	*ep++ = 'l';
775 	return y_to_e(ep,d,a,lev,flag);
776 }
777 
778 Bits
i_to_y(char * ep,char * d,char * a,int lev,int flag)779 i_to_y(char* ep, char* d, char* a, int lev, int flag)
780 {
781 	Bits h;
782 	int temp;
783 
784 	if(ISUPPER(*word))
785 		return 0;
786 	if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
787 		ep[-1] = 'y';
788 		a = d;
789 	}
790 	h = cstrip(ep,"",a,lev,flag);
791 	ep[-1] = temp;
792 	return h;
793 }
794 
795 Bits
es(char * ep,char * d,char * a,int lev,int flag)796 es(char* ep, char* d, char* a, int lev, int flag)
797 {
798 	if(lev>DLEV)
799 		return 0;
800 	switch(ep[-1]) {
801 	default:
802 		return 0;
803 	case 'i':
804 		return i_to_y(ep,d,a,lev,flag);
805 	case 'h':
806 		switch(ep[-2]) {
807 		default:
808 			return 0;
809 		case 'c':
810 		case 's':
811 			break;
812 		}
813 	case 's':
814 	case 'z':
815 	case 'x':
816 		return strip(ep,d,a,lev,flag);
817 	}
818 }
819 
820 Bits
subst(char * ep,char * d,char * a,int lev,int flag)821 subst(char* ep, char* d, char* a, int lev, int flag)
822 {
823 	char *u,*t;
824 	Bits h;
825 
826 	USED(a);
827 	if(skipv(skipv(ep-1)) < word)
828 		return 0;
829 	for(t=d; *t!='+'; t++)
830 		continue;
831 	for(u=ep; *--t!='-';)
832 		*--u = *t;
833 	h = strip(ep,"",d,lev,flag);
834 	while(*++t != '+')
835 		continue;
836 	while(*++t)
837 		*u++ = *t;
838 	return h;
839 }
840 
841 Bits
tion(char * ep,char * d,char * a,int lev,int flag)842 tion(char* ep, char* d, char* a, int lev, int flag)
843 {
844 	switch(ep[-2]) {
845 	default:
846 		return trypref(ep,a,lev,flag);
847 	case 'a':
848 	case 'e':
849 	case 'i':
850 	case 'o':
851 	case 'u':
852 		return y_to_e(ep,d,a,lev,flag);
853 	}
854 }
855 
856 /*
857  * possible consonant-consonant-e ending
858  */
859 Bits
CCe(char * ep,char * d,char * a,int lev,int flag)860 CCe(char* ep, char* d, char* a, int lev, int flag)
861 {
862 	Bits h;
863 
864 	switch(ep[-1]) {
865 	case 'l':
866 		if(ISVOWEL(ep[-2]))
867 			break;
868 		switch(ep[-2]) {
869 		case 'l':
870 		case 'r':
871 		case 'w':
872 			break;
873 		default:
874 			return y_to_e(ep,d,a,lev,flag);
875 		}
876 		break;
877 	case 'c':
878 	case 'g':
879 		if(*ep == 'a')	/* prevent -able for -eable */
880 			return 0;
881 	case 's':
882 	case 'v':
883 	case 'z':
884 		if(ep[-2]==ep[-1])
885 			break;
886 		if(ISVOWEL(ep[-2]))
887 			break;
888 	case 'u':
889 		if(h = y_to_e(ep,d,a,lev,flag))
890 			return h;
891 		if(!(ep[-2]=='n' && ep[-1]=='g'))
892 			return 0;
893 	}
894 	return VCe(ep,d,a,lev,flag);
895 }
896 
897 /*
898  * possible consonant-vowel-consonant-e ending
899  */
900 Bits
VCe(char * ep,char * d,char * a,int lev,int flag)901 VCe(char* ep, char* d, char* a, int lev, int flag)
902 {
903 	int c;
904 	Bits h;
905 
906 	c = ep[-1];
907 	if(c=='e')
908 		return 0;
909 	if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
910 		c = *ep;
911 		*ep++ = 'e';
912 		h = trypref(ep,d,lev,flag);
913 		if(!h)
914 			h = trysuff(ep,lev,flag);
915 		if(h)
916 			return h;
917 		ep--;
918 		*ep = c;
919 	}
920 	return cstrip(ep,d,a,lev,flag);
921 }
922 
923 Ptab*
lookuppref(uchar ** wp,char * ep)924 lookuppref(uchar** wp, char* ep)
925 {
926 	Ptab *sp;
927 	uchar *bp,*cp;
928 	unsigned int initchar = Tolower(**wp);
929 
930 	if(!ISALPHA(initchar))
931 		return 0;
932 	for(sp=preftab[initchar-'a'];sp->s;sp++) {
933 		bp = *wp;
934 		for(cp= (uchar*)sp->s;*cp; )
935 			if(*bp++!=*cp++)
936 				goto next;
937 		for(cp=bp;cp<(uchar*)ep;cp++)
938 			if(ISVOWEL(*cp)) {
939 				*wp = bp;
940 				return sp;
941 			}
942 	next:;
943 	}
944 	return 0;
945 }
946 
947 /*	while word is not in dictionary try stripping
948  *	prefixes. Fail if no more prefixes.
949  */
950 Bits
trypref(char * ep,char * a,int lev,int flag)951 trypref(char* ep, char* a, int lev, int flag)
952 {
953 	Ptab *tp;
954 	char *bp, *cp;
955 	char *pp;
956 	Bits h;
957 	char space[20];
958 
959 	if(lev<DSIZ) {
960 		deriv[lev].mesg = a;
961 		deriv[lev].type = *a=='.'? NONE: SUFF;
962 	}
963 	if(h = tryword(word,ep,lev,flag)) {
964 		if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
965 			return h;
966 		h = 0;
967 	}
968 	bp = word;
969 	pp = space;
970 	if(lev<DSIZ) {
971 		deriv[lev+1].mesg = pp;
972 		deriv[lev+1].type = 0;
973 	}
974 	while(tp=lookuppref((uchar**)&bp,ep)) {
975 		*pp++ = '+';
976 		cp = tp->s;
977 		while(pp<space+sizeof(space) && (*pp = *cp++))
978 			pp++;
979 		deriv[lev+1].type += PREF;
980 		h = tryword(bp,ep,lev+1,flag);
981 		if(Set(h,NOPREF) ||
982 		   ((tp->flag&IN) && inun(bp-2,h)==0)) {
983 			h = 0;
984 			break;
985 		}
986 		if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
987 			break;
988 		h = 0;
989 	}
990 	if(lev < DSIZ) {
991 		deriv[lev+1] = emptyderiv;
992 		deriv[lev+2] = emptyderiv;
993 	}
994 	return h;
995 }
996 
997 Bits
tryword(char * bp,char * ep,int lev,int flag)998 tryword(char* bp, char* ep, int lev, int flag)
999 {
1000 	int  j;
1001 	Bits h = 0;
1002 	char duple[3];
1003 
1004 	if(ep-bp <= 1)
1005 		return h;
1006 	if(flag&MONO) {
1007 		if(lev<DSIZ) {
1008 			deriv[++lev].mesg = duple;
1009 			deriv[lev].type = SUFF;
1010 		}
1011 		duple[0] = '+';
1012 		duple[1] = *ep;
1013 		duple[2] = 0;
1014 	}
1015 	h = dict(bp, ep);
1016 	if(vflag==0 || h==0)
1017 		return h;
1018 	/*
1019 	 * when derivations are wanted, collect them
1020 	 * for printing
1021 	 */
1022 	j = lev;
1023 	prefcount = suffcount = 0;
1024 	do {
1025 		if(j<DSIZ && deriv[j].type) {
1026 			strcat(affix, deriv[j].mesg);
1027 			if(deriv[j].type == SUFF)
1028 				suffcount++;
1029 			else if(deriv[j].type != NONE)
1030 				prefcount = deriv[j].type/PREF;
1031 		}
1032 	} while(--j > 0);
1033 	return h;
1034 }
1035 
1036 int
inun(char * bp,Bits h)1037 inun(char* bp, Bits h)
1038 {
1039 	if(*bp == 'u')
1040 		return Set(h, IN) == 0;
1041 	/* *bp == 'i' */
1042 	if(Set(h, IN) == 0)
1043 		return 0;
1044 	switch(bp[2]) {
1045 	case 'r':
1046 		return bp[1] == 'r';
1047 	case 'm':
1048 	case 'p':
1049 		return bp[1] == 'm';
1050 	}
1051 	return bp[1] == 'n';
1052 }
1053 
1054 char*
skipv(char * s)1055 skipv(char *s)
1056 {
1057 	if(s >= word && ISVOWEL(*s))
1058 		s--;
1059 	while(s >= word && !ISVOWEL(*s))
1060 		s--;
1061 	return s;
1062 }
1063 
1064 /*
1065  * crummy way to Britishise
1066  */
1067 void
ise(void)1068 ise(void)
1069 {
1070 	Suftab *p;
1071 	int i;
1072 
1073 	for(i=0; i<26; i++)
1074 		for(p = suftab[i]; p->suf; p++) {
1075 			p->suf = ztos(p->suf);
1076 			p->d1 = ztos(p->d1);
1077 			p->a1 = ztos(p->a1);
1078 		}
1079 }
1080 
/*
 * return a version of as with every 'z' replaced by 's'.
 * a string without 'z' is returned as is; otherwise the result
 * is a freshly allocated copy and as itself is not modified.
 * if the copy cannot be allocated, as is returned unchanged
 * rather than writing through a nil pointer.
 */
char*
ztos(char *as)
{
	char *s, *ds;
	long n;

	for(s=as; *s; s++)
		if(*s == 'z')
			break;
	if(*s == 0)
		return as;

	n = strlen(as)+1;
	ds = malloc(n);
	if(ds == 0)
		return as;
	memmove(ds, as, n);
	for(s=ds; *s; s++)
		if(*s == 'z')
			*s = 's';
	return ds;
}
1098 
1099 Bits
dict(char * bp,char * ep)1100 dict(char* bp, char* ep)
1101 {
1102 	char *cp, *cp1, *w, *wp, *we;
1103 	int n, f;
1104 
1105 	w = bp;
1106 	we = ep;
1107 	n = ep-bp;
1108 	if(n <= 1)
1109 		return NOUN;
1110 
1111 	f = w[0] & 0x7f;
1112 	f *= 128;
1113 	f += w[1] & 0x7f;
1114 	bp = spacep[f];
1115 	ep = spacep[f+1];
1116 
1117 loop:
1118 	if(bp >= ep) {
1119 		if(xflag)
1120 			fprint(2, "=%.*s\n", utfnlen(w, n), w);
1121 		return 0;
1122 	}
1123 	/*
1124 	 * find the beginning of some word in the middle
1125 	 */
1126 	cp = bp + (ep-bp)/2;
1127 
1128 	while(cp > bp && !(*cp & 0x80))
1129 		cp--;
1130 	while(cp > bp && (cp[-1] & 0x80))
1131 		cp--;
1132 
1133 	wp = w + 2;	/* skip two letters */
1134 	cp1 = cp + 2;	/* skip affix code */
1135 	for(;;) {
1136 		if(wp >= we) {
1137 			if(*cp1 & 0x80)
1138 				goto found;
1139 			else
1140 				f = 1;
1141 			break;
1142 		}
1143 		if(*cp1 & 0x80) {
1144 			f = -1;
1145 			break;
1146 		}
1147 		f = *cp1++ - *wp++;
1148 		if(f != 0)
1149 			break;
1150 	}
1151 
1152 	if(f < 0) {
1153 		while(!(*cp1 & 0x80))
1154 			cp1++;
1155 		bp = cp1;
1156 		goto loop;
1157 	}
1158 	ep = cp;
1159 	goto loop;
1160 
1161 found:
1162 	f = ((cp[0] & 0x7) << 8) |
1163 		(cp[1] & 0xff);
1164 	if(xflag) {
1165 		fprint(2, "=%.*s ", utfnlen(w, n), w);
1166 		typeprint(encode[f]);
1167 	}
1168 	return encode[f];
1169 }
1170 
1171 void
typeprint(Bits h)1172 typeprint(Bits h)
1173 {
1174 
1175 	pcomma("");
1176 	if(h & NOUN)
1177 		pcomma("n");
1178 	if(h & PROP_COLLECT)
1179 		pcomma("pc");
1180 	if(h & VERB) {
1181 		if((h & VERB) == VERB)
1182 			pcomma("v");
1183 		else
1184 		if((h & VERB) == V_IRREG)
1185 			pcomma("vi");
1186 		else
1187 		if(h & ED)
1188 			pcomma("ed");
1189 	}
1190 	if(h & ADJ)
1191 		pcomma("a");
1192 	if(h & COMP) {
1193 		if((h & COMP) == ACTOR)
1194 			pcomma("er");
1195 		else
1196 			pcomma("comp");
1197 	}
1198 	if(h & DONT_TOUCH)
1199 		pcomma("d");
1200 	if(h & N_AFFIX)
1201 		pcomma("na");
1202 	if(h & ADV)
1203 		pcomma("adv");
1204 	if(h & ION)
1205 		pcomma("ion");
1206 	if(h & V_AFFIX)
1207 		pcomma("va");
1208 	if(h & MAN)
1209 		pcomma("man");
1210 	if(h & NOPREF)
1211 		pcomma("nopref");
1212 	if(h & MONO)
1213 		pcomma("ms");
1214 	if(h & IN)
1215 		pcomma("in");
1216 	if(h & _Y)
1217 		pcomma("y");
1218 	if(h & STOP)
1219 		pcomma("s");
1220 	fprint(2, "\n");
1221 }
1222 
/*
 * print s on fd 2 as the next item of a comma-separated list.
 * calling it with an empty string starts a new list: nothing is
 * printed and the next item will have no comma before it.
 */
void
pcomma(char *s)
{
	static int flag;	/* non-zero once the current list has an item */

	if(*s == 0) {
		flag = 0;
		return;
	}
	if(!flag) {
		fprint(2, "%s", s);
		flag = 1;
	} else
		fprint(2, ",%s", s);
}
1238 
1239 /*
1240  * is the word on of the following
1241  *	12th	teen
1242  *	21st	end in 1
1243  *	23rd	end in 3
1244  *	77th	default
1245  * called knowing word[0] is a digit
1246  */
1247 int
ordinal(void)1248 ordinal(void)
1249 {
1250 	char *cp = word;
1251 	static char sp[4];
1252 
1253 	while(ISDIGIT(*cp))
1254 		cp++;
1255 	strncpy(sp,cp,3);
1256 	if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
1257 		sp[0] = Tolower(cp[0]);
1258 		sp[1] = Tolower(cp[1]);
1259 	}
1260 	return 0 == strncmp(sp,
1261 		cp[-2]=='1'? "th":	/* out of bounds if 1 digit */
1262 		*--cp=='1'? "st":	/* harmless */
1263 		*cp=='2'? "nd":
1264 		*cp=='3'? "rd":
1265 		"th", 3);
1266 }
1267 
1268 /*
1269  * read in the dictionary.
1270  * format is
1271  * {
1272  *	short	nencode;
1273  *	long	encode[nencode];
1274  *	char	space[*];
1275  * };
1276  *
1277  * the encodings are a table all different
1278  * affixes.
1279  * the dictionary proper has 2 bytes
1280  * that demark and then the rest of the
1281  * word. the 2 bytes have the following
1282  *	0x80 0x00	flag
1283  *	0x78 0x00	count of prefix bytes
1284  *			common with prev word
1285  *	0x07 0xff	affix code
1286  *
1287  * all ints are big endians in the file.
1288  */
1289 void
readdict(char * file)1290 readdict(char *file)
1291 {
1292 	char *s, *is, *lasts, *ls;
1293 	int c, i, sp, p;
1294 	int f;
1295 	long l;
1296 
1297 	lasts = 0;
1298 	f = open(file, 0);
1299 	if(f == -1) {
1300 		fprint(2, "cannot open %s\n", file);
1301 		exits("open");
1302 	}
1303 	if(read(f, space, 2) != 2)
1304 		goto bad;
1305 	nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
1306 	if(read(f, space, 4*nencode) != 4*nencode)
1307 		goto bad;
1308 	s = space;
1309 	for(i=0; i<nencode; i++) {
1310 		l = (long)(s[0] & 0xff) << 24;
1311 		l |= (s[1] & 0xff) << 16;
1312 		l |= (s[2] & 0xff) << 8;
1313 		l |= s[3] & 0xff;
1314 		encode[i] = (Bits)l;
1315 		s += 4;
1316 	}
1317 	l = read(f, space, sizeof(space));
1318 	if(l == sizeof(space))
1319 		goto noroom;
1320 	is = space + (sizeof(space) - l);
1321 	memmove(is, space, l);
1322 
1323 	s = space;
1324 	c = *is++ & 0xff;
1325 	sp = -1;
1326 	i = 0;
1327 
1328 loop:
1329 	if(s > is)
1330 		goto noroom;
1331 	if(c < 0) {
1332 		close(f);
1333 		while(sp < 128*128)
1334 			spacep[++sp] = s;
1335 		*s = 0x80;		/* fence */
1336 		return;
1337 	}
1338 	p = (c>>3) & 0xf;
1339 	*s++ = c;
1340 	*s++ = *is++ & 0xff;
1341 	if(p <= 0)
1342 		i = (*is++ & 0xff)*128;
1343 	if(p <= 1) {
1344 		if(!(*is & 0x80))
1345 			i = i/128*128 + (*is++ & 0xff);
1346 		if(i <= sp) {
1347 			fprint(2, "the dict isnt sorted or \n");
1348 			fprint(2, "memmove didn't work\n");
1349 			goto bad;
1350 		}
1351 		while(sp < i)
1352 			spacep[++sp] = s-2;
1353 	}
1354 	ls = lasts;
1355 	lasts = s;
1356 	for(p-=2; p>0; p--)
1357 		*s++ = *ls++;
1358 	for(;;) {
1359 		if(is >= space+sizeof(space)) {
1360 			c = -1;
1361 			break;
1362 		}
1363 		c = *is++ & 0xff;
1364 		if(c & 0x80)
1365 			break;
1366 		*s++ = c;
1367 	}
1368 	*s = 0;
1369 	goto loop;
1370 
1371 bad:
1372 	fprint(2, "trouble reading %s\n", file);
1373 	exits("read");
1374 noroom:
1375 	fprint(2, "not enough space for dictionary\n");
1376 	exits("space");
1377 }
1378