1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include "code.h"
6
/* fig leaves for possibly signed char quantities */
/*
 * Each classifier masks its argument with 0xff before use, so a char
 * with the high bit set is never passed on as a negative value.
 */
#define ISUPPER(c)	isupper((c)&0xff)
#define ISLOWER(c)	islower((c)&0xff)
#define ISALPHA(c)	isalpha((c)&0xff)
#define ISDIGIT(c)	isdigit((c)&0xff)
/* non-zero for the bytes main() marks in voweltab: aeiouy in both cases */
#define ISVOWEL(c)	voweltab[(c)&0xff]
/* NOTE: expands its argument more than once; do not pass side effects */
#define Tolower(c)	(ISUPPER(c)? (c)-'A'+'a': (c))
/* pack two characters into one integer, used for two-letter switch cases */
#define pair(a,b)	(((a)<<8) | (b))
/* amount trysuff() adds to the derivation level on every call */
#define DLEV		2
/* derivation levels at or above DSIZ are not recorded in deriv[] */
#define DSIZ		40

/* set of word-class/affix flag bits; the bit names come from code.h */
typedef	long	Bits;
/* bits of h that are also in f (non-zero when they intersect) */
#define Set(h, f)	((long)(h) & (f))
20
/*
 * Suffix routines.  All share one signature so they can be stored in
 * Suftab.p1/p2 and called from trysuff() as
 *	(*p)(ep-n, d, a, lev, flag)
 */
Bits	nop(char*, char*, char*, int, int);
Bits	strip(char*, char*, char*, int, int);
Bits	ize(char*, char*, char*, int, int);
Bits	i_to_y(char*, char*, char*, int, int);
Bits	ily(char*, char*, char*, int, int);
Bits	subst(char*, char*, char*, int, int);
Bits	CCe(char*, char*, char*, int, int);
Bits	tion(char*, char*, char*, int, int);
Bits	an(char*, char*, char*, int, int);
Bits	s(char*, char*, char*, int, int);
Bits	es(char*, char*, char*, int, int);
Bits	bility(char*, char*, char*, int, int);
Bits	y_to_e(char*, char*, char*, int, int);
Bits	VCe(char*, char*, char*, int, int);

/* lookup drivers and dictionary access */
Bits	trypref(char*, char*, int, int);
Bits	tryword(char*, char*, int, int);
Bits	trysuff(char*, int, int);
Bits	dict(char*, char*);
void	typeprint(Bits);
void	pcomma(char*);

/* miscellaneous helpers */
void	ise(void);
int	ordinal(void);
char*	skipv(char*);
int	inun(char*, Bits);
char*	ztos(char*);
void	readdict(char*);
49
/*
 * One prefix rule.  A table of these ends with an entry whose s is 0.
 */
typedef	struct	Ptab	Ptab;
struct	Ptab
{
	char*	s;	/* prefix text, compared against the start of the word */
	int	flag;	/* trypref() tests IN here and then consults inun() */
};

/*
 * One suffix rule.  A table of these ends with an entry whose suf is 0.
 * trysuff() calls p1 first and, only if that returns 0 and p2 is set,
 * then calls p2.
 */
typedef	struct	Suftab	Suftab;
struct	Suftab
{
	char	*suf;		/* suffix spelled backwards; matched from the word's end */
	Bits	(*p1)(char*, char*, char*, int, int);	/* first routine to try */
	int	n1;		/* p1 is handed ep-n1 as its end pointer */
	char	*d1;		/* "d" argument for p1 */
	char	*a1;		/* "a" argument for p1 */
	int	flag;		/* passed, or'ed with STOP, as the routines' flag argument */
	int	affixable;	/* trysuff() returns 0 unless this shares a bit with its flag */
	Bits	(*p2)(char*, char*, char*, int, int);	/* fallback routine, may be 0 */
	int	n2;		/* p2 is handed ep-n2 as its end pointer */
	char	*d2;		/* "d" argument for p2 */
	char	*a2;		/* "a" argument for p2 */
};
72
/*
 * Suffix tables, one per final letter of the word (see suftab[] for
 * the letter-to-table mapping).  Suffixes are written backwards
 * because trysuff() compares them while walking from the end of the
 * word toward its start.  Entries are tried in order and the first
 * whose suffix matches (and leaves a vowel in the stem) decides the
 * outcome.  Every table ends with a 0 entry.
 */
Suftab	staba[] = {
	{"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
	0
};

Suftab	stabc[] =
{
	{"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
	{"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
	{"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
	{"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
	{"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
	{"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
	{"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
	0
};
Suftab	stabd[] =
{
	{"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
	{"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
	0
};
Suftab	stabe[] =
{
	/*
	 * V_affix for comment ->commence->commentment??
	 */
	{"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
	{"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
	{"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
	{"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
	{"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
	{"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
	{"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
	0
};
Suftab	stabg[] =
{
	{"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
	{"gnikam",strip,6,"","+making",NOUN,NOUN},
	{"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
	{"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
	0
};
Suftab	stabl[] =
{
	{"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
	{"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
	{"latnem",strip,2,"","+al",N_AFFIX,ADJ},
	{"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
	{"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
	0
};
128 Suftab stabm[] =
129 {
130 /* congregational + ism */
131 {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
132 {"margo",subst,-1,"-ph+m","",NOUN,NOUN},
133 0
134 };
/*
 * Suffix rules for words ending in n, p, r, s, t and y.  As in the
 * other suffix tables the suffix text is spelled backwards, order
 * matters (the first matching entry is the only one used), and each
 * table ends with a 0 entry.
 */
Suftab	stabn[] =
{
	{"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
	{"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
	{"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
	{"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
	{"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
	{"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
	{"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
	{"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
	{"nem",strip,3,"","+man",MAN,PROP_COLLECT},
	{"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
	0
};
Suftab	stabp[] =
{
	{"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
	0
};
Suftab	stabr[] =
{
	{"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
	{"reyhparg",nop,0,"","",0,NOUN},
	{"reyl",nop,0,"","",0,NOUN},
	{"rekam",strip,5,"","+maker",NOUN,NOUN},
	{"repeek",strip,6,"","+keeper",NOUN,NOUN},
	{"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ,	i_to_y,2,"-y+ier","+er"},
	{"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
	{"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
	{"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
	0
};
Suftab	stabs[] =
{
	{"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
	{"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
	{"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH ,	es,2,"-y+ies","+es"},
	{"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
	{"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH  },
	0
};
Suftab	stabt[] =
{
	{"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
	{"tse",strip,2,"","+st",EST,DONT_TOUCH,	i_to_y,3,"-y+iest","+est" },
	{"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
	{"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
	0
};
Suftab	staby[] =
{
	{"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
	{"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
	{"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
	{"ytisuo",nop,0,"","",NOUN},
	{"ytilb",nop,0,"","",0,NOUN},
	{"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
	{"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
	{"ylc",nop,0,"","",0},
	{"ylelb",nop,0,"","",0},
	{"ylelp",nop,0,"","",0},
	{"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
	{"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
	{"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
	0
};
/* empty table, used for every final letter that has no suffix rules */
Suftab	stabz[] =
{
	0
};
/*
 * Suffix table for each final letter; trysuff() indexes this with
 * (last character - 'a'), so there must be one slot per letter a..z.
 */
Suftab*	suftab[] =
{
	staba,
	stabz,
	stabc,
	stabd,
	stabe,
	stabz,
	stabg,
	stabz,
	stabz,
	stabz,
	stabz,
	stabl,
	stabm,
	stabn,
	stabz,
	stabp,
	stabz,
	stabr,
	stabs,
	stabt,
	stabz,
	stabz,
	stabz,
	stabz,
	staby,
	stabz,
};
234
235 Ptab ptaba[] =
236 {
237 "anti", 0,
238 "auto", 0,
239 0
240 };
241 Ptab ptabb[] =
242 {
243 "bio", 0,
244 0
245 };
246 Ptab ptabc[] =
247 {
248 "counter", 0,
249 0
250 };
251 Ptab ptabd[] =
252 {
253 "dis", 0,
254 0
255 };
256 Ptab ptabe[] =
257 {
258 "electro", 0,
259 0
260 };
261 Ptab ptabf[] =
262 {
263 "femto", 0,
264 0
265 };
266 Ptab ptabg[] =
267 {
268 "geo", 0,
269 "giga", 0,
270 0
271 };
272 Ptab ptabh[] =
273 {
274 "hyper", 0,
275 0
276 };
277 Ptab ptabi[] =
278 {
279 "immuno", 0,
280 "im", IN,
281 "intra", 0,
282 "inter", 0,
283 "in", IN,
284 "ir", IN,
285 "iso", 0,
286 0
287 };
288 Ptab ptabj[] =
289 {
290 0
291 };
292 Ptab ptabk[] =
293 {
294 "kilo", 0,
295 0
296 };
297 Ptab ptabl[] =
298 {
299 0
300 };
301 Ptab ptabm[] =
302 {
303 "magneto", 0,
304 "mega", 0,
305 "meta", 0,
306 "micro", 0,
307 "mid", 0,
308 "milli", 0,
309 "mini", 0,
310 "mis", 0,
311 "mono", 0,
312 "multi", 0,
313 0
314 };
315 Ptab ptabn[] =
316 {
317 "nano", 0,
318 "neuro", 0,
319 "non", 0,
320 0
321 };
322 Ptab ptabo[] =
323 {
324 "out", 0,
325 "over", 0,
326 0
327 };
328 Ptab ptabp[] =
329 {
330 "para", 0,
331 "photo", 0,
332 "pico", 0,
333 "poly", 0,
334 "pre", 0,
335 "pseudo", 0,
336 "psycho", 0,
337 0
338 };
339 Ptab ptabq[] =
340 {
341 "quasi", 0,
342 0
343 };
344 Ptab ptabr[] =
345 {
346 "radio", 0,
347 "re", 0,
348 0
349 };
350 Ptab ptabs[] =
351 {
352 "semi", 0,
353 "stereo", 0,
354 "sub", 0,
355 "super", 0,
356 0
357 };
358 Ptab ptabt[] =
359 {
360 "tele", 0,
361 "tera", 0,
362 "thermo", 0,
363 0
364 };
365 Ptab ptabu[] =
366 {
367 "ultra", 0,
368 "under", 0, /*must precede un*/
369 "un", IN,
370 0
371 };
372 Ptab ptabv[] =
373 {
374 0
375 };
376 Ptab ptabw[] =
377 {
378 0
379 };
380 Ptab ptabx[] =
381 {
382 0
383 };
384 Ptab ptaby[] =
385 {
386 0
387 };
388 Ptab ptabz[] =
389 {
390 0
391 };
392
393 Ptab* preftab[] =
394 {
395 ptaba,
396 ptabb,
397 ptabc,
398 ptabd,
399 ptabe,
400 ptabf,
401 ptabg,
402 ptabh,
403 ptabi,
404 ptabj,
405 ptabk,
406 ptabl,
407 ptabm,
408 ptabn,
409 ptabo,
410 ptabp,
411 ptabq,
412 ptabr,
413 ptabs,
414 ptabt,
415 ptabu,
416 ptabv,
417 ptabw,
418 ptabx,
419 ptaby,
420 ptabz,
421 };
422
/*
 * One recorded step of a derivation: the affix message and its kind.
 * trypref() adds PREF to type once per stripped prefix, so tryword()
 * can recover the prefix count as type/PREF.
 */
typedef struct {
	char *mesg;
	enum { NONE, SUFF, PREF} type;
} Deriv;

int	aflag;		/* -a: input lines carry a "…:…:" tag that is echoed with misses */
int	cflag;		/* -c or -C: print one status character per word */
int	fflag;		/* -f: dictionary file named on the command line */
int	vflag;		/* -v or -C: collect derivations in affix[] */
int	xflag;		/* -x: dict() traces every lookup on fd 2 */
int	nflag;		/* not referenced in the code visible here */
char	word[500];	/* working copy of the current word; edited in place */
char*	original;	/* the word as read from the input */
Deriv	emptyderiv;	/* all-zero value used to clear deriv[] slots */
Deriv	deriv[DSIZ+3];	/* affix messages, indexed by derivation level */
char	affix[DSIZ*10];	/* 10 is longest affix message */
int	prefcount;	/* prefixes in the derivation last collected by tryword() */
int	suffcount;	/* suffixes in the derivation last collected by tryword() */
char*	acmeid;		/* with -a, the tag that preceded the word on its line */
char	space[300000];	/* must be as large as "words"+"space" in pcode run */
Bits	encode[2048];	/* must be as long as "codes" in pcode run */
int	nencode;	/* number of entries read into encode[] */
char	voweltab[256];	/* non-zero for vowels; backs ISVOWEL() */
char*	spacep[128*128+1];	/* pointer to words starting with 'xx' */
Biobuf	bin;		/* buffered standard input */
Biobuf	bout;		/* buffered standard output */

char*	codefile = "/sys/lib/amspell";	/* default dictionary */
char*	brfile = "/sys/lib/brspell";	/* dictionary selected by -b */
char*	Usage = "usage";		/* exit status for command-line errors */
453
454 void
main(int argc,char * argv[])455 main(int argc, char *argv[])
456 {
457 char *ep, *cp;
458 char *dp;
459 int j, i, c;
460 int low;
461 Bits h;
462
463 Binit(&bin, 0, OREAD);
464 Binit(&bout, 1, OWRITE);
465 for(i=0; c = "aeiouyAEIOUY"[i]; i++)
466 voweltab[c] = 1;
467 while(argc > 1) {
468 if(argv[1][0] != '-')
469 break;
470 for(i=1; c = argv[1][i]; i++)
471 switch(c) {
472 default:
473 fprint(2, "usage: spell [-bcCvx] [-f file]\n");
474 exits(Usage);
475
476 case 'a':
477 aflag++;
478 continue;
479
480 case 'b':
481 ise();
482 if(!fflag)
483 codefile = brfile;
484 continue;
485
486 case 'C': /* for "correct" */
487 vflag++;
488 case 'c': /* for ocr */
489 cflag++;
490 continue;
491
492 case 'v':
493 vflag++;
494 continue;
495
496 case 'x':
497 xflag++;
498 continue;
499
500 case 'f':
501 if(argc <= 2) {
502 fprint(2, "spell: -f requires another argument\n");
503 exits(Usage);
504 }
505 argv++;
506 argc--;
507 codefile = argv[1];
508 fflag++;
509 goto brk;
510 }
511 brk:
512 argv++;
513 argc--;
514 }
515 readdict(codefile);
516 if(argc > 1) {
517 fprint(2, "usage: spell [-bcCvx] [-f file]\n");
518 exits(Usage);
519 }
520 if(aflag)
521 cflag = vflag = 0;
522
523 for(;;) {
524 affix[0] = 0;
525 original = Brdline(&bin, '\n');
526 if(original == 0)
527 exits(0);
528 original[Blinelen(&bin)-1] = 0;
529 low = 0;
530
531 if(aflag) {
532 acmeid = original;
533 while(*original != ':')
534 if(*original++ == 0)
535 exits(0);
536 while(*++original != ':')
537 if(*original == 0)
538 exits(0);
539 *original++ = 0;
540 }
541 for(ep=word,dp=original; j = *dp; ep++,dp++) {
542 if(ISLOWER(j))
543 low++;
544 if(ep >= word+sizeof(word)-1)
545 break;
546 *ep = j;
547 }
548 *ep = 0;
549
550 if(ISDIGIT(word[0]) && ordinal())
551 continue;
552
553 h = 0;
554 if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
555 for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
556 *dp = Tolower(*cp);
557 if(!h)
558 for(;;) { /* at most twice */
559 if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
560 break;
561 if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
562 break;
563 if(!ISUPPER(word[0]))
564 break;
565 cp = original;
566 dp = word;
567 while(*dp = *cp++) {
568 if(!low)
569 *dp = Tolower(*dp);
570 dp++;
571 }
572 word[0] = Tolower(word[0]);
573 }
574
575 if(cflag) {
576 if(!h || Set(h,STOP))
577 print("-");
578 else if(!vflag)
579 print("+");
580 else
581 print("%c",'0' + (suffcount>0) +
582 (prefcount>4? 8: 2*prefcount));
583 } else if(!h || Set(h,STOP)) {
584 if(aflag)
585 Bprint(&bout, "%s:%s\n", acmeid, original);
586 else
587 Bprint(&bout, "%s\n", original);
588 } else if(affix[0] != 0 && affix[0] != '.')
589 print("%s\t%s\n", affix, original);
590 }
591 /* not reached */
592 }
593
594 /* strip exactly one suffix and do
595 * indicated routine(s), which may recursively
596 * strip suffixes
597 */
598 Bits
trysuff(char * ep,int lev,int flag)599 trysuff(char* ep, int lev, int flag)
600 {
601 Suftab *t;
602 char *cp, *sp;
603 Bits h = 0;
604 int initchar = ep[-1];
605
606 flag &= ~MONO;
607 lev += DLEV;
608 if(lev < DSIZ) {
609 deriv[lev] = emptyderiv;
610 deriv[lev-1] = emptyderiv;
611 }
612 if(!ISLOWER(initchar))
613 return h;
614 for(t=suftab[initchar-'a']; sp=t->suf; t++) {
615 cp = ep;
616 while(*sp)
617 if(*--cp != *sp++)
618 goto next;
619 for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
620 ;
621 if(sp < word)
622 continue;
623 if(!(t->affixable & flag))
624 return 0;
625 h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
626 if(!h && t->p2!=0) {
627 if(lev < DSIZ) {
628 deriv[lev] = emptyderiv;
629 deriv[lev+1] = emptyderiv;
630 }
631 h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
632 }
633 break;
634 next:;
635 }
636 return h;
637 }
638
639 Bits
nop(char * ep,char * d,char * a,int lev,int flag)640 nop(char* ep, char* d, char* a, int lev, int flag)
641 {
642 USED(ep, d, a, lev, flag);
643 return 0;
644 }
645
646 Bits
cstrip(char * ep,char * d,char * a,int lev,int flag)647 cstrip(char* ep, char* d, char* a, int lev, int flag)
648 {
649 int temp = ep[0];
650
651 if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
652 switch(pair(ep[-1],ep[0])) {
653 case pair('a', 'a'):
654 case pair('a', 'e'):
655 case pair('a', 'i'):
656 case pair('e', 'a'):
657 case pair('e', 'e'):
658 case pair('e', 'i'):
659 case pair('i', 'i'):
660 case pair('o', 'a'):
661 return 0;
662 }
663 } else
664 if(temp==ep[-1]&&temp==ep[-2])
665 return 0;
666 return strip(ep,d,a,lev,flag);
667 }
668
669 Bits
strip(char * ep,char * d,char * a,int lev,int flag)670 strip(char* ep, char* d, char* a, int lev, int flag)
671 {
672 Bits h = trypref(ep, a, lev, flag);
673
674 USED(d);
675 if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
676 h = 0;
677 if(h)
678 return h;
679 if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
680 h = trypref(ep-1,a,lev,flag|MONO);
681 if(h)
682 return h;
683 }
684 return trysuff(ep,lev,flag);
685 }
686
687 Bits
s(char * ep,char * d,char * a,int lev,int flag)688 s(char* ep, char* d, char* a, int lev, int flag)
689 {
690 if(lev > DLEV+1)
691 return 0;
692 if(*ep=='s') {
693 switch(ep[-1]) {
694 case 'y':
695 if(ISVOWEL(ep[-2])||ISUPPER(*word))
696 break; /*says Kennedys*/
697 case 'x':
698 case 'z':
699 case 's':
700 return 0;
701 case 'h':
702 switch(ep[-2]) {
703 case 'c':
704 case 's':
705 return 0;
706 }
707 }
708 }
709 return strip(ep,d,a,lev,flag);
710 }
711
712 Bits
an(char * ep,char * d,char * a,int lev,int flag)713 an(char* ep, char* d, char* a, int lev, int flag)
714 {
715 USED(d);
716 if(!ISUPPER(*word)) /*must be proper name*/
717 return 0;
718 return trypref(ep,a,lev,flag);
719 }
720
721 Bits
ize(char * ep,char * d,char * a,int lev,int flag)722 ize(char* ep, char* d, char* a, int lev, int flag)
723 {
724 int temp = ep[-1];
725 Bits h;
726
727 USED(a);
728 ep[-1] = 'e';
729 h = strip(ep,"",d,lev,flag);
730 ep[-1] = temp;
731 return h;
732 }
733
734 Bits
y_to_e(char * ep,char * d,char * a,int lev,int flag)735 y_to_e(char* ep, char* d, char* a, int lev, int flag)
736 {
737 Bits h;
738 int temp;
739
740 USED(a);
741 switch(ep[-1]) {
742 case 'a':
743 case 'e':
744 case 'i':
745 return 0;
746 }
747 temp = *ep;
748 *ep++ = 'e';
749 h = strip(ep,"",d,lev,flag);
750 ep[-1] = temp;
751 return h;
752 }
753
754 Bits
ily(char * ep,char * d,char * a,int lev,int flag)755 ily(char* ep, char* d, char* a, int lev, int flag)
756 {
757 int temp = ep[0];
758 char *cp = ep;
759
760 if(temp==ep[-1]&&temp==ep[-2]) /* sillly */
761 return 0;
762 if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */
763 while(cp>word)
764 if(ISVOWEL(*--cp)) /* shyness */
765 return 0;
766 if(ep[-1]=='i')
767 return i_to_y(ep,d,a,lev,flag);
768 return cstrip(ep,d,a,lev,flag);
769 }
770
771 Bits
bility(char * ep,char * d,char * a,int lev,int flag)772 bility(char* ep, char* d, char* a, int lev, int flag)
773 {
774 *ep++ = 'l';
775 return y_to_e(ep,d,a,lev,flag);
776 }
777
778 Bits
i_to_y(char * ep,char * d,char * a,int lev,int flag)779 i_to_y(char* ep, char* d, char* a, int lev, int flag)
780 {
781 Bits h;
782 int temp;
783
784 if(ISUPPER(*word))
785 return 0;
786 if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
787 ep[-1] = 'y';
788 a = d;
789 }
790 h = cstrip(ep,"",a,lev,flag);
791 ep[-1] = temp;
792 return h;
793 }
794
795 Bits
es(char * ep,char * d,char * a,int lev,int flag)796 es(char* ep, char* d, char* a, int lev, int flag)
797 {
798 if(lev>DLEV)
799 return 0;
800 switch(ep[-1]) {
801 default:
802 return 0;
803 case 'i':
804 return i_to_y(ep,d,a,lev,flag);
805 case 'h':
806 switch(ep[-2]) {
807 default:
808 return 0;
809 case 'c':
810 case 's':
811 break;
812 }
813 case 's':
814 case 'z':
815 case 'x':
816 return strip(ep,d,a,lev,flag);
817 }
818 }
819
/*
 * Replace the ending of the stem according to the template d, which
 * has the form "-old+new", look the result up, then undo the edit.
 * The "old" text is written into word[] so that it ends just before
 * ep; afterwards the "new" text is written back starting at the same
 * place.  a is unused; d doubles as the affix message.
 * Refused (0) when two skipv() steps back from ep-1 run off the
 * front of the word.
 * NOTE(review): when "old" is longer than "new" the restore writes
 * fewer bytes than the edit did — confirm callers do not depend on
 * the bytes past the restored text.
 */
Bits
subst(char* ep, char* d, char* a, int lev, int flag)
{
	char *u,*t;
	Bits h;

	USED(a);
	if(skipv(skipv(ep-1)) < word)
		return 0;
	/* t: find the '+' that separates old from new */
	for(t=d; *t!='+'; t++)
		continue;
	/* copy "old" backwards so that it ends at ep; t stops on the '-' */
	for(u=ep; *--t!='-';)
		*--u = *t;
	h = strip(ep,"",d,lev,flag);
	/* advance t to the '+' again, then copy "new" back over the edit */
	while(*++t != '+')
		continue;
	while(*++t)
		*u++ = *t;
	return h;
}
840
841 Bits
tion(char * ep,char * d,char * a,int lev,int flag)842 tion(char* ep, char* d, char* a, int lev, int flag)
843 {
844 switch(ep[-2]) {
845 default:
846 return trypref(ep,a,lev,flag);
847 case 'a':
848 case 'e':
849 case 'i':
850 case 'o':
851 case 'u':
852 return y_to_e(ep,d,a,lev,flag);
853 }
854 }
855
/*
 * possible consonant-consonant-e ending
 *
 * Decides from the last two letters of the stem whether to try the
 * stem with an 'e' appended (y_to_e) before, or instead of, handing
 * it to VCe().  The switch relies on fall-through between cases.
 */
Bits
CCe(char* ep, char* d, char* a, int lev, int flag)
{
	Bits h;

	switch(ep[-1]) {
	case 'l':
		/* vowel+l, ll, rl, wl: go straight to VCe; other Cl: try +e only */
		if(ISVOWEL(ep[-2]))
			break;
		switch(ep[-2]) {
		case 'l':
		case 'r':
		case 'w':
			break;
		default:
			return y_to_e(ep,d,a,lev,flag);
		}
		break;
	case 'c':
	case 'g':
		if(*ep == 'a')	/* prevent -able for -eable */
			return 0;
		/* fall through */
	case 's':
	case 'v':
	case 'z':
		/* doubled letter or vowel before it: go straight to VCe */
		if(ep[-2]==ep[-1])
			break;
		if(ISVOWEL(ep[-2]))
			break;
		/* fall through */
	case 'u':
		if(h = y_to_e(ep,d,a,lev,flag))
			return h;
		/* only a stem ending in "ng" goes on to VCe after that fails */
		if(!(ep[-2]=='n' && ep[-1]=='g'))
			return 0;
	}
	return VCe(ep,d,a,lev,flag);
}
896
897 /*
898 * possible consonant-vowel-consonant-e ending
899 */
900 Bits
VCe(char * ep,char * d,char * a,int lev,int flag)901 VCe(char* ep, char* d, char* a, int lev, int flag)
902 {
903 int c;
904 Bits h;
905
906 c = ep[-1];
907 if(c=='e')
908 return 0;
909 if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
910 c = *ep;
911 *ep++ = 'e';
912 h = trypref(ep,d,lev,flag);
913 if(!h)
914 h = trysuff(ep,lev,flag);
915 if(h)
916 return h;
917 ep--;
918 *ep = c;
919 }
920 return cstrip(ep,d,a,lev,flag);
921 }
922
923 Ptab*
lookuppref(uchar ** wp,char * ep)924 lookuppref(uchar** wp, char* ep)
925 {
926 Ptab *sp;
927 uchar *bp,*cp;
928 unsigned int initchar = Tolower(**wp);
929
930 if(!ISALPHA(initchar))
931 return 0;
932 for(sp=preftab[initchar-'a'];sp->s;sp++) {
933 bp = *wp;
934 for(cp= (uchar*)sp->s;*cp; )
935 if(*bp++!=*cp++)
936 goto next;
937 for(cp=bp;cp<(uchar*)ep;cp++)
938 if(ISVOWEL(*cp)) {
939 *wp = bp;
940 return sp;
941 }
942 next:;
943 }
944 return 0;
945 }
946
947 /* while word is not in dictionary try stripping
948 * prefixes. Fail if no more prefixes.
949 */
950 Bits
trypref(char * ep,char * a,int lev,int flag)951 trypref(char* ep, char* a, int lev, int flag)
952 {
953 Ptab *tp;
954 char *bp, *cp;
955 char *pp;
956 Bits h;
957 char space[20];
958
959 if(lev<DSIZ) {
960 deriv[lev].mesg = a;
961 deriv[lev].type = *a=='.'? NONE: SUFF;
962 }
963 if(h = tryword(word,ep,lev,flag)) {
964 if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
965 return h;
966 h = 0;
967 }
968 bp = word;
969 pp = space;
970 if(lev<DSIZ) {
971 deriv[lev+1].mesg = pp;
972 deriv[lev+1].type = 0;
973 }
974 while(tp=lookuppref((uchar**)&bp,ep)) {
975 *pp++ = '+';
976 cp = tp->s;
977 while(pp<space+sizeof(space) && (*pp = *cp++))
978 pp++;
979 deriv[lev+1].type += PREF;
980 h = tryword(bp,ep,lev+1,flag);
981 if(Set(h,NOPREF) ||
982 ((tp->flag&IN) && inun(bp-2,h)==0)) {
983 h = 0;
984 break;
985 }
986 if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
987 break;
988 h = 0;
989 }
990 if(lev < DSIZ) {
991 deriv[lev+1] = emptyderiv;
992 deriv[lev+2] = emptyderiv;
993 }
994 return h;
995 }
996
997 Bits
tryword(char * bp,char * ep,int lev,int flag)998 tryword(char* bp, char* ep, int lev, int flag)
999 {
1000 int j;
1001 Bits h = 0;
1002 char duple[3];
1003
1004 if(ep-bp <= 1)
1005 return h;
1006 if(flag&MONO) {
1007 if(lev<DSIZ) {
1008 deriv[++lev].mesg = duple;
1009 deriv[lev].type = SUFF;
1010 }
1011 duple[0] = '+';
1012 duple[1] = *ep;
1013 duple[2] = 0;
1014 }
1015 h = dict(bp, ep);
1016 if(vflag==0 || h==0)
1017 return h;
1018 /*
1019 * when derivations are wanted, collect them
1020 * for printing
1021 */
1022 j = lev;
1023 prefcount = suffcount = 0;
1024 do {
1025 if(j<DSIZ && deriv[j].type) {
1026 strcat(affix, deriv[j].mesg);
1027 if(deriv[j].type == SUFF)
1028 suffcount++;
1029 else if(deriv[j].type != NONE)
1030 prefcount = deriv[j].type/PREF;
1031 }
1032 } while(--j > 0);
1033 return h;
1034 }
1035
1036 int
inun(char * bp,Bits h)1037 inun(char* bp, Bits h)
1038 {
1039 if(*bp == 'u')
1040 return Set(h, IN) == 0;
1041 /* *bp == 'i' */
1042 if(Set(h, IN) == 0)
1043 return 0;
1044 switch(bp[2]) {
1045 case 'r':
1046 return bp[1] == 'r';
1047 case 'm':
1048 case 'p':
1049 return bp[1] == 'm';
1050 }
1051 return bp[1] == 'n';
1052 }
1053
1054 char*
skipv(char * s)1055 skipv(char *s)
1056 {
1057 if(s >= word && ISVOWEL(*s))
1058 s--;
1059 while(s >= word && !ISVOWEL(*s))
1060 s--;
1061 return s;
1062 }
1063
1064 /*
1065 * crummy way to Britishise
1066 */
1067 void
ise(void)1068 ise(void)
1069 {
1070 Suftab *p;
1071 int i;
1072
1073 for(i=0; i<26; i++)
1074 for(p = suftab[i]; p->suf; p++) {
1075 p->suf = ztos(p->suf);
1076 p->d1 = ztos(p->d1);
1077 p->a1 = ztos(p->a1);
1078 }
1079 }
1080
/*
 * Return as with every 'z' replaced by 's'.  A string without a 'z'
 * is returned as is; otherwise the result is a fresh strdup() copy
 * and the argument itself is left unmodified.
 */
char*
ztos(char *as)
{
	char *copy, *p;

	if(strchr(as, 'z') == 0)
		return as;
	copy = strdup(as);
	for(p = strchr(copy, 'z'); p != 0; p = strchr(p+1, 'z'))
		*p = 's';
	return copy;
}
1098
/*
 * Look the string bp..ep up in the in-memory dictionary.
 * Returns the Bits of the entry, or 0 when the word is absent.
 * Strings of one character or less are reported as NOUN without any
 * search.  With -x every lookup is traced on fd 2.
 *
 * The first two characters (7 bits each) select a slice of space[]
 * through spacep[]; the slice is then binary-searched.  Entries start
 * with a byte that has 0x80 set, followed by a second code byte and
 * the remainder of the word.
 */
Bits
dict(char* bp, char* ep)
{
	char *cp, *cp1, *w, *wp, *we;
	int n, f;

	w = bp;
	we = ep;
	n = ep-bp;
	if(n <= 1)
		return NOUN;

	/* from here on bp/ep delimit the dictionary slice, not the word */
	f = w[0] & 0x7f;
	f *= 128;
	f += w[1] & 0x7f;
	bp = spacep[f];
	ep = spacep[f+1];

loop:
	if(bp >= ep) {
		if(xflag)
			fprint(2, "=%.*s\n", utfnlen(w, n), w);
		return 0;
	}
	/*
	 * find the beginning of some word in the middle
	 */
	cp = bp + (ep-bp)/2;

	/* back up to a byte with 0x80 set, then over any preceding ones */
	while(cp > bp && !(*cp & 0x80))
		cp--;
	while(cp > bp && (cp[-1] & 0x80))
		cp--;

	wp = w + 2;		/* skip two letters */
	cp1 = cp + 2;	/* skip affix code */
	/* compare; f ends up <0, >0 according to which half to search next */
	for(;;) {
		if(wp >= we) {
			if(*cp1 & 0x80)
				goto found;
			else
				f = 1;
			break;
		}
		if(*cp1 & 0x80) {
			f = -1;
			break;
		}
		f = *cp1++ - *wp++;
		if(f != 0)
			break;
	}

	if(f < 0) {
		/* continue in the upper half, from the start of the next entry */
		while(!(*cp1 & 0x80))
			cp1++;
		bp = cp1;
		goto loop;
	}
	ep = cp;
	goto loop;

found:
	/* low 3 bits of the first byte and the whole second byte index encode[] */
	f = ((cp[0] & 0x7) << 8) |
		(cp[1] & 0xff);
	if(xflag) {
		fprint(2, "=%.*s ", utfnlen(w, n), w);
		typeprint(encode[f]);
	}
	return encode[f];
}
1170
1171 void
typeprint(Bits h)1172 typeprint(Bits h)
1173 {
1174
1175 pcomma("");
1176 if(h & NOUN)
1177 pcomma("n");
1178 if(h & PROP_COLLECT)
1179 pcomma("pc");
1180 if(h & VERB) {
1181 if((h & VERB) == VERB)
1182 pcomma("v");
1183 else
1184 if((h & VERB) == V_IRREG)
1185 pcomma("vi");
1186 else
1187 if(h & ED)
1188 pcomma("ed");
1189 }
1190 if(h & ADJ)
1191 pcomma("a");
1192 if(h & COMP) {
1193 if((h & COMP) == ACTOR)
1194 pcomma("er");
1195 else
1196 pcomma("comp");
1197 }
1198 if(h & DONT_TOUCH)
1199 pcomma("d");
1200 if(h & N_AFFIX)
1201 pcomma("na");
1202 if(h & ADV)
1203 pcomma("adv");
1204 if(h & ION)
1205 pcomma("ion");
1206 if(h & V_AFFIX)
1207 pcomma("va");
1208 if(h & MAN)
1209 pcomma("man");
1210 if(h & NOPREF)
1211 pcomma("nopref");
1212 if(h & MONO)
1213 pcomma("ms");
1214 if(h & IN)
1215 pcomma("in");
1216 if(h & _Y)
1217 pcomma("y");
1218 if(h & STOP)
1219 pcomma("s");
1220 fprint(2, "\n");
1221 }
1222
/*
 * Print one item of a comma-separated list on fd 2.  An empty string
 * prints nothing and starts a new list; the first item after that is
 * printed bare and each later one with a leading comma.
 */
void
pcomma(char *s)
{
	static int started;

	if(*s == 0) {
		started = 0;
		return;
	}
	if(started) {
		fprint(2, ",%s", s);
		return;
	}
	fprint(2, "%s", s);
	started = 1;
}
1238
1239 /*
1240 * is the word on of the following
1241 * 12th teen
1242 * 21st end in 1
1243 * 23rd end in 3
1244 * 77th default
1245 * called knowing word[0] is a digit
1246 */
1247 int
ordinal(void)1248 ordinal(void)
1249 {
1250 char *cp = word;
1251 static char sp[4];
1252
1253 while(ISDIGIT(*cp))
1254 cp++;
1255 strncpy(sp,cp,3);
1256 if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
1257 sp[0] = Tolower(cp[0]);
1258 sp[1] = Tolower(cp[1]);
1259 }
1260 return 0 == strncmp(sp,
1261 cp[-2]=='1'? "th": /* out of bounds if 1 digit */
1262 *--cp=='1'? "st": /* harmless */
1263 *cp=='2'? "nd":
1264 *cp=='3'? "rd":
1265 "th", 3);
1266 }
1267
1268 /*
1269 * read in the dictionary.
1270 * format is
1271 * {
1272 * short nencode;
1273 * long encode[nencode];
1274 * char space[*];
1275 * };
1276 *
1277 * the encodings are a table all different
1278 * affixes.
1279 * the dictionary proper has 2 bytes
1280 * that demark and then the rest of the
1281 * word. the 2 bytes have the following
1282 * 0x80 0x00 flag
1283 * 0x78 0x00 count of prefix bytes
1284 * common with prev word
1285 * 0x07 0xff affix code
1286 *
1287 * all ints are big endians in the file.
1288 */
1289 void
readdict(char * file)1290 readdict(char *file)
1291 {
1292 char *s, *is, *lasts, *ls;
1293 int c, i, sp, p;
1294 int f;
1295 long l;
1296
1297 lasts = 0;
1298 f = open(file, 0);
1299 if(f == -1) {
1300 fprint(2, "cannot open %s\n", file);
1301 exits("open");
1302 }
1303 if(read(f, space, 2) != 2)
1304 goto bad;
1305 nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
1306 if(read(f, space, 4*nencode) != 4*nencode)
1307 goto bad;
1308 s = space;
1309 for(i=0; i<nencode; i++) {
1310 l = (long)(s[0] & 0xff) << 24;
1311 l |= (s[1] & 0xff) << 16;
1312 l |= (s[2] & 0xff) << 8;
1313 l |= s[3] & 0xff;
1314 encode[i] = (Bits)l;
1315 s += 4;
1316 }
1317 l = read(f, space, sizeof(space));
1318 if(l == sizeof(space))
1319 goto noroom;
1320 is = space + (sizeof(space) - l);
1321 memmove(is, space, l);
1322
1323 s = space;
1324 c = *is++ & 0xff;
1325 sp = -1;
1326 i = 0;
1327
1328 loop:
1329 if(s > is)
1330 goto noroom;
1331 if(c < 0) {
1332 close(f);
1333 while(sp < 128*128)
1334 spacep[++sp] = s;
1335 *s = 0x80; /* fence */
1336 return;
1337 }
1338 p = (c>>3) & 0xf;
1339 *s++ = c;
1340 *s++ = *is++ & 0xff;
1341 if(p <= 0)
1342 i = (*is++ & 0xff)*128;
1343 if(p <= 1) {
1344 if(!(*is & 0x80))
1345 i = i/128*128 + (*is++ & 0xff);
1346 if(i <= sp) {
1347 fprint(2, "the dict isnt sorted or \n");
1348 fprint(2, "memmove didn't work\n");
1349 goto bad;
1350 }
1351 while(sp < i)
1352 spacep[++sp] = s-2;
1353 }
1354 ls = lasts;
1355 lasts = s;
1356 for(p-=2; p>0; p--)
1357 *s++ = *ls++;
1358 for(;;) {
1359 if(is >= space+sizeof(space)) {
1360 c = -1;
1361 break;
1362 }
1363 c = *is++ & 0xff;
1364 if(c & 0x80)
1365 break;
1366 *s++ = c;
1367 }
1368 *s = 0;
1369 goto loop;
1370
1371 bad:
1372 fprint(2, "trouble reading %s\n", file);
1373 exits("read");
1374 noroom:
1375 fprint(2, "not enough space for dictionary\n");
1376 exits("space");
1377 }
1378