1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include "code.h" 6 7 /* fig leaves for possibly signed char quantities */ 8 #define ISUPPER(c) isupper((c)&0xff) 9 #define ISLOWER(c) islower((c)&0xff) 10 #define ISALPHA(c) isalpha((c)&0xff) 11 #define ISDIGIT(c) isdigit((c)&0xff) 12 #define ISVOWEL(c) voweltab[(c)&0xff] 13 #define Tolower(c) (ISUPPER(c)? (c)-'A'+'a': (c)) 14 #define pair(a,b) (((a)<<8) | (b)) 15 #define DLEV 2 16 #define DSIZ 40 17 18 typedef long Bits; 19 #define Set(h, f) ((long)(h) & (f)) 20 21 Bits nop(char*, char*, char*, int, int); 22 Bits strip(char*, char*, char*, int, int); 23 Bits ize(char*, char*, char*, int, int); 24 Bits i_to_y(char*, char*, char*, int, int); 25 Bits ily(char*, char*, char*, int, int); 26 Bits subst(char*, char*, char*, int, int); 27 Bits CCe(char*, char*, char*, int, int); 28 Bits tion(char*, char*, char*, int, int); 29 Bits an(char*, char*, char*, int, int); 30 Bits s(char*, char*, char*, int, int); 31 Bits es(char*, char*, char*, int, int); 32 Bits bility(char*, char*, char*, int, int); 33 Bits y_to_e(char*, char*, char*, int, int); 34 Bits VCe(char*, char*, char*, int, int); 35 36 Bits trypref(char*, char*, int, int); 37 Bits tryword(char*, char*, int, int); 38 Bits trysuff(char*, int, int); 39 Bits dict(char*, char*); 40 void typeprint(Bits); 41 void pcomma(char*); 42 43 void ise(void); 44 int ordinal(void); 45 char* skipv(char*); 46 int inun(char*, Bits); 47 char* ztos(char*); 48 void readdict(char*); 49 50 typedef struct Ptab Ptab; 51 struct Ptab 52 { 53 char* s; 54 int flag; 55 }; 56 57 typedef struct Suftab Suftab; 58 struct Suftab 59 { 60 char *suf; 61 Bits (*p1)(char*, char*, char*, int, int); 62 int n1; 63 char *d1; 64 char *a1; 65 int flag; 66 int affixable; 67 Bits (*p2)(char*, char*, char*, int, int); 68 int n2; 69 char *d2; 70 char *a2; 71 }; 72 73 Suftab staba[] = { 74 {"aibohp",subst,1,"-e+ia","",NOUN, NOUN}, 75 0 76 }; 77 78 Suftab stabc[] = 79 { 80 {"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN}, 81 {"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN}, 82 {"citi",ize,1,"-e+ic","",N_AFFIX, ADJ }, 83 {"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN }, 84 {"cipocs",ize,1,"-e+ic","",NOUN, ADJ }, 85 {"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ }, 86 {"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ }, 87 {"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ }, 88 {"cibohp",subst,1,"-e+ic","",NOUN, ADJ }, 89 0 90 }; 91 Suftab stabd[] = 92 { 93 {"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"}, 94 {"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN}, 95 0 96 }; 97 Suftab stabe[] = 98 { 99 /* 100 * V_affix for comment ->commence->commentment?? 101 */ 102 {"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX}, 103 {"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX}, 104 {"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ}, 105 {"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ}, 106 {"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ}, 107 {"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP}, 108 {"ekil",strip,4,"","+like",N_AFFIX ,ADJ}, 109 0 110 }; 111 Suftab stabg[] = 112 { 113 {"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN}, 114 {"gnikam",strip,6,"","+making",NOUN,NOUN}, 115 {"gnipeek",strip,7,"","+keeping",NOUN,NOUN}, 116 {"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN}, 117 0 118 }; 119 Suftab stabl[] = 120 { 121 {"ladio",strip,2,"","+al",NOUN |ADJ,ADJ}, 122 {"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX}, 123 {"latnem",strip,2,"","+al",N_AFFIX,ADJ}, 124 {"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN}, 125 {"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN}, 126 0 127 }; 128 Suftab stabm[] = 129 { 130 /* congregational + ism */ 131 {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN}, 132 {"margo",subst,-1,"-ph+m","",NOUN,NOUN}, 133 0 134 }; 135 Suftab stabn[] = 136 { 137 {"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX}, 138 {"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX}, 139 {"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR}, 140 {"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX}, 141 {"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX}, 142 {"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB}, 143 {"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX}, 144 {"nemow",strip,5,"","+women",MAN,PROP_COLLECT}, 145 {"nem",strip,3,"","+man",MAN,PROP_COLLECT}, 146 {"nosrep",strip,6,"","+person",MAN,PROP_COLLECT}, 147 0 148 }; 149 Suftab stabp[] = 150 { 151 {"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX}, 152 0 153 }; 154 Suftab stabr[] = 155 { 156 {"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"}, 157 {"reyhparg",nop,0,"","",0,NOUN}, 158 {"reyl",nop,0,"","",0,NOUN}, 159 {"rekam",strip,5,"","+maker",NOUN,NOUN}, 160 {"repeek",strip,6,"","+keeper",NOUN,NOUN}, 161 {"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"}, 162 {"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y}, 163 {"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX}, 164 {"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX}, 165 0 166 }; 167 Suftab stabs[] = 168 { 169 {"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX}, 170 {"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ }, 171 {"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"}, 172 {"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH }, 173 {"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH }, 174 0 175 }; 176 Suftab stabt[] = 177 { 178 {"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB}, 179 {"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" }, 180 {"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX}, 181 {"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP}, 182 0 183 }; 184 Suftab staby[] = 185 { 186 {"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX}, 187 {"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX}, 188 {"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX}, 189 {"ytisuo",nop,0,"","",NOUN}, 190 {"ytilb",nop,0,"","",0,NOUN}, 191 {"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX }, 192 {"ylb",y_to_e,1,"-e+y","",ADJ,ADV}, 193 {"ylc",nop,0,"","",0}, 194 {"ylelb",nop,0,"","",0}, 195 {"ylelp",nop,0,"","",0}, 196 {"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP}, 197 {"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX}, 198 {"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP}, 199 0 200 }; 201 Suftab stabz[] = 202 { 203 0 204 }; 205 Suftab* suftab[] = 206 { 207 staba, 208 stabz, 209 stabc, 210 stabd, 211 stabe, 212 stabz, 213 stabg, 214 stabz, 215 stabz, 216 stabz, 217 stabz, 218 stabl, 219 stabm, 220 stabn, 221 stabz, 222 stabp, 223 stabz, 224 stabr, 225 stabs, 226 stabt, 227 stabz, 228 stabz, 229 stabz, 230 stabz, 231 staby, 232 stabz, 233 }; 234 235 Ptab ptaba[] = 236 { 237 "anti", 0, 238 "auto", 0, 239 0 240 }; 241 Ptab ptabb[] = 242 { 243 "bio", 0, 244 0 245 }; 246 Ptab ptabc[] = 247 { 248 "counter", 0, 249 0 250 }; 251 Ptab ptabd[] = 252 { 253 "dis", 0, 254 0 255 }; 256 Ptab ptabe[] = 257 { 258 "electro", 0, 259 0 260 }; 261 Ptab ptabf[] = 262 { 263 "femto", 0, 264 0 265 }; 266 Ptab ptabg[] = 267 { 268 "geo", 0, 269 "giga", 0, 270 0 271 }; 272 Ptab ptabh[] = 273 { 274 "hyper", 0, 275 0 276 }; 277 Ptab ptabi[] = 278 { 279 "immuno", 0, 280 "im", IN, 281 "intra", 0, 282 "inter", 0, 283 "in", IN, 284 "ir", IN, 285 "iso", 0, 286 0 287 }; 288 Ptab ptabj[] = 289 { 290 0 291 }; 292 Ptab ptabk[] = 293 { 294 "kilo", 0, 295 0 296 }; 297 Ptab ptabl[] = 298 { 299 0 300 }; 301 Ptab ptabm[] = 302 { 303 "magneto", 0, 304 "mega", 0, 305 "meta", 0, 306 "micro", 0, 307 "mid", 0, 308 "milli", 0, 309 "mini", 0, 310 "mis", 0, 311 "mono", 0, 312 "multi", 0, 313 0 314 }; 315 Ptab ptabn[] = 316 { 317 "nano", 0, 318 "neuro", 0, 319 "non", 0, 320 0 321 }; 322 Ptab ptabo[] = 323 { 324 "out", 0, 325 "over", 0, 326 0 327 }; 328 Ptab ptabp[] = 329 { 330 "para", 0, 331 "photo", 0, 332 "pico", 0, 333 "poly", 0, 334 "pre", 0, 335 "pseudo", 0, 336 "psycho", 0, 337 0 338 }; 339 Ptab ptabq[] = 340 { 341 "quasi", 0, 342 0 343 }; 344 Ptab ptabr[] = 345 { 346 "radio", 0, 347 "re", 0, 348 0 349 }; 350 Ptab ptabs[] = 351 { 352 "semi", 0, 353 "stereo", 0, 354 "sub", 0, 355 "super", 0, 356 0 357 }; 358 Ptab ptabt[] = 359 { 360 "tele", 0, 361 "tera", 0, 362 "thermo", 0, 363 0 364 }; 365 Ptab ptabu[] = 366 { 367 "ultra", 0, 368 "under", 0, /*must precede un*/ 369 "un", IN, 370 0 371 }; 372 Ptab ptabv[] = 373 { 374 0 375 }; 376 Ptab ptabw[] = 377 { 378 0 379 }; 380 Ptab ptabx[] = 381 { 382 0 383 }; 384 Ptab ptaby[] = 385 { 386 0 387 }; 388 Ptab ptabz[] = 389 { 390 0 391 }; 392 393 Ptab* preftab[] = 394 { 395 ptaba, 396 ptabb, 397 ptabc, 398 ptabd, 399 ptabe, 400 ptabf, 401 ptabg, 402 ptabh, 403 ptabi, 404 ptabj, 405 ptabk, 406 ptabl, 407 ptabm, 408 ptabn, 409 ptabo, 410 ptabp, 411 ptabq, 412 ptabr, 413 ptabs, 414 ptabt, 415 ptabu, 416 ptabv, 417 ptabw, 418 ptabx, 419 ptaby, 420 ptabz, 421 }; 422 423 typedef struct { 424 char *mesg; 425 enum { NONE, SUFF, PREF} type; 426 } Deriv; 427 428 int aflag; 429 int cflag; 430 int fflag; 431 int vflag; 432 int xflag; 433 int nflag; 434 char word[500]; 435 char* original; 436 Deriv emptyderiv; 437 Deriv deriv[DSIZ+3]; 438 char affix[DSIZ*10]; /* 10 is longest affix message */ 439 int prefcount; 440 int suffcount; 441 char* acmeid; 442 char space[300000]; /* must be as large as "words"+"space" in pcode run */ 443 Bits encode[2048]; /* must be as long as "codes" in pcode run */ 444 int nencode; 445 char voweltab[256]; 446 char* spacep[128*128+1]; /* pointer to words starting with 'xx' */ 447 Biobuf bin; 448 Biobuf bout; 449 450 char* codefile = "/sys/lib/amspell"; 451 char* brfile = "/sys/lib/brspell"; 452 char* Usage = "usage"; 453 454 void 455 main(int argc, char *argv[]) 456 { 457 char *ep, *cp; 458 char *dp; 459 int j, i, c; 460 int low; 461 Bits h; 462 463 Binit(&bin, 0, OREAD); 464 Binit(&bout, 1, OWRITE); 465 for(i=0; c = "aeiouyAEIOUY"[i]; i++) 466 voweltab[c] = 1; 467 while(argc > 1) { 468 if(argv[1][0] != '-') 469 break; 470 for(i=1; c = argv[1][i]; i++) 471 switch(c) { 472 default: 473 fprint(2, "usage: spell [-bcCvx] [-f file]\n"); 474 exits(Usage); 475 476 case 'a': 477 aflag++; 478 continue; 479 480 case 'b': 481 ise(); 482 if(!fflag) 483 codefile = brfile; 484 continue; 485 486 case 'C': /* for "correct" */ 487 vflag++; 488 case 'c': /* for ocr */ 489 cflag++; 490 continue; 491 492 case 'v': 493 vflag++; 494 continue; 495 496 case 'x': 497 xflag++; 498 continue; 499 500 case 'f': 501 if(argc <= 2) { 502 fprint(2, "spell: -f requires another argument\n"); 503 exits(Usage); 504 } 505 argv++; 506 argc--; 507 codefile = argv[1]; 508 fflag++; 509 goto brk; 510 } 511 brk: 512 argv++; 513 argc--; 514 } 515 readdict(codefile); 516 if(argc > 1) { 517 fprint(2, "usage: spell [-bcCvx] [-f file]\n"); 518 exits(Usage); 519 } 520 if(aflag) 521 cflag = vflag = 0; 522 523 for(;;) { 524 affix[0] = 0; 525 original = Brdline(&bin, '\n'); 526 if(original == 0) 527 exits(0); 528 original[Blinelen(&bin)-1] = 0; 529 low = 0; 530 531 if(aflag) { 532 acmeid = original; 533 while(*original != ':') 534 if(*original++ == 0) 535 exits(0); 536 while(*++original != ':') 537 if(*original == 0) 538 exits(0); 539 *original++ = 0; 540 } 541 for(ep=word,dp=original; j = *dp; ep++,dp++) { 542 if(ISLOWER(j)) 543 low++; 544 if(ep >= word+sizeof(word)-1) 545 break; 546 *ep = j; 547 } 548 *ep = 0; 549 550 if(ISDIGIT(word[0]) && ordinal()) 551 continue; 552 553 h = 0; 554 if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))) 555 for(cp=original+1,dp=word+1; dp<ep; dp++,cp++) 556 *dp = Tolower(*cp); 557 if(!h) 558 for(;;) { /* at most twice */ 559 if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)) 560 break; 561 if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH)) 562 break; 563 if(!ISUPPER(word[0])) 564 break; 565 cp = original; 566 dp = word; 567 while(*dp = *cp++) { 568 if(!low) 569 *dp = Tolower(*dp); 570 dp++; 571 } 572 word[0] = Tolower(word[0]); 573 } 574 575 if(cflag) { 576 if(!h || Set(h,STOP)) 577 print("-"); 578 else if(!vflag) 579 print("+"); 580 else 581 print("%c",'0' + (suffcount>0) + 582 (prefcount>4? 8: 2*prefcount)); 583 } else if(!h || Set(h,STOP)) { 584 if(aflag) 585 Bprint(&bout, "%s:%s\n", acmeid, original); 586 else 587 Bprint(&bout, "%s\n", original); 588 } else if(affix[0] != 0 && affix[0] != '.') 589 print("%s\t%s\n", affix, original); 590 } 591 exits(0); 592 } 593 594 /* strip exactly one suffix and do 595 * indicated routine(s), which may recursively 596 * strip suffixes 597 */ 598 Bits 599 trysuff(char* ep, int lev, int flag) 600 { 601 Suftab *t; 602 char *cp, *sp; 603 Bits h = 0; 604 int initchar = ep[-1]; 605 606 flag &= ~MONO; 607 lev += DLEV; 608 if(lev < DSIZ) { 609 deriv[lev] = emptyderiv; 610 deriv[lev-1] = emptyderiv; 611 } 612 if(!ISLOWER(initchar)) 613 return h; 614 for(t=suftab[initchar-'a']; sp=t->suf; t++) { 615 cp = ep; 616 while(*sp) 617 if(*--cp != *sp++) 618 goto next; 619 for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);) 620 ; 621 if(sp < word) 622 continue; 623 if(!(t->affixable & flag)) 624 return 0; 625 h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP); 626 if(!h && t->p2!=0) { 627 if(lev < DSIZ) { 628 deriv[lev] = emptyderiv; 629 deriv[lev+1] = emptyderiv; 630 } 631 h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP); 632 } 633 break; 634 next:; 635 } 636 return h; 637 } 638 639 Bits 640 nop(char* ep, char* d, char* a, int lev, int flag) 641 { 642 USED(ep, d, a, lev, flag); 643 return 0; 644 } 645 646 Bits 647 cstrip(char* ep, char* d, char* a, int lev, int flag) 648 { 649 int temp = ep[0]; 650 651 if(ISVOWEL(temp) && ISVOWEL(ep[-1])) { 652 switch(pair(ep[-1],ep[0])) { 653 case pair('a', 'a'): 654 case pair('a', 'e'): 655 case pair('a', 'i'): 656 case pair('e', 'a'): 657 case pair('e', 'e'): 658 case pair('e', 'i'): 659 case pair('i', 'i'): 660 case pair('o', 'a'): 661 return 0; 662 } 663 } else 664 if(temp==ep[-1]&&temp==ep[-2]) 665 return 0; 666 return strip(ep,d,a,lev,flag); 667 } 668 669 Bits 670 strip(char* ep, char* d, char* a, int lev, int flag) 671 { 672 Bits h = trypref(ep, a, lev, flag); 673 674 USED(d); 675 if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2])) 676 h = 0; 677 if(h) 678 return h; 679 if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) { 680 h = trypref(ep-1,a,lev,flag|MONO); 681 if(h) 682 return h; 683 } 684 return trysuff(ep,lev,flag); 685 } 686 687 Bits 688 s(char* ep, char* d, char* a, int lev, int flag) 689 { 690 if(lev > DLEV+1) 691 return 0; 692 if(*ep=='s') { 693 switch(ep[-1]) { 694 case 'y': 695 if(ISVOWEL(ep[-2])||ISUPPER(*word)) 696 break; /*says Kennedys*/ 697 case 'x': 698 case 'z': 699 case 's': 700 return 0; 701 case 'h': 702 switch(ep[-2]) { 703 case 'c': 704 case 's': 705 return 0; 706 } 707 } 708 } 709 return strip(ep,d,a,lev,flag); 710 } 711 712 Bits 713 an(char* ep, char* d, char* a, int lev, int flag) 714 { 715 USED(d); 716 if(!ISUPPER(*word)) /*must be proper name*/ 717 return 0; 718 return trypref(ep,a,lev,flag); 719 } 720 721 Bits 722 ize(char* ep, char* d, char* a, int lev, int flag) 723 { 724 int temp = ep[-1]; 725 Bits h; 726 727 USED(a); 728 ep[-1] = 'e'; 729 h = strip(ep,"",d,lev,flag); 730 ep[-1] = temp; 731 return h; 732 } 733 734 Bits 735 y_to_e(char* ep, char* d, char* a, int lev, int flag) 736 { 737 Bits h; 738 int temp; 739 740 USED(a); 741 switch(ep[-1]) { 742 case 'a': 743 case 'e': 744 case 'i': 745 return 0; 746 } 747 temp = *ep; 748 *ep++ = 'e'; 749 h = strip(ep,"",d,lev,flag); 750 ep[-1] = temp; 751 return h; 752 } 753 754 Bits 755 ily(char* ep, char* d, char* a, int lev, int flag) 756 { 757 int temp = ep[0]; 758 char *cp = ep; 759 760 if(temp==ep[-1]&&temp==ep[-2]) /* sillly */ 761 return 0; 762 if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */ 763 while(cp>word) 764 if(ISVOWEL(*--cp)) /* shyness */ 765 return 0; 766 if(ep[-1]=='i') 767 return i_to_y(ep,d,a,lev,flag); 768 return cstrip(ep,d,a,lev,flag); 769 } 770 771 Bits 772 bility(char* ep, char* d, char* a, int lev, int flag) 773 { 774 *ep++ = 'l'; 775 return y_to_e(ep,d,a,lev,flag); 776 } 777 778 Bits 779 i_to_y(char* ep, char* d, char* a, int lev, int flag) 780 { 781 Bits h; 782 int temp; 783 784 if(ISUPPER(*word)) 785 return 0; 786 if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) { 787 ep[-1] = 'y'; 788 a = d; 789 } 790 h = cstrip(ep,"",a,lev,flag); 791 ep[-1] = temp; 792 return h; 793 } 794 795 Bits 796 es(char* ep, char* d, char* a, int lev, int flag) 797 { 798 if(lev>DLEV) 799 return 0; 800 switch(ep[-1]) { 801 default: 802 return 0; 803 case 'i': 804 return i_to_y(ep,d,a,lev,flag); 805 case 'h': 806 switch(ep[-2]) { 807 default: 808 return 0; 809 case 'c': 810 case 's': 811 break; 812 } 813 case 's': 814 case 'z': 815 case 'x': 816 return strip(ep,d,a,lev,flag); 817 } 818 } 819 820 Bits 821 subst(char* ep, char* d, char* a, int lev, int flag) 822 { 823 char *u,*t; 824 Bits h; 825 826 USED(a); 827 if(skipv(skipv(ep-1)) < word) 828 return 0; 829 for(t=d; *t!='+'; t++) 830 continue; 831 for(u=ep; *--t!='-';) 832 *--u = *t; 833 h = strip(ep,"",d,lev,flag); 834 while(*++t != '+') 835 continue; 836 while(*++t) 837 *u++ = *t; 838 return h; 839 } 840 841 Bits 842 tion(char* ep, char* d, char* a, int lev, int flag) 843 { 844 switch(ep[-2]) { 845 default: 846 return trypref(ep,a,lev,flag); 847 case 'a': 848 case 'e': 849 case 'i': 850 case 'o': 851 case 'u': 852 return y_to_e(ep,d,a,lev,flag); 853 } 854 } 855 856 /* 857 * possible consonant-consonant-e ending 858 */ 859 Bits 860 CCe(char* ep, char* d, char* a, int lev, int flag) 861 { 862 Bits h; 863 864 switch(ep[-1]) { 865 case 'l': 866 if(ISVOWEL(ep[-2])) 867 break; 868 switch(ep[-2]) { 869 case 'l': 870 case 'r': 871 case 'w': 872 break; 873 default: 874 return y_to_e(ep,d,a,lev,flag); 875 } 876 break; 877 case 'c': 878 case 'g': 879 if(*ep == 'a') /* prevent -able for -eable */ 880 return 0; 881 case 's': 882 case 'v': 883 case 'z': 884 if(ep[-2]==ep[-1]) 885 break; 886 if(ISVOWEL(ep[-2])) 887 break; 888 case 'u': 889 if(h = y_to_e(ep,d,a,lev,flag)) 890 return h; 891 if(!(ep[-2]=='n' && ep[-1]=='g')) 892 return 0; 893 } 894 return VCe(ep,d,a,lev,flag); 895 } 896 897 /* 898 * possible consonant-vowel-consonant-e ending 899 */ 900 Bits 901 VCe(char* ep, char* d, char* a, int lev, int flag) 902 { 903 int c; 904 Bits h; 905 906 c = ep[-1]; 907 if(c=='e') 908 return 0; 909 if(!ISVOWEL(c) && ISVOWEL(ep[-2])) { 910 c = *ep; 911 *ep++ = 'e'; 912 h = trypref(ep,d,lev,flag); 913 if(!h) 914 h = trysuff(ep,lev,flag); 915 if(h) 916 return h; 917 ep--; 918 *ep = c; 919 } 920 return cstrip(ep,d,a,lev,flag); 921 } 922 923 Ptab* 924 lookuppref(uchar** wp, char* ep) 925 { 926 Ptab *sp; 927 uchar *bp,*cp; 928 unsigned int initchar = Tolower(**wp); 929 930 if(!ISALPHA(initchar)) 931 return 0; 932 for(sp=preftab[initchar-'a'];sp->s;sp++) { 933 bp = *wp; 934 for(cp= (uchar*)sp->s;*cp; ) 935 if(*bp++!=*cp++) 936 goto next; 937 for(cp=bp;cp<(uchar*)ep;cp++) 938 if(ISVOWEL(*cp)) { 939 *wp = bp; 940 return sp; 941 } 942 next:; 943 } 944 return 0; 945 } 946 947 /* while word is not in dictionary try stripping 948 * prefixes. Fail if no more prefixes. 949 */ 950 Bits 951 trypref(char* ep, char* a, int lev, int flag) 952 { 953 Ptab *tp; 954 char *bp, *cp; 955 char *pp; 956 Bits h; 957 char space[20]; 958 959 if(lev<DSIZ) { 960 deriv[lev].mesg = a; 961 deriv[lev].type = *a=='.'? NONE: SUFF; 962 } 963 if(h = tryword(word,ep,lev,flag)) { 964 if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO)) 965 return h; 966 h = 0; 967 } 968 bp = word; 969 pp = space; 970 if(lev<DSIZ) { 971 deriv[lev+1].mesg = pp; 972 deriv[lev+1].type = 0; 973 } 974 while(tp=lookuppref((uchar**)&bp,ep)) { 975 *pp++ = '+'; 976 cp = tp->s; 977 while(pp<space+sizeof(space) && (*pp = *cp++)) 978 pp++; 979 deriv[lev+1].type += PREF; 980 h = tryword(bp,ep,lev+1,flag); 981 if(Set(h,NOPREF) || 982 ((tp->flag&IN) && inun(bp-2,h)==0)) { 983 h = 0; 984 break; 985 } 986 if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO)) 987 break; 988 h = 0; 989 } 990 if(lev < DSIZ) { 991 deriv[lev+1] = emptyderiv; 992 deriv[lev+2] = emptyderiv; 993 } 994 return h; 995 } 996 997 Bits 998 tryword(char* bp, char* ep, int lev, int flag) 999 { 1000 int j; 1001 Bits h = 0; 1002 char duple[3]; 1003 1004 if(ep-bp <= 1) 1005 return h; 1006 if(flag&MONO) { 1007 if(lev<DSIZ) { 1008 deriv[++lev].mesg = duple; 1009 deriv[lev].type = SUFF; 1010 } 1011 duple[0] = '+'; 1012 duple[1] = *ep; 1013 duple[2] = 0; 1014 } 1015 h = dict(bp, ep); 1016 if(vflag==0 || h==0) 1017 return h; 1018 /* 1019 * when derivations are wanted, collect them 1020 * for printing 1021 */ 1022 j = lev; 1023 prefcount = suffcount = 0; 1024 do { 1025 if(j<DSIZ && deriv[j].type) { 1026 strcat(affix, deriv[j].mesg); 1027 if(deriv[j].type == SUFF) 1028 suffcount++; 1029 else if(deriv[j].type != NONE) 1030 prefcount = deriv[j].type/PREF; 1031 } 1032 } while(--j > 0); 1033 return h; 1034 } 1035 1036 int 1037 inun(char* bp, Bits h) 1038 { 1039 if(*bp == 'u') 1040 return Set(h, IN) == 0; 1041 /* *bp == 'i' */ 1042 if(Set(h, IN) == 0) 1043 return 0; 1044 switch(bp[2]) { 1045 case 'r': 1046 return bp[1] == 'r'; 1047 case 'm': 1048 case 'p': 1049 return bp[1] == 'm'; 1050 } 1051 return bp[1] == 'n'; 1052 } 1053 1054 char* 1055 skipv(char *s) 1056 { 1057 if(s >= word && ISVOWEL(*s)) 1058 s--; 1059 while(s >= word && !ISVOWEL(*s)) 1060 s--; 1061 return s; 1062 } 1063 1064 /* 1065 * crummy way to Britishise 1066 */ 1067 void 1068 ise(void) 1069 { 1070 Suftab *p; 1071 int i; 1072 1073 for(i=0; i<26; i++) 1074 for(p = suftab[i]; p->suf; p++) { 1075 p->suf = ztos(p->suf); 1076 p->d1 = ztos(p->d1); 1077 p->a1 = ztos(p->a1); 1078 } 1079 } 1080 1081 char* 1082 ztos(char *as) 1083 { 1084 char *s, *ds; 1085 1086 for(s=as; *s; s++) 1087 if(*s == 'z') 1088 goto copy; 1089 return as; 1090 1091 copy: 1092 ds = strdup(as); 1093 for(s=ds; *s; s++) 1094 if(*s == 'z') 1095 *s = 's'; 1096 return ds; 1097 } 1098 1099 Bits 1100 dict(char* bp, char* ep) 1101 { 1102 char *cp, *cp1, *w, *wp, *we; 1103 int n, f; 1104 1105 w = bp; 1106 we = ep; 1107 n = ep-bp; 1108 if(n <= 1) 1109 return NOUN; 1110 1111 f = w[0] & 0x7f; 1112 f *= 128; 1113 f += w[1] & 0x7f; 1114 bp = spacep[f]; 1115 ep = spacep[f+1]; 1116 1117 loop: 1118 if(bp >= ep) { 1119 if(xflag) 1120 fprint(2, "=%.*s\n", n, w); 1121 return 0; 1122 } 1123 /* 1124 * find the beginning of some word in the middle 1125 */ 1126 cp = bp + (ep-bp)/2; 1127 1128 while(cp > bp && !(*cp & 0x80)) 1129 cp--; 1130 while(cp > bp && (cp[-1] & 0x80)) 1131 cp--; 1132 1133 wp = w + 2; /* skip two letters */ 1134 cp1 = cp + 2; /* skip affix code */ 1135 for(;;) { 1136 if(wp >= we) { 1137 if(*cp1 & 0x80) 1138 goto found; 1139 else 1140 f = 1; 1141 break; 1142 } 1143 if(*cp1 & 0x80) { 1144 f = -1; 1145 break; 1146 } 1147 f = *cp1++ - *wp++; 1148 if(f != 0) 1149 break; 1150 } 1151 1152 if(f < 0) { 1153 while(!(*cp1 & 0x80)) 1154 cp1++; 1155 bp = cp1; 1156 goto loop; 1157 } 1158 ep = cp; 1159 goto loop; 1160 1161 found: 1162 f = ((cp[0] & 0x7) << 8) | 1163 (cp[1] & 0xff); 1164 if(xflag) { 1165 fprint(2, "=%.*s ", n, w); 1166 typeprint(encode[f]); 1167 } 1168 return encode[f]; 1169 } 1170 1171 void 1172 typeprint(Bits h) 1173 { 1174 1175 pcomma(""); 1176 if(h & NOUN) 1177 pcomma("n"); 1178 if(h & PROP_COLLECT) 1179 pcomma("pc"); 1180 if(h & VERB) { 1181 if((h & VERB) == VERB) 1182 pcomma("v"); 1183 else 1184 if((h & VERB) == V_IRREG) 1185 pcomma("vi"); 1186 else 1187 if(h & ED) 1188 pcomma("ed"); 1189 } 1190 if(h & ADJ) 1191 pcomma("a"); 1192 if(h & COMP) { 1193 if((h & COMP) == ACTOR) 1194 pcomma("er"); 1195 else 1196 pcomma("comp"); 1197 } 1198 if(h & DONT_TOUCH) 1199 pcomma("d"); 1200 if(h & N_AFFIX) 1201 pcomma("na"); 1202 if(h & ADV) 1203 pcomma("adv"); 1204 if(h & ION) 1205 pcomma("ion"); 1206 if(h & V_AFFIX) 1207 pcomma("va"); 1208 if(h & MAN) 1209 pcomma("man"); 1210 if(h & NOPREF) 1211 pcomma("nopref"); 1212 if(h & MONO) 1213 pcomma("ms"); 1214 if(h & IN) 1215 pcomma("in"); 1216 if(h & _Y) 1217 pcomma("y"); 1218 if(h & STOP) 1219 pcomma("s"); 1220 fprint(2, "\n"); 1221 } 1222 1223 void 1224 pcomma(char *s) 1225 { 1226 static flag; 1227 1228 if(*s == 0) { 1229 flag = 0; 1230 return; 1231 } 1232 if(!flag) { 1233 fprint(2, "%s", s); 1234 flag = 1; 1235 } else 1236 fprint(2, ",%s", s); 1237 } 1238 1239 /* 1240 * is the word on of the following 1241 * 12th teen 1242 * 21st end in 1 1243 * 23rd end in 3 1244 * 77th default 1245 * called knowing word[0] is a digit 1246 */ 1247 int 1248 ordinal(void) 1249 { 1250 char *cp = word; 1251 static char sp[4]; 1252 1253 while(ISDIGIT(*cp)) 1254 cp++; 1255 strncpy(sp,cp,3); 1256 if(ISUPPER(cp[0]) && ISUPPER(cp[1])) { 1257 sp[0] = Tolower(cp[0]); 1258 sp[1] = Tolower(cp[1]); 1259 } 1260 return 0 == strncmp(sp, 1261 cp[-2]=='1'? "th": /* out of bounds if 1 digit */ 1262 *--cp=='1'? "st": /* harmless */ 1263 *cp=='2'? "nd": 1264 *cp=='3'? "rd": 1265 "th", 3); 1266 } 1267 1268 /* 1269 * read in the dictionary. 1270 * format is 1271 * { 1272 * short nencode; 1273 * long encode[nencode]; 1274 * char space[*]; 1275 * }; 1276 * 1277 * the encodings are a table all different 1278 * affixes. 1279 * the dictionary proper has 2 bytes 1280 * that demark and then the rest of the 1281 * word. the 2 bytes have the following 1282 * 0x80 0x00 flag 1283 * 0x78 0x00 count of prefix bytes 1284 * common with prev word 1285 * 0x07 0xff affix code 1286 * 1287 * all ints are big endians in the file. 1288 */ 1289 void 1290 readdict(char *file) 1291 { 1292 char *s, *is, *lasts, *ls; 1293 int c, i, sp, p; 1294 int f; 1295 long l; 1296 1297 lasts = 0; 1298 f = open(file, 0); 1299 if(f == -1) { 1300 fprint(2, "cannot open %s\n", file); 1301 exits("open"); 1302 } 1303 if(read(f, space, 2) != 2) 1304 goto bad; 1305 nencode = ((space[0]&0xff)<<8) | (space[1]&0xff); 1306 if(read(f, space, 4*nencode) != 4*nencode) 1307 goto bad; 1308 s = space; 1309 for(i=0; i<nencode; i++) { 1310 l = (long)(s[0] & 0xff) << 24; 1311 l |= (s[1] & 0xff) << 16; 1312 l |= (s[2] & 0xff) << 8; 1313 l |= s[3] & 0xff; 1314 encode[i] = (Bits)l; 1315 s += 4; 1316 } 1317 l = read(f, space, sizeof(space)); 1318 if(l == sizeof(space)) 1319 goto noroom; 1320 is = space + (sizeof(space) - l); 1321 memmove(is, space, l); 1322 1323 s = space; 1324 c = *is++ & 0xff; 1325 sp = -1; 1326 i = 0; 1327 1328 loop: 1329 if(s > is) 1330 goto noroom; 1331 if(c < 0) { 1332 close(f); 1333 while(sp < 128*128) 1334 spacep[++sp] = s; 1335 *s = 0x80; /* fence */ 1336 return; 1337 } 1338 p = (c>>3) & 0xf; 1339 *s++ = c; 1340 *s++ = *is++ & 0xff; 1341 if(p <= 0) 1342 i = (*is++ & 0xff)*128; 1343 if(p <= 1) { 1344 if(!(*is & 0x80)) 1345 i = i/128*128 + (*is++ & 0xff); 1346 if(i <= sp) { 1347 fprint(2, "the dict isnt sorted or \n"); 1348 fprint(2, "memmove didn't work\n"); 1349 goto bad; 1350 } 1351 while(sp < i) 1352 spacep[++sp] = s-2; 1353 } 1354 ls = lasts; 1355 lasts = s; 1356 for(p-=2; p>0; p--) 1357 *s++ = *ls++; 1358 for(;;) { 1359 if(is >= space+sizeof(space)) { 1360 c = -1; 1361 break; 1362 } 1363 c = *is++ & 0xff; 1364 if(c & 0x80) 1365 break; 1366 *s++ = c; 1367 } 1368 *s = 0; 1369 goto loop; 1370 1371 bad: 1372 fprint(2, "trouble reading %s\n", file); 1373 exits("read"); 1374 noroom: 1375 fprint(2, "not enough space for dictionary\n"); 1376 exits("space"); 1377 } 1378