1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 #pragma ident "%Z%%M% %I% %E% SMI"
31
32 #include <stdlib.h>
33 #include <unistd.h>
34 #include <limits.h>
35 #include <string.h>
36 #include <stdio.h>
37 #include <ctype.h>
38 #include <locale.h>
39 #include "hash.h"
40
/*
 * Tolower(c): lower-case c if it is an upper-case letter, otherwise yield
 * c unchanged.  The value handed to the <ctype.h> routines is converted to
 * unsigned char first, because passing a negative plain char to them is
 * undefined.  The argument is evaluated more than once, so it must not
 * have side effects.
 */
#define	Tolower(c)	\
	(isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))

/* Amount by which trysuff() advances the derivation level on each call. */
#define	DLEV		2
43
/*
 * Forward declarations.
 *
 * The first group are the suffix handlers stored in suftab[].  Apart from
 * nop() they share one shape: ep points into word[] at the place where the
 * suffix was cut, d and a are the two derivation notes taken from the
 * table entry, and lev is the current depth in deriv[].
 */
static int	ily(char *ep, char *d, char *a, int lev);
static int	s(char *ep, char *d, char *a, int lev);
static int	es(char *ep, char *d, char *a, int lev);
static int	subst(char *ep, char *d, char *a, int lev);
static int	nop(void);
static int	bility(char *ep, char *d, char *a, int lev);
static int	i_to_y(char *ep, char *d, char *a, int lev);
static int	CCe(char *ep, char *d, char *a, int lev);
static int	y_to_e(char *ep, char *d, char *a, int lev);
static int	strip(char *ep, char *d, char *a, int lev);
static int	ize(char *ep, char *d, char *a, int lev);
static int	tion(char *ep, char *d, char *a, int lev);
static int	an(char *ep, char *d, char *a, int lev);

/* Not defined in this file; main() calls it with the hashed-list name. */
int		prime(char *);

/* Helpers used by main() and by the handlers above. */
static void	ise(void);
static int	tryword(char *bp, char *ep, int lev);
static int	trypref(char *ep, char *a, int lev);
static int	trysuff(char *ep, int lev);
static int	vowel(int c);
static int	dict(char *bp, char *ep);
static int	monosyl(char *bp, char *ep);
static int	VCe(char *ep, char *d, char *a, int lev);
static char	*skipv(char *s);
static void	ztos(char *s);
71
72 static struct suftab {
73 char *suf;
74 int (*p1)();
75 int n1;
76 char *d1;
77 char *a1;
78 int (*p2)();
79 int n2;
80 char *d2;
81 char *a2;
82 } suftab[] = {
83 {"ssen", ily, 4, "-y+iness", "+ness" },
84 {"ssel", ily, 4, "-y+i+less", "+less" },
85 {"se", s, 1, "", "+s", es, 2, "-y+ies", "+es" },
86 {"s'", s, 2, "", "+'s"},
87 {"s", s, 1, "", "+s"},
88 {"ecn", subst, 1, "-t+ce", ""},
89 {"ycn", subst, 1, "-t+cy", ""},
90 {"ytilb", nop, 0, "", ""},
91 {"ytilib", bility, 5, "-le+ility", ""},
92 {"elbaif", i_to_y, 4, "-y+iable", ""},
93 {"elba", CCe, 4, "-e+able", "+able"},
94 {"yti", CCe, 3, "-e+ity", "+ity"},
95 {"ylb", y_to_e, 1, "-e+y", ""},
96 {"yl", ily, 2, "-y+ily", "+ly"},
97 {"laci", strip, 2, "", "+al"},
98 {"latnem", strip, 2, "", "+al"},
99 {"lanoi", strip, 2, "", "+al"},
100 {"tnem", strip, 4, "", "+ment"},
101 {"gni", CCe, 3, "-e+ing", "+ing"},
102 {"reta", nop, 0, "", ""},
103 {"retc", nop, 0, "", ""},
104 {"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
105 {"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
106 {"citsi", strip, 2, "", "+ic"},
107 {"citi", ize, 1, "-ic+e", ""},
108 {"cihparg", i_to_y, 1, "-y+ic", ""},
109 {"tse", strip, 2, "", "+st", i_to_y, 3, "-y+iest", "+est"},
110 {"cirtem", i_to_y, 1, "-y+ic", ""},
111 {"yrtem", subst, 0, "-er+ry", ""},
112 {"cigol", i_to_y, 1, "-y+ic", ""},
113 {"tsigol", i_to_y, 2, "-y+ist", ""},
114 {"tsi", CCe, 3, "-e+ist", "+ist"},
115 {"msi", CCe, 3, "-e+ism", "+ist"},
116 {"noitacifi", i_to_y, 6, "-y+ication", ""},
117 {"noitazi", ize, 4, "-e+ation", ""},
118 {"rota", tion, 2, "-e+or", ""},
119 {"rotc", tion, 2, "", "+or"},
120 {"noit", tion, 3, "-e+ion", "+ion"},
121 {"naino", an, 3, "", "+ian"},
122 {"na", an, 1, "", "+n"},
123 {"evi", subst, 0, "-ion+ive", ""},
124 {"ezi", CCe, 3, "-e+ize", "+ize"},
125 {"pihs", strip, 4, "", "+ship"},
126 {"dooh", ily, 4, "-y+ihood", "+hood"},
127 {"luf", ily, 3, "-y+iful", "+ful"},
128 {"ekil", strip, 4, "", "+like"},
129 0
130 };
131
/*
 * Prefixes that lookuppref() is willing to strip, terminated by a null
 * pointer.  The table is searched front to back and the first match wins,
 * so a prefix must be listed before any shorter prefix it begins with.
 */
static char *preftab[] = {
	"anti", "auto",
	"bio",
	"counter",
	"dis",
	"electro", "en",
	"fore",
	"geo",
	"hyper",
	"intra", "inter", "iso",
	"kilo",
	"magneto", "meta", "micro", "mid", "milli", "mis", "mono", "multi",
	"non",
	"out", "over",
	"photo", "poly", "pre", "pseudo", "psycho",
	"re",
	"semi", "stereo", "sub", "super",
	"tele", "thermo",
	"ultra",
	"under",	/* must precede un */
	"un",
	NULL
};
175
/* Incremented for each -v option; tryword() records derivations when set. */
static int vflag;
/* Incremented for each -x option; dict() echoes every stem it looks up. */
static int xflag;
/* argv[0], used to prefix diagnostics. */
static char *prog;
/* The input line being examined; the suffix handlers edit it in place. */
static char word[LINE_MAX];
/* Copy of the input line as read, made by main() before any case folding. */
static char original[LINE_MAX];
/* Derivation notes, indexed by level (see the comment below). */
static char *deriv[LINE_MAX];
/* Derivation notes of the accepted word, concatenated by tryword(). */
static char affix[LINE_MAX];
/*
 * file is the stream main() writes the current result to; found is the
 * stream opened from the "pass" argument when that argument is not 1.
 */
static FILE *file, *found;
/*
 * deriv is stack of pointers to notes like +micro +ed
 * affix is concatenated string of notes
 * word, original and affix are each LINE_MAX bytes long; deriv holds
 * LINE_MAX pointers.
 */
189
190 /*
191 * in an attempt to defray future maintenance misunderstandings, here is
192 * an attempt to describe the input/output expectations of the spell
193 * program.
194 *
195 * spellprog is intended to be called from the shell file spell.
196 * because of this, there is little error checking (this is historical, not
197 * necessarily advisable).
198 *
199 * spellprog options hashed-list pass
200 *
201 * the hashed-list is a list of the form made by spellin.
202 * there are 2 types of hashed lists:
203 * 1. a stop list: this specifies words that by the rules embodied
204 * in spellprog would be recognized as correct, BUT are really
205 * errors.
206 * 2. a dictionary of correctly spelled words.
207 * the pass number determines how the words found in the specified
208 * hashed-list are treated. If the pass number is 1, the hashed-list is
209 * treated as the stop-list, otherwise, it is treated as the regular
210 * dictionary list. in this case, the value of "pass" is a filename. Found
211 * words are written to this file.
212 *
213 * In the normal case, the filename = /dev/null. However, if the v option
214 * is specified, the derivations are written to this file.
215 * The spellprog looks up words in the hashed-list; if a word is found, it
216 * is printed to the stdout. If the hashed-list was the stop-list, the
217 * words found are presumed to be misspellings. in this case,
218 * a control character is printed ( a "-" is appended to the word.
219 * a hyphen will never occur naturally in the input list because deroff
220 * is used in the shell file before calling spellprog.)
 * If the regular spelling list was used (hlista or hlistb), the words
222 * are correct, and may be ditched. (unless the -v option was used -
223 * see the manual page).
224 *
225 * spellprog should be called twice : first with the stop-list, to flag all
226 * a priori incorrectly spelled words; second with the dictionary.
227 *
228 * spellprog hstop 1 |\
229 * spellprog hlista /dev/null
230 *
231 * for a complete scenario, see the shell file: spell.
232 *
233 */
234
235 int
main(int argc,char ** argv)236 main(int argc, char **argv)
237 {
238 char *ep, *cp;
239 char *dp;
240 int fold;
241 int c, j;
242 int pass;
243
244 /* Set locale environment variables local definitions */
245 (void) setlocale(LC_ALL, "");
246 #if !defined(TEXT_DOMAIN) /* Should be defined by cc -D */
247 #define TEXT_DOMAIN "SYS_TEST" /* Use this only if it wasn't */
248 #endif
249 (void) textdomain(TEXT_DOMAIN);
250
251
252 prog = argv[0];
253 while ((c = getopt(argc, argv, "bvx")) != EOF) {
254 switch (c) {
255 case 'b':
256 ise();
257 break;
258 case 'v':
259 vflag++;
260 break;
261 case 'x':
262 xflag++;
263 break;
264 }
265 }
266
267 argc -= optind;
268 argv = &argv[optind];
269
270 if ((argc < 2) || !prime(*argv)) {
271 (void) fprintf(stderr,
272 gettext("%s: cannot initialize hash table\n"), prog);
273 exit(1);
274 }
275 argc--;
276 argv++;
277
278 /*
279 * if pass is not 1, it is assumed to be a filename.
280 * found words are written to this file.
281 */
282 pass = **argv;
283 if (pass != '1')
284 found = fopen(*argv, "w");
285
286 for (;;) {
287 affix[0] = 0;
288 file = stdout;
289 for (ep = word; (*ep = j = getchar()) != '\n'; ep++)
290 if (j == EOF)
291 exit(0);
292 /*
293 * here is the hyphen processing. these words were found in the stop
294 * list. however, if they exist as is, (no derivations tried) in the
295 * dictionary, let them through as correct.
296 *
297 */
298 if (ep[-1] == '-') {
299 *--ep = 0;
300 if (!tryword(word, ep, 0))
301 (void) fprintf(file, "%s\n", word);
302 continue;
303 }
304 for (cp = word, dp = original; cp < ep; )
305 *dp++ = *cp++;
306 *dp = 0;
307 fold = 0;
308 for (cp = word; cp < ep; cp++)
309 if (islower(*cp))
310 goto lcase;
311 if (((ep - word) == 1) &&
312 ((word[0] == 'A') || (word[0] == 'I')))
313 continue;
314 if (trypref(ep, ".", 0))
315 goto foundit;
316 ++fold;
317 for (cp = original+1, dp = word+1; dp < ep; dp++, cp++)
318 *dp = Tolower(*cp);
319 lcase:
320 if (((ep - word) == 1) && (word[0] == 'a'))
321 continue;
322 if (trypref(ep, ".", 0)||trysuff(ep, 0))
323 goto foundit;
324 if (isupper(word[0])) {
325 for (cp = original, dp = word; *dp = *cp++; dp++)
326 if (fold) *dp = Tolower(*dp);
327 word[0] = Tolower(word[0]);
328 goto lcase;
329 }
330 (void) fprintf(file, "%s\n", original);
331 continue;
332
333 foundit:
334 if (pass == '1')
335 (void) fprintf(file, "%s-\n", original);
336 else if (affix[0] != 0 && affix[0] != '.') {
337 file = found;
338 (void) fprintf(file, "%s\t%s\n", affix,
339 original);
340 }
341 }
342 }
343
344 /*
345 * strip exactly one suffix and do
346 * indicated routine(s), which may recursively
347 * strip suffixes
348 */
349
350 static int
trysuff(char * ep,int lev)351 trysuff(char *ep, int lev)
352 {
353 struct suftab *t;
354 char *cp, *sp;
355
356 lev += DLEV;
357 deriv[lev] = deriv[lev-1] = 0;
358 for (t = &suftab[0]; (sp = t->suf) != 0; t++) {
359 cp = ep;
360 while (*sp)
361 if (*--cp != *sp++)
362 goto next;
363 for (sp = cp; --sp >= word && !vowel(*sp); );
364 if (sp < word)
365 return (0);
366 if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1))
367 return (1);
368 if (t->p2 != 0) {
369 deriv[lev] = deriv[lev+1] = 0;
370 return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev));
371 }
372 return (0);
373 next:;
374 }
375 return (0);
376 }
377
/*
 * Suffix handler that always fails.  suftab[] uses it for endings that
 * must not be treated as a derivation; because trysuff() stops at the
 * first matching entry, such an entry also hides the shorter suffixes
 * listed after it.
 *
 * trysuff() invokes this through an unprototyped pointer with four
 * arguments, none of which are looked at.
 */
static int
nop(void)
{
	return (0);
}
383
/*
 * Basic suffix handler: accept the stem ending at ep if it is a word
 * (possibly after prefix stripping), recording a as the derivation note;
 * failing that, try to strip a further suffix from it.  d is not used.
 * Returns 1 on success and 0 on failure.
 */
/* ARGSUSED */
static int
strip(char *ep, char *d, char *a, int lev)
{
	if (trypref(ep, a, lev))
		return (1);
	return (trysuff(ep, lev) != 0);
}
390
391 static int
s(char * ep,char * d,char * a,int lev)392 s(char *ep, char *d, char *a, int lev)
393 {
394 if (lev > DLEV+1)
395 return (0);
396 if (*ep == 's' && ep[-1] == 's')
397 return (0);
398 return (strip(ep, d, a, lev));
399 }
400
401 /* ARGSUSED */
402 static int
an(char * ep,char * d,char * a,int lev)403 an(char *ep, char *d, char *a, int lev)
404 {
405 if (!isupper(*word)) /* must be proper name */
406 return (0);
407 return (trypref(ep, a, lev));
408 }
409
/*
 * Overwrite the character just before ep with 'e', then continue as
 * strip() with d as the derivation note.  a is not used.
 */
/* ARGSUSED */
static int
ize(char *ep, char *d, char *a, int lev)
{
	char *last = ep - 1;

	*last = 'e';
	return (strip(ep, "", d, lev));
}
417
/*
 * Store an 'e' at ep, so that the stem grows by one character, then
 * continue as strip() on the lengthened stem with d as the derivation
 * note.  a is not used.
 */
/* ARGSUSED */
static int
y_to_e(char *ep, char *d, char *a, int lev)
{
	ep[0] = 'e';
	return (strip(ep + 1, "", d, lev));
}
425
/*
 * Choose between the "i back to y" rule and plain stripping, depending on
 * whether the stem ends in 'i'.
 */
static int
ily(char *ep, char *d, char *a, int lev)
{
	return ((ep[-1] == 'i') ?
	    i_to_y(ep, d, a, lev) : strip(ep, d, a, lev));
}
434
/*
 * Store an 'l' at ep and let y_to_e() put an 'e' after it, so the stem
 * ends in "le" before it is looked up.
 */
static int
bility(char *ep, char *d, char *a, int lev)
{
	ep[0] = 'l';
	return (y_to_e(ep + 1, d, a, lev));
}
441
/*
 * If the stem ends in 'i', change that letter to 'y' and use d as the
 * derivation note; otherwise leave the stem alone and use a.  Either way
 * the result is handed to strip().
 */
static int
i_to_y(char *ep, char *d, char *a, int lev)
{
	char *note = a;

	if (ep[-1] == 'i') {
		ep[-1] = 'y';
		note = d;
	}
	return (strip(ep, "", note, lev));
}
451
452 static int
es(char * ep,char * d,char * a,int lev)453 es(char *ep, char *d, char *a, int lev)
454 {
455 if (lev > DLEV)
456 return (0);
457 switch (ep[-1]) {
458 default:
459 return (0);
460 case 'i':
461 return (i_to_y(ep, d, a, lev));
462 case 's':
463 case 'h':
464 case 'z':
465 case 'x':
466 return (strip(ep, d, a, lev));
467 }
468 }
469
470 /* ARGSUSED */
471 static int
subst(char * ep,char * d,char * a,int lev)472 subst(char *ep, char *d, char *a, int lev)
473 {
474 char *u, *t;
475
476 if (skipv(skipv(ep-1)) < word)
477 return (0);
478 for (t = d; *t != '+'; t++)
479 continue;
480 for (u = ep; *--t != '-'; )
481 *--u = *t;
482 return (strip(ep, "", d, lev));
483 }
484
485
/*
 * Handler for the "tion"/"tor" suffixes, decided by the character two
 * places before ep: after 'c' or 'r' the stem is looked up as it stands
 * with note a, after 'a' an 'e' is appended through y_to_e(), and any
 * other letter fails.
 */
static int
tion(char *ep, char *d, char *a, int lev)
{
	char c = ep[-2];

	if (c == 'c' || c == 'r')
		return (trypref(ep, a, lev));
	if (c == 'a')
		return (y_to_e(ep, d, a, lev));
	return (0);
}
498
/*
 * possible consonant-consonant-e ending
 *
 * Decide from the last two letters of the stem whether a dropped final
 * 'e' should be put back through y_to_e(), whether the derivation is to
 * be rejected, or whether the decision is left to VCe().
 *
 * Note that y_to_e() leaves the 'e' it stored at ep in place even when it
 * fails.
 */
static int
CCe(char *ep, char *d, char *a, int lev)
{
	char prev = ep[-2];

	switch (ep[-1]) {
	case 'r':
		if (prev == 't')
			return (y_to_e(ep, d, a, lev));
		break;
	case 'l':
		if (vowel(prev) || prev == 'l' || prev == 'r' || prev == 'w')
			break;
		return (y_to_e(ep, d, a, lev));
	case 's':
		if (prev == 's')
			break;
		/* FALLTHROUGH */
	case 'c':
	case 'g':
		if (*ep == 'a')
			return (0);
		/* FALLTHROUGH */
	case 'v':
	case 'z':
		if (vowel(prev))
			break;
		/* FALLTHROUGH */
	case 'u':
		if (y_to_e(ep, d, a, lev))
			return (1);
		/* Only a stem ending in "ng" goes on to VCe(). */
		if (ep[-2] != 'n' || ep[-1] != 'g')
			return (0);
		break;
	}
	return (VCe(ep, d, a, lev));
}
561
/*
 * possible consonant-vowel-consonant-e ending
 *
 * A stem ending in 'e' fails.  When the stem ends in a vowel followed by
 * a non-vowel, it is first tried with an 'e' appended (note d); if that
 * fails the overwritten character is restored.  Finally the stem is tried
 * unchanged through strip().
 */
static int
VCe(char *ep, char *d, char *a, int lev)
{
	char last = ep[-1];

	if (last == 'e')
		return (0);

	if (!vowel(last) && vowel(ep[-2])) {
		char saved = ep[0];

		ep[0] = 'e';
		if (trypref(ep + 1, d, lev))
			return (1);
		if (trysuff(ep + 1, lev))
			return (1);
		ep[0] = saved;
	}
	return (strip(ep, d, a, lev));
}
580
581 static char *
lookuppref(char ** wp,char * ep)582 lookuppref(char **wp, char *ep)
583 {
584 char **sp;
585 char *bp, *cp;
586
587 for (sp = preftab; *sp; sp++) {
588 bp = *wp;
589 for (cp = *sp; *cp; cp++, bp++)
590 if (Tolower(*bp) != *cp)
591 goto next;
592 for (cp = bp; cp < ep; cp++)
593 if (vowel(*cp)) {
594 *wp = bp;
595 return (*sp);
596 }
597 next:;
598 }
599 return (0);
600 }
601
602 /*
603 * while word is not in dictionary try stripping
604 * prefixes. Fail if no more prefixes.
605 */
606 static int
trypref(char * ep,char * a,int lev)607 trypref(char *ep, char *a, int lev)
608 {
609 char *cp;
610 char *bp;
611 char *pp;
612 int val = 0;
613 char space[LINE_MAX * 2];
614 deriv[lev] = a;
615 if (tryword(word, ep, lev))
616 return (1);
617 bp = word;
618 pp = space;
619 deriv[lev+1] = pp;
620 while (cp = lookuppref(&bp, ep)) {
621 *pp++ = '+';
622 while (*pp = *cp++)
623 pp++;
624 if (tryword(bp, ep, lev+1)) {
625 val = 1;
626 break;
627 }
628 }
629 deriv[lev+1] = deriv[lev+2] = 0;
630 return (val);
631 }
632
633 static int
tryword(char * bp,char * ep,int lev)634 tryword(char *bp, char *ep, int lev)
635 {
636 int i, j;
637 char duple[3];
638 if (ep-bp <= 1)
639 return (0);
640 if (vowel(*ep)) {
641 if (monosyl(bp, ep))
642 return (0);
643 }
644 i = dict(bp, ep);
645 if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) {
646 ep--;
647 deriv[++lev] = duple;
648 duple[0] = '+';
649 duple[1] = *ep;
650 duple[2] = 0;
651 i = dict(bp, ep);
652 }
653 if (vflag == 0 || i == 0)
654 return (i);
655 /*
656 * when derivations are wanted, collect them
657 * for printing
658 */
659 j = lev;
660 do {
661 if (deriv[j])
662 (void) strcat(affix, deriv[j]);
663 } while (--j > 0);
664 return (i);
665 }
666
667
/*
 * Return 1 when the text from bp up to (not including) ep is at least two
 * characters long, ends in a non-vowel other than 'x' or 'w', has a vowel
 * immediately before that, and has no vowel anywhere earlier.  Return 0
 * otherwise.
 */
static int
monosyl(char *bp, char *ep)
{
	char *last, *prev, *scan;

	if (ep < bp + 2)
		return (0);

	last = ep - 1;
	prev = ep - 2;
	if (vowel(*last))
		return (0);
	if (!vowel(*prev))
		return (0);
	if (*last == 'x' || *last == 'w')
		return (0);

	/* Everything in front of the single vowel must be non-vowels. */
	for (scan = prev; scan > bp; ) {
		if (vowel(*--scan))
			return (0);
	}
	return (1);
}
680
681 static char *
skipv(char * s)682 skipv(char *s)
683 {
684 if (s >= word&&vowel(*s))
685 s--;
686 while (s >= word && !vowel(*s))
687 s--;
688 return (s);
689 }
690
/*
 * Return 1 if c is one of the letters a, e, i, o, u or y in either case,
 * and 0 for anything else.
 *
 * Callers pass plain char values, which may be negative, so c is reduced
 * to an unsigned char before it reaches tolower(); handing a negative
 * value other than EOF to the <ctype.h> routines is undefined.
 */
static int
vowel(int c)
{
	switch (tolower((unsigned char)c)) {
	case 'a':
	case 'e':
	case 'i':
	case 'o':
	case 'u':
	case 'y':
		return (1);
	default:
		return (0);
	}
}
705
706 /* crummy way to Britishise */
707 static void
ise(void)708 ise(void)
709 {
710 struct suftab *p;
711
712 for (p = suftab; p->suf; p++) {
713 ztos(p->suf);
714 ztos(p->d1);
715 ztos(p->a1);
716 }
717 }
718
/*
 * Replace every 'z' in the string s with 's', in place.
 */
static void
ztos(char *s)
{
	while ((s = strchr(s, 'z')) != NULL)
		*s = 's';
}
726
727 static int
dict(char * bp,char * ep)728 dict(char *bp, char *ep)
729 {
730 int temp, result;
731 if (xflag)
732 (void) fprintf(stdout, "=%.*s\n", ep-bp, bp);
733 temp = *ep;
734 *ep = 0;
735 result = hashlook(bp);
736 *ep = temp;
737 return (result);
738 }
739