13e12c5d1SDavid du Colombier #include <u.h>
23e12c5d1SDavid du Colombier #include <libc.h>
33e12c5d1SDavid du Colombier #include <bio.h>
43e12c5d1SDavid du Colombier #include <ctype.h>
53e12c5d1SDavid du Colombier #include "code.h"
63e12c5d1SDavid du Colombier
/*
 * read an annotated spelling list in the form
 *	word <tab> affixcode [ , affixcode ] ...
 * and print a re-encoded version on standard output.
 * (this was originally described as `octal <tab> word'; the code in
 * this file writes the binary layout documented above pdict() instead)
 */
123e12c5d1SDavid du Colombier
/*
 * One dictionary entry as collected by readinput().
 */
typedef struct Dict Dict;
struct Dict
{
	char	*word;		/* NUL-terminated text of the word */
	int	encode;		/* value returned by typecode() for this word's affix codes */
};
193e12c5d1SDavid du Colombier
203e12c5d1SDavid du Colombier Dict words[200000];
213e12c5d1SDavid du Colombier char space[500000];
223e12c5d1SDavid du Colombier long encodes[4094];
233e12c5d1SDavid du Colombier long nspace;
243e12c5d1SDavid du Colombier long nwords;
253e12c5d1SDavid du Colombier int ncodes;
263e12c5d1SDavid du Colombier Biobuf bout;
273e12c5d1SDavid du Colombier
283e12c5d1SDavid du Colombier void readinput(int f);
293e12c5d1SDavid du Colombier long typecode(char *str);
303e12c5d1SDavid du Colombier int wcmp(void*, void*);
313e12c5d1SDavid du Colombier void pdict(void);
323e12c5d1SDavid du Colombier void sput(int);
333e12c5d1SDavid du Colombier
343e12c5d1SDavid du Colombier void
main(int argc,char * argv[])353e12c5d1SDavid du Colombier main(int argc, char *argv[])
363e12c5d1SDavid du Colombier {
373e12c5d1SDavid du Colombier int f;
383e12c5d1SDavid du Colombier
393e12c5d1SDavid du Colombier Binit(&bout, 1, OWRITE);
403e12c5d1SDavid du Colombier nwords = 0;
413e12c5d1SDavid du Colombier nspace = 0;
423e12c5d1SDavid du Colombier ncodes = 0;
433e12c5d1SDavid du Colombier if(argc <= 1)
443e12c5d1SDavid du Colombier readinput(0);
453e12c5d1SDavid du Colombier while(argc > 1) {
463e12c5d1SDavid du Colombier f = open(argv[1], 0);
473e12c5d1SDavid du Colombier if(f < 0) {
483e12c5d1SDavid du Colombier fprint(2, "Cannot open %s\n", argv[1]);
493e12c5d1SDavid du Colombier exits("open");
503e12c5d1SDavid du Colombier }
513e12c5d1SDavid du Colombier readinput(f);
523e12c5d1SDavid du Colombier argc--;
533e12c5d1SDavid du Colombier argv++;
543e12c5d1SDavid du Colombier }
553e12c5d1SDavid du Colombier fprint(2, "words = %ld; space = %ld; codes = %d\n",
563e12c5d1SDavid du Colombier nwords, nspace, ncodes);
573e12c5d1SDavid du Colombier qsort(words, nwords, sizeof(words[0]), wcmp);
583e12c5d1SDavid du Colombier pdict();
593e12c5d1SDavid du Colombier exits(0);
603e12c5d1SDavid du Colombier }
613e12c5d1SDavid du Colombier
wcmp(void * a,void * b)623e12c5d1SDavid du Colombier wcmp(void *a, void *b)
633e12c5d1SDavid du Colombier {
643e12c5d1SDavid du Colombier
653e12c5d1SDavid du Colombier return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
663e12c5d1SDavid du Colombier }
673e12c5d1SDavid du Colombier
683e12c5d1SDavid du Colombier void
readinput(int f)693e12c5d1SDavid du Colombier readinput(int f)
703e12c5d1SDavid du Colombier {
713e12c5d1SDavid du Colombier long i;
723e12c5d1SDavid du Colombier char *code, *line, *bword;
733e12c5d1SDavid du Colombier Biobuf buf;
743e12c5d1SDavid du Colombier long lineno = 0;
753e12c5d1SDavid du Colombier
763e12c5d1SDavid du Colombier Binit(&buf, f, OREAD);
773e12c5d1SDavid du Colombier while(line = Brdline(&buf, '\n')) {
783e12c5d1SDavid du Colombier line[Blinelen(&buf)-1] = 0;
793e12c5d1SDavid du Colombier lineno++;
803e12c5d1SDavid du Colombier code = line;
813e12c5d1SDavid du Colombier while(isspace(*code))
823e12c5d1SDavid du Colombier code++;
833e12c5d1SDavid du Colombier bword = code;
843e12c5d1SDavid du Colombier while(*code && !isspace(*code))
853e12c5d1SDavid du Colombier code++;
863e12c5d1SDavid du Colombier
873e12c5d1SDavid du Colombier i = code-bword;
883e12c5d1SDavid du Colombier memmove(space+nspace, bword, i);
893e12c5d1SDavid du Colombier words[nwords].word = space+nspace;
903e12c5d1SDavid du Colombier nspace += i;
913e12c5d1SDavid du Colombier space[nspace] = 0;
923e12c5d1SDavid du Colombier nspace++;
933e12c5d1SDavid du Colombier
943e12c5d1SDavid du Colombier if(*code) {
953e12c5d1SDavid du Colombier *code++ = 0;
963e12c5d1SDavid du Colombier while(isspace(*code))
973e12c5d1SDavid du Colombier code++;
983e12c5d1SDavid du Colombier }
993e12c5d1SDavid du Colombier words[nwords].encode = typecode(code);
1003e12c5d1SDavid du Colombier nwords++;
1013e12c5d1SDavid du Colombier if(nwords >= sizeof(words)/sizeof(words[0])) {
1023e12c5d1SDavid du Colombier fprint(2, "words array too small\n");
1033e12c5d1SDavid du Colombier exits("words");
1043e12c5d1SDavid du Colombier }
1053e12c5d1SDavid du Colombier if(nspace >= sizeof(space)/sizeof(space[0])) {
1063e12c5d1SDavid du Colombier fprint(2, "space array too small\n");
1073e12c5d1SDavid du Colombier exits("space");
1083e12c5d1SDavid du Colombier }
1093e12c5d1SDavid du Colombier }
110*219b2ee8SDavid du Colombier Bterm(&buf);
1113e12c5d1SDavid du Colombier }
1123e12c5d1SDavid du Colombier
1133e12c5d1SDavid du Colombier
/*
 * One affix code name and the bits typecode() ORs in when it is seen.
 */
typedef struct Class Class;
struct Class
{
	char	*codename;	/* name as written in the word list; nil ends a table */
	long	bits;		/* bit mask for this name */
};
1203e12c5d1SDavid du Colombier Class codea[] =
1213e12c5d1SDavid du Colombier {
1223e12c5d1SDavid du Colombier { "a", ADJ },
1233e12c5d1SDavid du Colombier { "adv", ADV },
1243e12c5d1SDavid du Colombier 0
1253e12c5d1SDavid du Colombier };
1263e12c5d1SDavid du Colombier Class codec[] =
1273e12c5d1SDavid du Colombier {
1283e12c5d1SDavid du Colombier { "comp", COMP },
1293e12c5d1SDavid du Colombier 0
1303e12c5d1SDavid du Colombier };
1313e12c5d1SDavid du Colombier Class coded[] =
1323e12c5d1SDavid du Colombier {
1333e12c5d1SDavid du Colombier { "d", DONT_TOUCH},
1343e12c5d1SDavid du Colombier 0
1353e12c5d1SDavid du Colombier };
1363e12c5d1SDavid du Colombier
1373e12c5d1SDavid du Colombier Class codee[] =
1383e12c5d1SDavid du Colombier {
1393e12c5d1SDavid du Colombier { "ed", ED },
1403e12c5d1SDavid du Colombier { "er", ACTOR },
1413e12c5d1SDavid du Colombier 0
1423e12c5d1SDavid du Colombier };
1433e12c5d1SDavid du Colombier
1443e12c5d1SDavid du Colombier Class codei[] =
1453e12c5d1SDavid du Colombier {
1463e12c5d1SDavid du Colombier { "in", IN },
1473e12c5d1SDavid du Colombier { "ion", ION },
1483e12c5d1SDavid du Colombier 0
1493e12c5d1SDavid du Colombier };
1503e12c5d1SDavid du Colombier
1513e12c5d1SDavid du Colombier Class codem[] =
1523e12c5d1SDavid du Colombier {
1533e12c5d1SDavid du Colombier { "man", MAN },
1543e12c5d1SDavid du Colombier { "ms", MONO },
1553e12c5d1SDavid du Colombier 0
1563e12c5d1SDavid du Colombier };
1573e12c5d1SDavid du Colombier
1583e12c5d1SDavid du Colombier Class coden[] =
1593e12c5d1SDavid du Colombier {
1603e12c5d1SDavid du Colombier { "n", NOUN },
1613e12c5d1SDavid du Colombier { "na", N_AFFIX },
1623e12c5d1SDavid du Colombier { "nopref", NOPREF },
1633e12c5d1SDavid du Colombier 0
1643e12c5d1SDavid du Colombier };
1653e12c5d1SDavid du Colombier
1663e12c5d1SDavid du Colombier Class codep[] =
1673e12c5d1SDavid du Colombier {
1683e12c5d1SDavid du Colombier { "pc", PROP_COLLECT },
1693e12c5d1SDavid du Colombier 0
1703e12c5d1SDavid du Colombier };
1713e12c5d1SDavid du Colombier Class codes[] =
1723e12c5d1SDavid du Colombier {
1733e12c5d1SDavid du Colombier { "s", STOP },
1743e12c5d1SDavid du Colombier 0
1753e12c5d1SDavid du Colombier };
1763e12c5d1SDavid du Colombier
1773e12c5d1SDavid du Colombier Class codev[] =
1783e12c5d1SDavid du Colombier {
1793e12c5d1SDavid du Colombier { "v", VERB },
1803e12c5d1SDavid du Colombier { "va", V_AFFIX },
1813e12c5d1SDavid du Colombier { "vi", V_IRREG },
1823e12c5d1SDavid du Colombier 0
1833e12c5d1SDavid du Colombier };
1843e12c5d1SDavid du Colombier
1853e12c5d1SDavid du Colombier Class codey[] =
1863e12c5d1SDavid du Colombier {
1873e12c5d1SDavid du Colombier { "y", _Y },
1883e12c5d1SDavid du Colombier 0
1893e12c5d1SDavid du Colombier };
1903e12c5d1SDavid du Colombier
1913e12c5d1SDavid du Colombier Class codez[] =
1923e12c5d1SDavid du Colombier {
1933e12c5d1SDavid du Colombier 0
1943e12c5d1SDavid du Colombier };
1953e12c5d1SDavid du Colombier Class* codetab[] =
1963e12c5d1SDavid du Colombier {
1973e12c5d1SDavid du Colombier codea,
1983e12c5d1SDavid du Colombier codez,
1993e12c5d1SDavid du Colombier codec,
2003e12c5d1SDavid du Colombier coded,
2013e12c5d1SDavid du Colombier codee,
2023e12c5d1SDavid du Colombier codez,
2033e12c5d1SDavid du Colombier codez,
2043e12c5d1SDavid du Colombier codez,
2053e12c5d1SDavid du Colombier codei,
2063e12c5d1SDavid du Colombier codez,
2073e12c5d1SDavid du Colombier codez,
2083e12c5d1SDavid du Colombier codez,
2093e12c5d1SDavid du Colombier codem,
2103e12c5d1SDavid du Colombier coden,
2113e12c5d1SDavid du Colombier codez,
2123e12c5d1SDavid du Colombier codep,
2133e12c5d1SDavid du Colombier codez,
2143e12c5d1SDavid du Colombier codez,
2153e12c5d1SDavid du Colombier codes,
2163e12c5d1SDavid du Colombier codez,
2173e12c5d1SDavid du Colombier codez,
2183e12c5d1SDavid du Colombier codev,
2193e12c5d1SDavid du Colombier codez,
2203e12c5d1SDavid du Colombier codez,
2213e12c5d1SDavid du Colombier codey,
2223e12c5d1SDavid du Colombier codez,
2233e12c5d1SDavid du Colombier };
2243e12c5d1SDavid du Colombier
2253e12c5d1SDavid du Colombier long
typecode(char * str)2263e12c5d1SDavid du Colombier typecode(char *str)
2273e12c5d1SDavid du Colombier {
2283e12c5d1SDavid du Colombier Class *p;
2293e12c5d1SDavid du Colombier long code;
2303e12c5d1SDavid du Colombier int n, i;
2313e12c5d1SDavid du Colombier char *s, *sp, *st;
2323e12c5d1SDavid du Colombier
2333e12c5d1SDavid du Colombier code = 0;
2343e12c5d1SDavid du Colombier
2353e12c5d1SDavid du Colombier loop:
2363e12c5d1SDavid du Colombier for(s=str; *s != 0 && *s != ','; s++)
2373e12c5d1SDavid du Colombier ;
2383e12c5d1SDavid du Colombier for(p = codetab[*str-'a']; sp = p->codename; p++) {
2393e12c5d1SDavid du Colombier st = str;
2403e12c5d1SDavid du Colombier for(n=s-str;; st++,sp++) {
2413e12c5d1SDavid du Colombier if(*st != *sp)
2423e12c5d1SDavid du Colombier goto cont;
2433e12c5d1SDavid du Colombier n--;
2443e12c5d1SDavid du Colombier if(n == 0)
2453e12c5d1SDavid du Colombier break;
2463e12c5d1SDavid du Colombier }
2473e12c5d1SDavid du Colombier code |= p->bits;
2483e12c5d1SDavid du Colombier if(*s == 0)
2493e12c5d1SDavid du Colombier goto out;
2503e12c5d1SDavid du Colombier str = s+1;
2513e12c5d1SDavid du Colombier goto loop;
2523e12c5d1SDavid du Colombier cont:;
2533e12c5d1SDavid du Colombier }
2543e12c5d1SDavid du Colombier fprint(2, "Unknown affix code \"%s\"\n", str);
2553e12c5d1SDavid du Colombier return 0;
2563e12c5d1SDavid du Colombier out:
2573e12c5d1SDavid du Colombier for(i=0; i<ncodes; i++)
2583e12c5d1SDavid du Colombier if(encodes[i] == code)
2593e12c5d1SDavid du Colombier return i;
2603e12c5d1SDavid du Colombier encodes[i] = code;
2613e12c5d1SDavid du Colombier ncodes++;
2623e12c5d1SDavid du Colombier return i;
2633e12c5d1SDavid du Colombier }
2643e12c5d1SDavid du Colombier
2653e12c5d1SDavid du Colombier void
sput(int s)2663e12c5d1SDavid du Colombier sput(int s)
2673e12c5d1SDavid du Colombier {
2683e12c5d1SDavid du Colombier
2693e12c5d1SDavid du Colombier Bputc(&bout, s>>8);
2703e12c5d1SDavid du Colombier Bputc(&bout, s);
2713e12c5d1SDavid du Colombier }
2723e12c5d1SDavid du Colombier
2733e12c5d1SDavid du Colombier void
lput(long l)2743e12c5d1SDavid du Colombier lput(long l)
2753e12c5d1SDavid du Colombier {
2763e12c5d1SDavid du Colombier Bputc(&bout, l>>24);
2773e12c5d1SDavid du Colombier Bputc(&bout, l>>16);
2783e12c5d1SDavid du Colombier Bputc(&bout, l>>8);
2793e12c5d1SDavid du Colombier Bputc(&bout, l);
2803e12c5d1SDavid du Colombier }
2813e12c5d1SDavid du Colombier
2823e12c5d1SDavid du Colombier /*
2833e12c5d1SDavid du Colombier * spit out the encoded dictionary
2843e12c5d1SDavid du Colombier * all numbers are encoded big-endian.
2853e12c5d1SDavid du Colombier * struct
2863e12c5d1SDavid du Colombier * {
2873e12c5d1SDavid du Colombier * short ncodes;
2883e12c5d1SDavid du Colombier * long encodes[ncodes];
2893e12c5d1SDavid du Colombier * struct
2903e12c5d1SDavid du Colombier * {
2913e12c5d1SDavid du Colombier * short encode;
2923e12c5d1SDavid du Colombier * char word[*];
2933e12c5d1SDavid du Colombier * } words[*];
2943e12c5d1SDavid du Colombier * };
2953e12c5d1SDavid du Colombier * 0x8000 flag for code word
2963e12c5d1SDavid du Colombier * 0x7800 count of number of common bytes with previous word
2973e12c5d1SDavid du Colombier * 0x07ff index into codes array for affixes
2983e12c5d1SDavid du Colombier */
2993e12c5d1SDavid du Colombier void
pdict(void)3003e12c5d1SDavid du Colombier pdict(void)
3013e12c5d1SDavid du Colombier {
3023e12c5d1SDavid du Colombier long i, count;
3033e12c5d1SDavid du Colombier int encode, j, c;
3043e12c5d1SDavid du Colombier char *lastword, *thisword, *word;
3053e12c5d1SDavid du Colombier
3063e12c5d1SDavid du Colombier sput(ncodes);
3073e12c5d1SDavid du Colombier for(i=0; i<ncodes; i++)
3083e12c5d1SDavid du Colombier lput(encodes[i]);
3093e12c5d1SDavid du Colombier
3103e12c5d1SDavid du Colombier count = ncodes*4 + 2;
3113e12c5d1SDavid du Colombier lastword = "";
3123e12c5d1SDavid du Colombier for(i=0; i<nwords; i++) {
3133e12c5d1SDavid du Colombier word = words[i].word;
3143e12c5d1SDavid du Colombier thisword = word;
3153e12c5d1SDavid du Colombier for(j=0; *thisword == *lastword; j++) {
3163e12c5d1SDavid du Colombier if(*thisword == 0) {
3173e12c5d1SDavid du Colombier fprint(2, "identical words: %s\n", word);
3183e12c5d1SDavid du Colombier break;
3193e12c5d1SDavid du Colombier }
3203e12c5d1SDavid du Colombier thisword++;
3213e12c5d1SDavid du Colombier lastword++;
3223e12c5d1SDavid du Colombier }
3233e12c5d1SDavid du Colombier if(j > 15)
3243e12c5d1SDavid du Colombier j = 15;
3253e12c5d1SDavid du Colombier encode = words[i].encode;
3263e12c5d1SDavid du Colombier c = (1<<15) | (j<<11) | encode;
3273e12c5d1SDavid du Colombier sput(c);
3283e12c5d1SDavid du Colombier count += 2;
3293e12c5d1SDavid du Colombier for(thisword=word+j; c = *thisword; thisword++) {
3303e12c5d1SDavid du Colombier Bputc(&bout, c);
3313e12c5d1SDavid du Colombier count++;
3323e12c5d1SDavid du Colombier }
3333e12c5d1SDavid du Colombier lastword = word;
3343e12c5d1SDavid du Colombier }
3353e12c5d1SDavid du Colombier fprint(2, "output bytes = %ld\n", count);
3363e12c5d1SDavid du Colombier }
337