1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include "code.h"
6
/*
 * read an annotated spelling list in the form
 *	word <tab> affixcode [ , affixcode ] ...
 * and write a re-encoded binary version to standard output;
 * the output layout is described in the comment above pdict.
 */
12
/*
 * one dictionary entry: the text of the word and the value
 * typecode() returned for its affix codes
 */
typedef struct Dict {
	char	*word;
	int	encode;
} Dict;
19
20 Dict words[200000];
21 char space[500000];
22 long encodes[4094];
23 long nspace;
24 long nwords;
25 int ncodes;
26 Biobuf bout;
27
/* forward declarations */
void	readinput(int f);
long	typecode(char *str);
int	wcmp(void *a, void *b);
void	pdict(void);
void	sput(int s);
33
34 void
main(int argc,char * argv[])35 main(int argc, char *argv[])
36 {
37 int f;
38
39 Binit(&bout, 1, OWRITE);
40 nwords = 0;
41 nspace = 0;
42 ncodes = 0;
43 if(argc <= 1)
44 readinput(0);
45 while(argc > 1) {
46 f = open(argv[1], 0);
47 if(f < 0) {
48 fprint(2, "Cannot open %s\n", argv[1]);
49 exits("open");
50 }
51 readinput(f);
52 argc--;
53 argv++;
54 }
55 fprint(2, "words = %ld; space = %ld; codes = %d\n",
56 nwords, nspace, ncodes);
57 qsort(words, nwords, sizeof(words[0]), wcmp);
58 pdict();
59 exits(0);
60 }
61
wcmp(void * a,void * b)62 wcmp(void *a, void *b)
63 {
64
65 return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
66 }
67
68 void
readinput(int f)69 readinput(int f)
70 {
71 long i;
72 char *code, *line, *bword;
73 Biobuf buf;
74 long lineno = 0;
75
76 Binit(&buf, f, OREAD);
77 while(line = Brdline(&buf, '\n')) {
78 line[Blinelen(&buf)-1] = 0;
79 lineno++;
80 code = line;
81 while(isspace(*code))
82 code++;
83 bword = code;
84 while(*code && !isspace(*code))
85 code++;
86
87 i = code-bword;
88 memmove(space+nspace, bword, i);
89 words[nwords].word = space+nspace;
90 nspace += i;
91 space[nspace] = 0;
92 nspace++;
93
94 if(*code) {
95 *code++ = 0;
96 while(isspace(*code))
97 code++;
98 }
99 words[nwords].encode = typecode(code);
100 nwords++;
101 if(nwords >= sizeof(words)/sizeof(words[0])) {
102 fprint(2, "words array too small\n");
103 exits("words");
104 }
105 if(nspace >= sizeof(space)/sizeof(space[0])) {
106 fprint(2, "space array too small\n");
107 exits("space");
108 }
109 }
110 Bterm(&buf);
111 }
112
113
/*
 * one affix code: the name as it appears in the input list
 * and the bits typecode() ors into the result when it matches
 */
typedef struct Class {
	char	*codename;
	long	bits;
} Class;
120 Class codea[] =
121 {
122 { "a", ADJ },
123 { "adv", ADV },
124 0
125 };
126 Class codec[] =
127 {
128 { "comp", COMP },
129 0
130 };
131 Class coded[] =
132 {
133 { "d", DONT_TOUCH},
134 0
135 };
136
137 Class codee[] =
138 {
139 { "ed", ED },
140 { "er", ACTOR },
141 0
142 };
143
144 Class codei[] =
145 {
146 { "in", IN },
147 { "ion", ION },
148 0
149 };
150
151 Class codem[] =
152 {
153 { "man", MAN },
154 { "ms", MONO },
155 0
156 };
157
158 Class coden[] =
159 {
160 { "n", NOUN },
161 { "na", N_AFFIX },
162 { "nopref", NOPREF },
163 0
164 };
165
166 Class codep[] =
167 {
168 { "pc", PROP_COLLECT },
169 0
170 };
171 Class codes[] =
172 {
173 { "s", STOP },
174 0
175 };
176
177 Class codev[] =
178 {
179 { "v", VERB },
180 { "va", V_AFFIX },
181 { "vi", V_IRREG },
182 0
183 };
184
185 Class codey[] =
186 {
187 { "y", _Y },
188 0
189 };
190
191 Class codez[] =
192 {
193 0
194 };
195 Class* codetab[] =
196 {
197 codea,
198 codez,
199 codec,
200 coded,
201 codee,
202 codez,
203 codez,
204 codez,
205 codei,
206 codez,
207 codez,
208 codez,
209 codem,
210 coden,
211 codez,
212 codep,
213 codez,
214 codez,
215 codes,
216 codez,
217 codez,
218 codev,
219 codez,
220 codez,
221 codey,
222 codez,
223 };
224
225 long
typecode(char * str)226 typecode(char *str)
227 {
228 Class *p;
229 long code;
230 int n, i;
231 char *s, *sp, *st;
232
233 code = 0;
234
235 loop:
236 for(s=str; *s != 0 && *s != ','; s++)
237 ;
238 for(p = codetab[*str-'a']; sp = p->codename; p++) {
239 st = str;
240 for(n=s-str;; st++,sp++) {
241 if(*st != *sp)
242 goto cont;
243 n--;
244 if(n == 0)
245 break;
246 }
247 code |= p->bits;
248 if(*s == 0)
249 goto out;
250 str = s+1;
251 goto loop;
252 cont:;
253 }
254 fprint(2, "Unknown affix code \"%s\"\n", str);
255 return 0;
256 out:
257 for(i=0; i<ncodes; i++)
258 if(encodes[i] == code)
259 return i;
260 encodes[i] = code;
261 ncodes++;
262 return i;
263 }
264
265 void
sput(int s)266 sput(int s)
267 {
268
269 Bputc(&bout, s>>8);
270 Bputc(&bout, s);
271 }
272
273 void
lput(long l)274 lput(long l)
275 {
276 Bputc(&bout, l>>24);
277 Bputc(&bout, l>>16);
278 Bputc(&bout, l>>8);
279 Bputc(&bout, l);
280 }
281
282 /*
283 * spit out the encoded dictionary
284 * all numbers are encoded big-endian.
285 * struct
286 * {
287 * short ncodes;
288 * long encodes[ncodes];
289 * struct
290 * {
291 * short encode;
292 * char word[*];
293 * } words[*];
294 * };
295 * 0x8000 flag for code word
296 * 0x7800 count of number of common bytes with previous word
297 * 0x07ff index into codes array for affixes
298 */
299 void
pdict(void)300 pdict(void)
301 {
302 long i, count;
303 int encode, j, c;
304 char *lastword, *thisword, *word;
305
306 sput(ncodes);
307 for(i=0; i<ncodes; i++)
308 lput(encodes[i]);
309
310 count = ncodes*4 + 2;
311 lastword = "";
312 for(i=0; i<nwords; i++) {
313 word = words[i].word;
314 thisword = word;
315 for(j=0; *thisword == *lastword; j++) {
316 if(*thisword == 0) {
317 fprint(2, "identical words: %s\n", word);
318 break;
319 }
320 thisword++;
321 lastword++;
322 }
323 if(j > 15)
324 j = 15;
325 encode = words[i].encode;
326 c = (1<<15) | (j<<11) | encode;
327 sput(c);
328 count += 2;
329 for(thisword=word+j; c = *thisword; thisword++) {
330 Bputc(&bout, c);
331 count++;
332 }
333 lastword = word;
334 }
335 fprint(2, "output bytes = %ld\n", count);
336 }
337