1 #include "tdef.h"
2 #include "fns.h"
3 #include "ext.h"
4 #include <assert.h>
5
6 #define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */
7 /* this value is used (as a literal) in suftab.c */
8 /* to encode possible hyphenation points in suffixes. */
9 /* it could be changed, by widening the tables */
10 /* to be shorts instead of chars. */
11
12 /*
13 * troff8.c
14 *
15 * hyphenation
16 */
17
18 int hexsize = 0; /* hyphenation exception list size */
19 char *hbufp = NULL; /* base of list */
20 char *nexth = NULL; /* first free slot in list */
21 Tchar *hyend;
22
23 #define THRESH 160 /* digram goodness threshold */
24 int thresh = THRESH;
25
26 int texhyphen(void);
27 static int alpha(Tchar);
28
hyphen(Tchar * wp)29 void hyphen(Tchar *wp)
30 {
31 int j;
32 Tchar *i;
33
34 i = wp;
35 while (punct((*i++)))
36 ;
37 if (!alpha(*--i))
38 return;
39 wdstart = i++;
40 while (alpha(*i++))
41 ;
42 hyend = wdend = --i - 1;
43 while (punct((*i++)))
44 ;
45 if (*--i)
46 return;
47 if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */
48 return;
49 hyp = hyptr;
50 *hyp = 0;
51 hyoff = 2;
52
53 /* for now, try exceptions first, then tex (if hyphalg is non-zero),
54 then suffix and digram if tex didn't hyphenate it at all.
55 */
56
57 if (!exword() && !texhyphen() && !suffix())
58 digram();
59
60 /* this appears to sort hyphenation points into increasing order */
61 *hyp++ = 0;
62 if (*hyptr)
63 for (j = 1; j; ) {
64 j = 0;
65 for (hyp = hyptr + 1; *hyp != 0; hyp++) {
66 if (*(hyp - 1) > *hyp) {
67 j++;
68 i = *hyp;
69 *hyp = *(hyp - 1);
70 *(hyp - 1) = i;
71 }
72 }
73 }
74 }
75
alpha(Tchar i)76 static alpha(Tchar i) /* non-zero if really alphabetic */
77 {
78 if (ismot(i))
79 return 0;
80 else if (cbits(i) >= ALPHABET) /* this isn't very elegant, but there's */
81 return 0; /* no good way to make sure i is in range for */
82 else /* the call of isalpha */
83 return isalpha(cbits(i));
84 }
85
86
punct(Tchar i)87 punct(Tchar i)
88 {
89 if (!i || alpha(i))
90 return(0);
91 else
92 return(1);
93 }
94
95
caseha(void)96 void caseha(void) /* set hyphenation algorithm */
97 {
98 hyphalg = HYPHALG;
99 if (skip())
100 return;
101 noscale++;
102 hyphalg = atoi0();
103 noscale = 0;
104 }
105
106
caseht(void)107 void caseht(void) /* set hyphenation threshold; not in manual! */
108 {
109 thresh = THRESH;
110 if (skip())
111 return;
112 noscale++;
113 thresh = atoi0();
114 noscale = 0;
115 }
116
117
growh(char * where)118 char *growh(char *where)
119 {
120 char *new;
121
122 hexsize += NHEX;
123 if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
124 return NULL;
125 if (new == hbufp) {
126 return where;
127 } else {
128 int diff;
129 diff = where - hbufp;
130 hbufp = new;
131 return new + diff;
132 }
133 }
134
135
casehw(void)136 void casehw(void)
137 {
138 int i, k;
139 char *j;
140 Tchar t;
141
142 if (nexth == NULL) {
143 if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
144 ERROR "No space for exception word list." WARN;
145 return;
146 }
147 hexsize = NHEX;
148 }
149 k = 0;
150 while (!skip()) {
151 if ((j = nexth) >= hbufp + hexsize - 2)
152 if ((j = nexth = growh(j)) == NULL)
153 goto full;
154 for (;;) {
155 if (ismot(t = getch()))
156 continue;
157 i = cbits(t);
158 if (i == ' ' || i == '\n') {
159 *j++ = 0;
160 nexth = j;
161 *j = 0;
162 if (i == ' ')
163 break;
164 else
165 return;
166 }
167 if (i == '-') {
168 k = HY_BIT;
169 continue;
170 }
171 *j++ = maplow(i) | k;
172 k = 0;
173 if (j >= hbufp + hexsize - 2)
174 if ((j = growh(j)) == NULL)
175 goto full;
176 }
177 }
178 return;
179 full:
180 ERROR "Cannot grow exception word list." WARN;
181 *nexth = 0;
182 }
183
184
exword(void)185 int exword(void)
186 {
187 Tchar *w;
188 char *e, *save;
189
190 e = hbufp;
191 while (1) {
192 save = e;
193 if (e == NULL || *e == 0)
194 return(0);
195 w = wdstart;
196 while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
197 e++;
198 w++;
199 }
200 if (!*e) {
201 if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
202 w = wdstart;
203 for (e = save; *e; e++) {
204 if (*e & HY_BIT)
205 *hyp++ = w;
206 if (hyp > hyptr + NHYP - 1)
207 hyp = hyptr + NHYP - 1;
208 w++;
209 }
210 return(1);
211 } else {
212 e++;
213 continue;
214 }
215 } else
216 while (*e++)
217 ;
218 }
219 }
220
221
suffix(void)222 suffix(void)
223 {
224 Tchar *w;
225 char *s, *s0;
226 Tchar i;
227 extern char *suftab[];
228
229 again:
230 i = cbits(*hyend);
231 if (!alpha(i))
232 return(0);
233 if (i < 'a')
234 i -= 'A' - 'a';
235 if ((s0 = suftab[i-'a']) == 0)
236 return(0);
237 for (;;) {
238 if ((i = *s0 & 017) == 0)
239 return(0);
240 s = s0 + i - 1;
241 w = hyend - 1;
242 while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
243 s--;
244 w--;
245 }
246 if (s == s0)
247 break;
248 s0 += i;
249 }
250 s = s0 + i - 1;
251 w = hyend;
252 if (*s0 & HY_BIT)
253 goto mark;
254 while (s > s0) {
255 w--;
256 if (*s-- & HY_BIT) {
257 mark:
258 hyend = w - 1;
259 if (*s0 & 0100) /* 0100 used in suftab to encode something too */
260 continue;
261 if (!chkvow(w))
262 return(0);
263 *hyp++ = w;
264 }
265 }
266 if (*s0 & 040)
267 return(0);
268 if (exword())
269 return(1);
270 goto again;
271 }
272
273
maplow(int i)274 maplow(int i)
275 {
276 if (isupper(i))
277 i = tolower(i);
278 return(i);
279 }
280
281
vowel(int i)282 vowel(int i)
283 {
284 switch (i) {
285 case 'a': case 'A':
286 case 'e': case 'E':
287 case 'i': case 'I':
288 case 'o': case 'O':
289 case 'u': case 'U':
290 case 'y': case 'Y':
291 return(1);
292 default:
293 return(0);
294 }
295 }
296
297
chkvow(Tchar * w)298 Tchar *chkvow(Tchar *w)
299 {
300 while (--w >= wdstart)
301 if (vowel(cbits(*w)))
302 return(w);
303 return(0);
304 }
305
306
digram(void)307 void digram(void)
308 {
309 int maxval, val;
310 Tchar *nhyend, *maxw, *w;
311 extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
312
313 maxw = 0;
314 again:
315 if (!(w = chkvow(hyend + 1)))
316 return;
317 hyend = w;
318 if (!(w = chkvow(hyend)))
319 return;
320 nhyend = w;
321 maxval = 0;
322 w--;
323 while (++w < hyend && w < wdend - 1) {
324 val = 1;
325 if (w == wdstart)
326 val *= dilook('a', cbits(*w), bxh);
327 else if (w == wdstart + 1)
328 val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
329 else
330 val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
331 val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
332 val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
333 if (val > maxval) {
334 maxval = val;
335 maxw = w + 1;
336 }
337 }
338 hyend = nhyend;
339 if (maxval > thresh)
340 *hyp++ = maxw;
341 goto again;
342 }
343
344
dilook(int a,int b,char t[26][13])345 dilook(int a, int b, char t[26][13])
346 {
347 int i, j;
348
349 i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
350 if (!(j & 01))
351 i >>= 4;
352 return(i & 017);
353 }
354
355
356 /* here beginneth the tex hyphenation code, as interpreted freely */
357 /* the main difference is that there is no attempt to squeeze space */
358 /* as tightly at tex does. */
359
360 static int texit(Tchar *, Tchar *);
361 static int readpats(void);
362 static void install(char *);
363 static void fixup(void);
364 static int trieindex(int, int);
365
366 static char pats[50000]; /* size ought to be computed dynamically */
367 static char *nextpat = pats;
368 static char *trie[27*27]; /* english-specific sizes */
369
texhyphen(void)370 int texhyphen(void)
371 {
372 static int loaded = 0; /* -1: couldn't find tex file */
373
374 if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */
375 return 0;
376 if (loaded == 0) {
377 if (readpats())
378 loaded = 1;
379 else
380 loaded = -1;
381 }
382 return texit(wdstart, wdend);
383 }
384
texit(Tchar * start,Tchar * end)385 static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */
386 {
387 int nw, i, k, equal, cnt[500];
388 char w[500+1], *np, *pp, *wp, *xpp, *xwp;
389
390 w[0] = '.';
391 for (nw = 1; start <= end && nw < 500-1; nw++, start++)
392 w[nw] = maplow(tolower(cbits(*start)));
393 start -= (nw - 1);
394 w[nw++] = '.';
395 w[nw] = 0;
396 /*
397 * printf("try %s\n", w);
398 */
399 for (i = 0; i <= nw; i++)
400 cnt[i] = '0';
401
402 for (wp = w; wp+1 < w+nw; wp++) {
403 for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
404 if (pp == 0 /* no trie entry */
405 || *pp != *wp /* no match on 1st letter */
406 || *(pp+1) != *(wp+1)) /* no match on 2nd letter */
407 break; /* so move to next letter of word */
408 equal = 1;
409 for (xpp = pp+2, xwp = wp+2; *xpp; )
410 if (*xpp++ != *xwp++) {
411 equal = 0;
412 break;
413 }
414 if (equal) {
415 np = xpp+1; /* numpat */
416 for (k = wp-w; *np; k++, np++)
417 if (*np > cnt[k])
418 cnt[k] = *np;
419 /*
420 * printf("match: %s %s\n", pp, xpp+1);
421 */
422 }
423 pp += *(pp-1); /* skip over pattern and numbers to next */
424 }
425 }
426 /*
427 * for (i = 0; i < nw; i++) printf("%c", w[i]);
428 * printf(" ");
429 * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
430 * printf("\n");
431 */
432 /*
433 * for (i = 1; i < nw - 1; i++) {
434 * if (i > 2 && i < nw - 3 && cnt[i] % 2)
435 * printf("-");
436 * if (cbits(start[i-1]) != '.')
437 * printf("%c", cbits(start[i-1]));
438 * }
439 * printf("\n");
440 */
441 for (i = 1; i < nw -1; i++)
442 if (i > 2 && i < nw - 3 && cnt[i] % 2)
443 *hyp++ = start + i - 1;
444 return hyp - hyptr; /* non-zero if a hyphen was found */
445 }
446
447 /*
448 This code assumes that hyphen.tex looks like
449 % some comments
450 \patterns{ % more comments
451 pat5ter4ns, 1 per line, SORTED, nothing else
452 }
453 more goo
454 \hyphenation{ % more comments
455 ex-cep-tions, one per line; i ignore this part for now
456 }
457
458 this code is NOT robust against variations. unfortunately,
459 it looks like every local language version of this file has
460 a different format. i have also made no provision for weird
461 characters. sigh.
462 */
463
readpats(void)464 static int readpats(void)
465 {
466 FILE *fp;
467 char buf[200], buf1[200];
468
469 if ((fp = fopen(TEXHYPHENS, "r")) == NULL
470 && (fp = fopen(DWBalthyphens, "r")) == NULL) {
471 ERROR "warning: can't find hyphen.tex" WARN;
472 return 0;
473 }
474
475 while (fgets(buf, sizeof buf, fp) != NULL) {
476 sscanf(buf, "%s", buf1);
477 if (strcmp(buf1, "\\patterns{") == 0)
478 break;
479 }
480 while (fgets(buf, sizeof buf, fp) != NULL) {
481 if (buf[0] == '}')
482 break;
483 install(buf);
484 }
485 fclose(fp);
486 fixup();
487 return 1;
488 }
489
install(char * s)490 static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */
491 {
492 int npat, lastpat;
493 char num[500], *onextpat = nextpat;
494
495 num[0] = '0';
496 *nextpat++ = ' '; /* fill in with count later */
497 for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
498 if (isdigit(*s)) {
499 num[npat] = *s;
500 lastpat = npat;
501 } else {
502 *nextpat++ = *s;
503 npat++;
504 num[npat] = '0';
505 }
506 }
507 *nextpat++ = 0;
508 if (nextpat > pats + sizeof(pats)-20) {
509 ERROR "tex hyphenation table overflow, tail end ignored" WARN;
510 nextpat = onextpat;
511 }
512 num[lastpat+1] = 0;
513 strcat(nextpat, num);
514 nextpat += strlen(nextpat) + 1;
515 }
516
fixup(void)517 static void fixup(void) /* build indexes of where . a b c ... start */
518 {
519 char *p, *lastc;
520 int n;
521
522 for (lastc = pats, p = pats+1; p < nextpat; p++)
523 if (*p == ' ') {
524 *lastc = p - lastc;
525 lastc = p;
526 }
527 *lastc = p - lastc;
528 for (p = pats+1; p < nextpat; ) {
529 n = trieindex(p[0], p[1]);
530 if (trie[n] == 0)
531 trie[n] = p;
532 p += p[-1];
533 }
534 /* printf("pats = %d\n", nextpat - pats); */
535 }
536
trieindex(int d1,int d2)537 static int trieindex(int d1, int d2)
538 {
539 int i;
540
541 i = 27*(d1 == '.'? 0: d1 - 'a' + 1) + (d2 == '.'? 0: d2 - 'a' + 1);
542 assert(0 <= i && i < 27*27);
543 return i;
544 }
545