xref: /plan9/sys/src/cmd/troff/n8.c (revision 14f51593fd82e19ba95969a8c07ff71131015979)
1 #include "tdef.h"
2 #include "fns.h"
3 #include "ext.h"
4 #include <assert.h>
5 
6 #define	HY_BIT	0200	/* stuff in here only works for 7-bit ascii */
7 			/* this value is used (as a literal) in suftab.c */
8 			/* to encode possible hyphenation points in suffixes. */
9 			/* it could be changed, by widening the tables */
10 			/* to be shorts instead of chars. */
11 
12 /*
13  * troff8.c
14  *
15  * hyphenation
16  */
17 
18 int	hexsize = 0;		/* hyphenation exception list size */
19 char	*hbufp = NULL;		/* base of list */
20 char	*nexth = NULL;		/* first free slot in list */
21 Tchar	*hyend;
22 
23 #define THRESH 160 		/* digram goodness threshold */
24 int	thresh = THRESH;
25 
26 int	texhyphen(void);
27 static	int	alpha(Tchar);
28 
hyphen(Tchar * wp)29 void hyphen(Tchar *wp)
30 {
31 	int j;
32 	Tchar *i;
33 
34 	i = wp;
35 	while (punct((*i++)))
36 		;
37 	if (!alpha(*--i))
38 		return;
39 	wdstart = i++;
40 	while (alpha(*i++))
41 		;
42 	hyend = wdend = --i - 1;
43 	while (punct((*i++)))
44 		;
45 	if (*--i)
46 		return;
47 	if (wdend - wdstart < 4)	/* 4 chars is too short to hyphenate */
48 		return;
49 	hyp = hyptr;
50 	*hyp = 0;
51 	hyoff = 2;
52 
53 	/* for now, try exceptions first, then tex (if hyphalg is non-zero),
54 	   then suffix and digram if tex didn't hyphenate it at all.
55 	*/
56 
57 	if (!exword() && !texhyphen() && !suffix())
58 		digram();
59 
60 	/* this appears to sort hyphenation points into increasing order */
61 	*hyp++ = 0;
62 	if (*hyptr)
63 		for (j = 1; j; ) {
64 			j = 0;
65 			for (hyp = hyptr + 1; *hyp != 0; hyp++) {
66 				if (*(hyp - 1) > *hyp) {
67 					j++;
68 					i = *hyp;
69 					*hyp = *(hyp - 1);
70 					*(hyp - 1) = i;
71 				}
72 			}
73 		}
74 }
75 
alpha(Tchar i)76 static alpha(Tchar i)	/* non-zero if really alphabetic */
77 {
78 	if (ismot(i))
79 		return 0;
80 	else if (cbits(i) >= ALPHABET)	/* this isn't very elegant, but there's */
81 		return 0;		/* no good way to make sure i is in range for */
82 	else				/* the call of isalpha */
83 		return isalpha(cbits(i));
84 }
85 
86 
punct(Tchar i)87 punct(Tchar i)
88 {
89 	if (!i || alpha(i))
90 		return(0);
91 	else
92 		return(1);
93 }
94 
95 
caseha(void)96 void caseha(void)	/* set hyphenation algorithm */
97 {
98 	hyphalg = HYPHALG;
99 	if (skip())
100 		return;
101 	noscale++;
102 	hyphalg = atoi0();
103 	noscale = 0;
104 }
105 
106 
caseht(void)107 void caseht(void)	/* set hyphenation threshold;  not in manual! */
108 {
109 	thresh = THRESH;
110 	if (skip())
111 		return;
112 	noscale++;
113 	thresh = atoi0();
114 	noscale = 0;
115 }
116 
117 
growh(char * where)118 char *growh(char *where)
119 {
120 	char *new;
121 
122 	hexsize += NHEX;
123 	if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
124 		return NULL;
125 	if (new == hbufp) {
126 		return where;
127 	} else {
128 		int diff;
129 		diff = where - hbufp;
130 		hbufp = new;
131 		return new + diff;
132 	}
133 }
134 
135 
casehw(void)136 void casehw(void)
137 {
138 	int i, k;
139 	char *j;
140 	Tchar t;
141 
142 	if (nexth == NULL) {
143 		if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
144 			ERROR "No space for exception word list." WARN;
145 			return;
146 		}
147 		hexsize = NHEX;
148 	}
149 	k = 0;
150 	while (!skip()) {
151 		if ((j = nexth) >= hbufp + hexsize - 2)
152 			if ((j = nexth = growh(j)) == NULL)
153 				goto full;
154 		for (;;) {
155 			if (ismot(t = getch()))
156 				continue;
157 			i = cbits(t);
158 			if (i == ' ' || i == '\n') {
159 				*j++ = 0;
160 				nexth = j;
161 				*j = 0;
162 				if (i == ' ')
163 					break;
164 				else
165 					return;
166 			}
167 			if (i == '-') {
168 				k = HY_BIT;
169 				continue;
170 			}
171 			*j++ = maplow(i) | k;
172 			k = 0;
173 			if (j >= hbufp + hexsize - 2)
174 				if ((j = growh(j)) == NULL)
175 					goto full;
176 		}
177 	}
178 	return;
179 full:
180 	ERROR "Cannot grow exception word list." WARN;
181 	*nexth = 0;
182 }
183 
184 
exword(void)185 int exword(void)
186 {
187 	Tchar *w;
188 	char *e, *save;
189 
190 	e = hbufp;
191 	while (1) {
192 		save = e;
193 		if (e == NULL || *e == 0)
194 			return(0);
195 		w = wdstart;
196 		while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
197 			e++;
198 			w++;
199 		}
200 		if (!*e) {
201 			if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
202 				w = wdstart;
203 				for (e = save; *e; e++) {
204 					if (*e & HY_BIT)
205 						*hyp++ = w;
206 					if (hyp > hyptr + NHYP - 1)
207 						hyp = hyptr + NHYP - 1;
208 					w++;
209 				}
210 				return(1);
211 			} else {
212 				e++;
213 				continue;
214 			}
215 		} else
216 			while (*e++)
217 				;
218 	}
219 }
220 
221 
suffix(void)222 suffix(void)
223 {
224 	Tchar *w;
225 	char *s, *s0;
226 	Tchar i;
227 	extern char *suftab[];
228 
229 again:
230 	i = cbits(*hyend);
231 	if (!alpha(i))
232 		return(0);
233 	if (i < 'a')
234 		i -= 'A' - 'a';
235 	if ((s0 = suftab[i-'a']) == 0)
236 		return(0);
237 	for (;;) {
238 		if ((i = *s0 & 017) == 0)
239 			return(0);
240 		s = s0 + i - 1;
241 		w = hyend - 1;
242 		while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
243 			s--;
244 			w--;
245 		}
246 		if (s == s0)
247 			break;
248 		s0 += i;
249 	}
250 	s = s0 + i - 1;
251 	w = hyend;
252 	if (*s0 & HY_BIT)
253 		goto mark;
254 	while (s > s0) {
255 		w--;
256 		if (*s-- & HY_BIT) {
257 mark:
258 			hyend = w - 1;
259 			if (*s0 & 0100)	/* 0100 used in suftab to encode something too */
260 				continue;
261 			if (!chkvow(w))
262 				return(0);
263 			*hyp++ = w;
264 		}
265 	}
266 	if (*s0 & 040)
267 		return(0);
268 	if (exword())
269 		return(1);
270 	goto again;
271 }
272 
273 
maplow(int i)274 maplow(int i)
275 {
276 	if (isupper(i))
277 		i = tolower(i);
278 	return(i);
279 }
280 
281 
vowel(int i)282 vowel(int i)
283 {
284 	switch (i) {
285 	case 'a': case 'A':
286 	case 'e': case 'E':
287 	case 'i': case 'I':
288 	case 'o': case 'O':
289 	case 'u': case 'U':
290 	case 'y': case 'Y':
291 		return(1);
292 	default:
293 		return(0);
294 	}
295 }
296 
297 
chkvow(Tchar * w)298 Tchar *chkvow(Tchar *w)
299 {
300 	while (--w >= wdstart)
301 		if (vowel(cbits(*w)))
302 			return(w);
303 	return(0);
304 }
305 
306 
digram(void)307 void digram(void)
308 {
309 	int maxval, val;
310 	Tchar *nhyend, *maxw, *w;
311 	extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
312 
313 	maxw = 0;
314 again:
315 	if (!(w = chkvow(hyend + 1)))
316 		return;
317 	hyend = w;
318 	if (!(w = chkvow(hyend)))
319 		return;
320 	nhyend = w;
321 	maxval = 0;
322 	w--;
323 	while (++w < hyend && w < wdend - 1) {
324 		val = 1;
325 		if (w == wdstart)
326 			val *= dilook('a', cbits(*w), bxh);
327 		else if (w == wdstart + 1)
328 			val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
329 		else
330 			val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
331 		val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
332 		val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
333 		if (val > maxval) {
334 			maxval = val;
335 			maxw = w + 1;
336 		}
337 	}
338 	hyend = nhyend;
339 	if (maxval > thresh)
340 		*hyp++ = maxw;
341 	goto again;
342 }
343 
344 
dilook(int a,int b,char t[26][13])345 dilook(int a, int b, char t[26][13])
346 {
347 	int i, j;
348 
349 	i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
350 	if (!(j & 01))
351 		i >>= 4;
352 	return(i & 017);
353 }
354 
355 
356 /* here beginneth the tex hyphenation code, as interpreted freely */
357 /* the main difference is that there is no attempt to squeeze space */
358 /* as tightly at tex does. */
359 
360 static int	texit(Tchar *, Tchar *);
361 static int	readpats(void);
362 static void	install(char *);
363 static void	fixup(void);
364 static int	trieindex(int, int);
365 
366 static char	pats[50000];	/* size ought to be computed dynamically */
367 static char	*nextpat = pats;
368 static char	*trie[27*27];	/* english-specific sizes */
369 
texhyphen(void)370 int texhyphen(void)
371 {
372 	static int loaded = 0;		/* -1: couldn't find tex file */
373 
374 	if (hyphalg == 0 || loaded == -1)	/* non-zero => tex for now */
375 		return 0;
376 	if (loaded == 0) {
377 		if (readpats())
378 			loaded = 1;
379 		else
380 			loaded = -1;
381 	}
382 	return texit(wdstart, wdend);
383 }
384 
texit(Tchar * start,Tchar * end)385 static int texit(Tchar *start, Tchar *end)	/* hyphenate as in tex, return # found */
386 {
387 	int nw, i, k, equal, cnt[500];
388 	char w[500+1], *np, *pp, *wp, *xpp, *xwp;
389 
390 	w[0] = '.';
391 	for (nw = 1; start <= end && nw < 500-1; nw++, start++)
392 		w[nw] = maplow(tolower(cbits(*start)));
393 	start -= (nw - 1);
394 	w[nw++] = '.';
395 	w[nw] = 0;
396 /*
397  * printf("try %s\n", w);
398 */
399 	for (i = 0; i <= nw; i++)
400 		cnt[i] = '0';
401 
402 	for (wp = w; wp+1 < w+nw; wp++) {
403 		for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
404 			if (pp == 0		/* no trie entry */
405 			 || *pp != *wp		/* no match on 1st letter */
406 			 || *(pp+1) != *(wp+1))	/* no match on 2nd letter */
407 				break;		/*   so move to next letter of word */
408 			equal = 1;
409 			for (xpp = pp+2, xwp = wp+2; *xpp; )
410 				if (*xpp++ != *xwp++) {
411 					equal = 0;
412 					break;
413 				}
414 			if (equal) {
415 				np = xpp+1;	/* numpat */
416 				for (k = wp-w; *np; k++, np++)
417 					if (*np > cnt[k])
418 						cnt[k] = *np;
419 /*
420  * printf("match: %s  %s\n", pp, xpp+1);
421 */
422 			}
423 			pp += *(pp-1);	/* skip over pattern and numbers to next */
424 		}
425 	}
426 /*
427  * for (i = 0; i < nw; i++) printf("%c", w[i]);
428  * printf("  ");
429  * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
430  * printf("\n");
431 */
432 /*
433  * 	for (i = 1; i < nw - 1; i++) {
434  * 		if (i > 2 && i < nw - 3 && cnt[i] % 2)
435  * 			printf("-");
436  * 		if (cbits(start[i-1]) != '.')
437  * 			printf("%c", cbits(start[i-1]));
438  * 	}
439  * 	printf("\n");
440 */
441 	for (i = 1; i < nw -1; i++)
442 		if (i > 2 && i < nw - 3 && cnt[i] % 2)
443 			*hyp++ = start + i - 1;
444 	return hyp - hyptr;	/* non-zero if a hyphen was found */
445 }
446 
447 /*
448 	This code assumes that hyphen.tex looks like
449 		% some comments
450 		\patterns{ % more comments
451 		pat5ter4ns, 1 per line, SORTED, nothing else
452 		}
453 		more goo
454 		\hyphenation{ % more comments
455 		ex-cep-tions, one per line; i ignore this part for now
456 		}
457 
458 	this code is NOT robust against variations.  unfortunately,
459 	it looks like every local language version of this file has
460 	a different format.  i have also made no provision for weird
461 	characters.  sigh.
462 */
463 
readpats(void)464 static int readpats(void)
465 {
466 	FILE *fp;
467 	char buf[200], buf1[200];
468 
469 	if ((fp = fopen(TEXHYPHENS, "r")) == NULL
470 	 && (fp = fopen(DWBalthyphens, "r")) == NULL) {
471 		ERROR "warning: can't find hyphen.tex" WARN;
472 		return 0;
473 	}
474 
475 	while (fgets(buf, sizeof buf, fp) != NULL) {
476 		sscanf(buf, "%s", buf1);
477 		if (strcmp(buf1, "\\patterns{") == 0)
478 			break;
479 	}
480 	while (fgets(buf, sizeof buf, fp) != NULL) {
481 		if (buf[0] == '}')
482 			break;
483 		install(buf);
484 	}
485 	fclose(fp);
486 	fixup();
487 	return 1;
488 }
489 
install(char * s)490 static void install(char *s)	/* map ab4c5de to: 12 abcde \0 00405 \0 */
491 {
492 	int npat, lastpat;
493 	char num[500], *onextpat = nextpat;
494 
495 	num[0] = '0';
496 	*nextpat++ = ' ';	/* fill in with count later */
497 	for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
498 		if (isdigit(*s)) {
499 			num[npat] = *s;
500 			lastpat = npat;
501 		} else {
502 			*nextpat++ = *s;
503 			npat++;
504 			num[npat] = '0';
505 		}
506 	}
507 	*nextpat++ = 0;
508 	if (nextpat > pats + sizeof(pats)-20) {
509 		ERROR "tex hyphenation table overflow, tail end ignored" WARN;
510 		nextpat = onextpat;
511 	}
512 	num[lastpat+1] = 0;
513 	strcat(nextpat, num);
514 	nextpat += strlen(nextpat) + 1;
515 }
516 
fixup(void)517 static void fixup(void)	/* build indexes of where . a b c ... start */
518 {
519 	char *p, *lastc;
520 	int n;
521 
522 	for (lastc = pats, p = pats+1; p < nextpat; p++)
523 		if (*p == ' ') {
524 			*lastc = p - lastc;
525 			lastc = p;
526 		}
527 	*lastc = p - lastc;
528 	for (p = pats+1; p < nextpat; ) {
529 		n = trieindex(p[0], p[1]);
530 		if (trie[n] == 0)
531 			trie[n] = p;
532 		p += p[-1];
533 	}
534 	/* printf("pats = %d\n", nextpat - pats); */
535 }
536 
trieindex(int d1,int d2)537 static int trieindex(int d1, int d2)
538 {
539 	int i;
540 
541 	i = 27*(d1 == '.'? 0: d1 - 'a' + 1) + (d2 == '.'? 0: d2 - 'a' + 1);
542 	assert(0 <= i && i < 27*27);
543 	return i;
544 }
545