xref: /plan9/sys/src/cmd/troff2html/troff2html.c (revision b09c09c56326c85e62a02c7f2061cadadc3e620b)
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 
/* fixed limits */
enum{
	Nfont = 11,	/* number of font positions; size of font[] and bound checked for 'f' commands */
	Wid = 20,	/* tmac.anhtml sets page width to 20" so we can recognize .nf text */
};
9 
/*
 * Char is one entry of the buffered output: a rune or'ed with attribute
 * bits, a magic marker, or (right after an Estring marker) a char* cast
 * to an integer.
 */
typedef uintptr Char;
typedef struct Troffchar Troffchar;
typedef struct Htmlchar Htmlchar;
typedef struct Font Font;
typedef struct HTMLfont HTMLfont;
15 
/*
 * a Char is >= 32 bits. low 16 bits are the rune. higher are attributes.
 * must be able to hold a pointer.
 *
 * The values below are bit positions, used as 1<<Italic etc., and also
 * index onattr[] and offattr[].
 */
enum
{
	Italic	=	16,
	Bold,
	CW,
	Indent1,
	Indent2,
	Indent3,
	Heading =	25,
	Anchor =	26,	/* must be last */
};
31 
enum	/* magic emissions */
{
	Estring = 0,	/* the next Char in the buffer is a char* to print as is */
	Epp = 1<<16,	/* flush() prints a small spacer table here (emitted for manPP) */
};
37 
/* order in which setattr() opens attributes that are newly turned on */
int attrorder[] = { Indent1, Indent2, Indent3, Heading, Anchor, Italic, Bold, CW };

/* stack of attribute bit numbers currently open in the output, outermost first */
int nest[10];
int nnest;	/* number of entries in use in nest[] */
42 
/* one entry of the troffchars[] table searched by troffchar() */
struct Troffchar
{
	char *name;	/* key compared against the name read after a 'C' command */
	char *value;	/* string emitted in place of that name */
};
48 
/* one entry of the htmlchars[] table searched by lookup() */
struct Htmlchar
{
	char *utf;	/* UTF-8 text; main() decodes its first rune into value */
	char *name;	/* string emitted by emithtmlchar() instead of the rune */
	int value;	/* rune value, filled in by main(); sort and search key */
};
55 
56 #include "chars.h"
57 
/* a troff font mounted on one position of font[] */
struct Font{
	char		*name;		/* malloc'ed copy of the troff font name */
	HTMLfont	*htmlfont;	/* entry of htmlfonts[] chosen by htmlfont() */
};
62 
/* how one troff font name is represented in the HTML output */
struct HTMLfont{
	char	*name;		/* troff font name matched by htmlfont() */
	char	*htmlname;	/* HTML tag name; not referenced by the code visible in this file */
	int	bit;		/* attribute bit number set while the font is current; 0 means none */
};
68 
/* R must be first; it's the default representation for fonts we don't recognize */
/* the list ends at the entry whose name is nil */
HTMLfont htmlfonts[] =
{
	"R",		nil,	0,
	"LucidaSans",	nil,	0,
	"I",		"i",	Italic,
	"LucidaSansI",	"i",	Italic,
	"CW",		"tt",	CW,
	"LucidaCW",	"tt",	CW,
	nil,	nil,
};
80 
#define TABLE "<table border=0 cellpadding=0 cellspacing=0>"

/*
 * Text that opens each attribute, indexed by attribute bit number.
 * A leading "<+" is not printed literally: iputs() prints the tag on
 * its own line and increases the output indent level.
 * setattr() overwrites the Anchor slot with the next saved anchor tag.
 */
char*
onattr[8*sizeof(int)] =
{
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	"<i>",			/* italic */
	"<b>",			/* bold */
	"<tt><font size=+1>",	/* cw */
	"<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n",		/* indent1 */
	"<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n",		/* indent2 */
	"<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n",		/* indent3 */
	0,
	0,
	0,
	"<p><font size=+1><b>",	/* heading 25 */
	"<unused>",		/* anchor 26 */
};
100 
/*
 * Text that closes each attribute, indexed by attribute bit number.
 * A leading "<-" makes iputs() decrease the output indent level and
 * print the tag on its own line.
 */
char*
offattr[8*sizeof(int)] =
{
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	"</i>",			/* italic */
	"</b>",			/* bold */
	"</font></tt>",		/* cw */
	"<-/table>",		/* indent1 */
	"<-/table>",		/* indent2 */
	"<-/table>",		/* indent3 */
	0,
	0,
	0,
	"</b></font>",		/* heading 25 */
	"</a>",			/* anchor 26 */
};
118 
Font	*font[Nfont];	/* mounted fonts by position; nil until mountfont() fills the slot */

Biobuf	bout;		/* buffered standard output; all HTML goes here */
int	debug = 0;	/* incremented by each -d flag */

/* troff state */
int	page = 1;
int	ft = 1;		/* current font position */
int	vp = 0;		/* vertical position */
int	hp = 0;		/* horizontal position */
int	ps = 1;		/* point size */
int	res = 720;	/* resolution the input must declare in its "x r" command */

int	didP = 0;	/* set when a paragraph break was just emitted; suppresses the <br> at end of line */
int	atnewline = 1;	/* nothing has been placed on the current output line yet */
int	prevlineH = 0;	/* hp at which the previous line started; used by indent() */
Char	attr = 0;	/* or'ed into each Char */

Char	*chars;		/* buffered output, written out by flush() */
int	nchars;		/* entries used in chars */
int	nalloc;		/* entries allocated in chars */
char**	anchors;	/* allocated in order */
int	nanchors;	/* count while reading; reset and reused as a cursor by flush()/setattr() */

char	*filename;	/* name of the input being processed, for diagnostics */
int	cno;		/* count of characters read from the current input, for diagnostics */
char	buf[8192];	/* shared scratch buffer returned by getline() and getstr() */
char	*title = "Plan 9 man page";	/* page title; replaced by -t */

void	process(Biobuf*, char*);
void	mountfont(int, char*);
void	switchfont(int);
void	header(char*);
void	flush(void);
void	trailer(void);
154 
155 void*
emalloc(ulong n)156 emalloc(ulong n)
157 {
158 	void *p;
159 
160 	p = malloc(n);
161 	if(p == nil)
162 		sysfatal("malloc failed: %r");
163 	return p;
164 }
165 
166 void*
erealloc(void * p,ulong n)167 erealloc(void *p, ulong n)
168 {
169 
170 	p = realloc(p, n);
171 	if(p == nil)
172 		sysfatal("realloc failed: %r");
173 	return p;
174 }
175 
176 char*
estrdup(char * s)177 estrdup(char *s)
178 {
179 	char *t;
180 
181 	t = strdup(s);
182 	if(t == nil)
183 		sysfatal("strdup failed: %r");
184 	return t;
185 }
186 
/* Print the command synopsis on file descriptor 2 and exit with status "usage". */
void
usage(void)
{
	char *synopsis;

	synopsis = "usage: troff2html [-d] [-t title] [file ...]\n";
	fprint(2, synopsis);
	exits("usage");
}
193 
194 int
hccmp(const void * va,const void * vb)195 hccmp(const void *va, const void *vb)
196 {
197 	Htmlchar *a, *b;
198 
199 	a = (Htmlchar*)va;
200 	b = (Htmlchar*)vb;
201 	return a->value - b->value;
202 }
203 
204 void
main(int argc,char * argv[])205 main(int argc, char *argv[])
206 {
207 	int i;
208 	Biobuf in, *inp;
209 	Rune r;
210 
211 	for(i=0; i<nelem(htmlchars); i++){
212 		chartorune(&r, htmlchars[i].utf);
213 		htmlchars[i].value = r;
214 	}
215 	qsort(htmlchars, nelem(htmlchars), sizeof(htmlchars[0]), hccmp);
216 
217 	ARGBEGIN{
218 	case 't':
219 		title = ARGF();
220 		if(title == nil)
221 			usage();
222 		break;
223 	case 'd':
224 		debug++;
225 		break;
226 	default:
227 		usage();
228 	}ARGEND
229 
230 	Binit(&bout, 1, OWRITE);
231 	if(argc == 0){
232 		header(title);
233 		Binit(&in, 0, OREAD);
234 		process(&in, "<stdin>");
235 	}else{
236 		header(title);
237 		for(i=0; i<argc; i++){
238 			inp = Bopen(argv[i], OREAD);
239 			if(inp == nil)
240 				sysfatal("can't open %s: %r", argv[i]);
241 			process(inp, argv[i]);
242 			Bterm(inp);
243 		}
244 	}
245 	flush();
246 	trailer();
247 	exits(nil);
248 }
249 
250 void
emitchar(Char c)251 emitchar(Char c)
252 {
253 	if(nalloc == nchars){
254 		nalloc += 10000;
255 		chars = realloc(chars, nalloc*sizeof(chars[0]));
256 		if(chars == nil)
257 			sysfatal("malloc failed: %r");
258 	}
259 	chars[nchars++] = c;
260 }
261 
262 void
emit(Rune r)263 emit(Rune r)
264 {
265 	emitchar(r | attr);
266 	/*
267 	 * Close man page references early, so that
268 	 * .IR proof (1),
269 	 * doesn't make the comma part of the link.
270 	 */
271 	if(r == ')')
272 		attr &= ~(1<<Anchor);
273 }
274 
275 void
emitstr(char * s)276 emitstr(char *s)
277 {
278 	emitchar(Estring);
279 	emitchar((Char)s);
280 }
281 
int indentlevel;	/* number of four-space indents iputrune() prints after each newline */
int linelen;		/* runes written since the last newline; drives line wrapping in iputrune() */
284 
285 void
iputrune(Biobuf * b,Rune r)286 iputrune(Biobuf *b, Rune r)
287 {
288 	int i;
289 
290 	if(linelen++ > 60 && r == ' ')
291 		r = '\n';
292 	Bputrune(b, r);
293 	if(r == '\n'){
294 		for(i=0; i<indentlevel; i++)
295 			Bprint(b, "    ");
296 		linelen = 0;
297 	}
298 }
299 
300 void
iputs(Biobuf * b,char * s)301 iputs(Biobuf *b, char *s)
302 {
303 	if(s[0]=='<' && s[1]=='+'){
304 		iputrune(b, '\n');
305 		Bprint(b, "<%s", s+2);
306 		indentlevel++;
307 		iputrune(b, '\n');
308 	}else if(s[0]=='<' && s[1]=='-'){
309 		indentlevel--;
310 		iputrune(b, '\n');
311 		Bprint(b, "<%s", s+2);
312 		iputrune(b, '\n');
313 	}else
314 		Bprint(b, "%s", s);
315 }
316 
317 void
setattr(Char a)318 setattr(Char a)
319 {
320 	Char on, off;
321 	int i, j;
322 
323 	on = a & ~attr;
324 	off = attr & ~a;
325 
326 	/* walk up the nest stack until we reach something we need to turn off. */
327 	for(i=0; i<nnest; i++)
328 		if(off&(1<<nest[i]))
329 			break;
330 
331 	/* turn off everything above that */
332 	for(j=nnest-1; j>=i; j--)
333 		iputs(&bout, offattr[nest[j]]);
334 
335 	/* turn on everything we just turned off but didn't want to */
336 	for(j=i; j<nnest; j++)
337 		if(a&(1<<nest[j]))
338 			iputs(&bout, onattr[nest[j]]);
339 		else
340 			nest[j] = 0;
341 
342 	/* shift the zeros (turned off things) up */
343 	for(i=j=0; i<nnest; i++)
344 		if(nest[i] != 0)
345 			nest[j++] = nest[i];
346 	nnest = j;
347 
348 	/* now turn on the new attributes */
349 	for(i=0; i<nelem(attrorder); i++){
350 		j = attrorder[i];
351 		if(on&(1<<j)){
352 			if(j == Anchor)
353 				onattr[j] = anchors[nanchors++];
354 			iputs(&bout, onattr[j]);
355 			if(nnest >= nelem(nest))
356 				sysfatal("nesting too deep");
357 			nest[nnest++] = j;
358 		}
359 	}
360 	attr = a;
361 }
362 
363 void
flush(void)364 flush(void)
365 {
366 	int i;
367 	Char c, a;
368 
369 	nanchors = 0;
370 	for(i=0; i<nchars; i++){
371 		c = chars[i];
372 		if(c == Estring){
373 			/* next word is string to print */
374 			iputs(&bout, (char*)chars[++i]);
375 			continue;
376 		}
377 		if(c == Epp){
378 			iputrune(&bout, '\n');
379 			iputs(&bout, TABLE "<tr height=5><td></table>");
380 			iputrune(&bout, '\n');
381 			continue;
382 		}
383 		a = c & ~0xFFFF;
384 		c &= 0xFFFF;
385 		/*
386 		 * If we're going to something off after a space,
387 		 * let's just turn it off before.
388 		 */
389 		if(c == ' ' && i<nchars-1 && (chars[i+1]&0xFFFF) >= 32)
390 			a ^= a & ~chars[i+1];
391 		setattr(a);
392 		iputrune(&bout, c & 0xFFFF);
393 	}
394 }
395 
396 void
header(char * s)397 header(char *s)
398 {
399 	Bprint(&bout, "<head>\n");
400 	Bprint(&bout, "<title>%s</title>\n", s);
401 	Bprint(&bout, "<meta content=\"text/html; charset=utf-8\" http-equiv=Content-Type>\n");
402 	Bprint(&bout, "</head>\n");
403 	Bprint(&bout, "<body bgcolor=#ffffff>\n");
404 }
405 
406 void
trailer(void)407 trailer(void)
408 {
409 
410 #ifdef LUCENT
411 	Tm *t;
412 	t = localtime(time(nil));
413 	Bprint(&bout, TABLE "<tr height=20><td></table>\n");
414 	Bprint(&bout, "<font size=-1><a href=\"http://www.lucent.com/copyright.html\">\n");
415 	Bprint(&bout, "Copyright</A> &#169; %d Alcatel-Lucent.  All rights reserved.</font>\n", t->year+1900);
416 #endif
417 	Bprint(&bout, "</body></html>\n");
418 }
419 
420 int
getc(Biobuf * b)421 getc(Biobuf *b)
422 {
423 	cno++;
424 	return Bgetrune(b);
425 }
426 
427 void
ungetc(Biobuf * b)428 ungetc(Biobuf *b)
429 {
430 	cno--;
431 	Bungetrune(b);
432 }
433 
434 char*
getline(Biobuf * b)435 getline(Biobuf *b)
436 {
437 	int i, c;
438 
439 	for(i=0; i<sizeof buf; i++){
440 		c = getc(b);
441 		if(c == Beof)
442 			return nil;
443 		buf[i] = c;
444 		if(c == '\n'){
445 			buf[i] = '\0';
446 			break;
447 		}
448 	}
449 	return buf;
450 }
451 
452 int
getnum(Biobuf * b)453 getnum(Biobuf *b)
454 {
455 	int i, c;
456 
457 	i = 0;
458 	for(;;){
459 		c = getc(b);
460 		if(c<'0' || '9'<c){
461 			ungetc(b);
462 			break;
463 		}
464 		i = i*10 + (c-'0');
465 	}
466 	return i;
467 }
468 
469 char*
getstr(Biobuf * b)470 getstr(Biobuf *b)
471 {
472 	int i, c;
473 
474 	for(i=0; i<sizeof buf; i++){
475 		/* must get bytes not runes */
476 		cno++;
477 		c = Bgetc(b);
478 		if(c == Beof)
479 			return nil;
480 		buf[i] = c;
481 		if(c == '\n' || c==' ' || c=='\t'){
482 			ungetc(b);
483 			buf[i] = '\0';
484 			break;
485 		}
486 	}
487 	return buf;
488 }
489 
490 int
setnum(Biobuf * b,char * name,int min,int max)491 setnum(Biobuf *b, char *name, int min, int max)
492 {
493 	int i;
494 
495 	i = getnum(b);
496 	if(debug > 2)
497 		fprint(2, "set %s = %d\n", name, i);
498 	if(min<=i && i<max)
499 		return i;
500 	sysfatal("value of %s is %d; min %d max %d at %s:#%d", name, i, min, max, filename, cno);
501 	return i;
502 }
503 
504 void
xcmd(Biobuf * b)505 xcmd(Biobuf *b)
506 {
507 	char *p, *fld[16], buf[1024];
508 
509 	int i, nfld;
510 
511 	p = getline(b);
512 	if(p == nil)
513 		sysfatal("xcmd error: %r");
514 	if(debug)
515 		fprint(2, "x command '%s'\n", p);
516 	nfld = tokenize(p, fld, nelem(fld));
517 	if(nfld == 0)
518 		return;
519 	switch(fld[0][0]){
520 	case 'f':
521 		/* mount font */
522 		if(nfld != 3)
523 			break;
524 		i = atoi(fld[1]);
525 		if(i<0 || Nfont<=i)
526 			sysfatal("font %d out of range at %s:#%d", i, filename, cno);
527 		mountfont(i, fld[2]);
528 		return;
529 	case 'i':
530 		/* init */
531 		return;
532 	case 'r':
533 		if(nfld<2 || atoi(fld[1])!=res)
534 			sysfatal("typesetter has unexpected resolution %s", fld[1]? fld[1] : "<unspecified>");
535 		return;
536 	case 's':
537 		/* stop */
538 		return;
539 	case 't':
540 		/* trailer */
541 		return;
542 	case 'T':
543 		if(nfld!=2 || strcmp(fld[1], "utf")!=0)
544 			sysfatal("output for unknown typesetter type %s", fld[1]);
545 		return;
546 	case 'X':
547 		if(nfld<3 || strcmp(fld[1], "html")!=0)
548 			break;
549 		/* is it a man reference of the form cp(1)? */
550 		/* X manref start/end cp (1) */
551 		if(nfld==6 && strcmp(fld[2], "manref")==0){
552 			/* was the right macro; is it the right form? */
553 			if(strlen(fld[5])>=3 &&
554 			   fld[5][0]=='(' && fld[5][2]==')' &&
555 			   '0'<=fld[5][1] && fld[5][1]<='9'){
556 				if(strcmp(fld[3], "start") == 0){
557 					/* set anchor attribute and remember string */
558 					attr |= (1<<Anchor);
559 					snprint(buf, sizeof buf,
560 						"<a href=\"/magic/man2html/%c/%s\">",
561 						fld[5][1], fld[4]);
562 					nanchors++;
563 					anchors = erealloc(anchors, nanchors*sizeof(char*));
564 					anchors[nanchors-1] = estrdup(buf);
565 				}else if(strcmp(fld[3], "end") == 0)
566 					attr &= ~(1<<Anchor);
567 			}
568 		}else if(strcmp(fld[2], "manPP") == 0){
569 			didP = 1;
570 			emitchar(Epp);
571 		}else if(nfld<4 || strcmp(fld[2], "manref")!=0){
572 			if(nfld>2 && strcmp(fld[2], "<P>")==0){	/* avoid triggering extra <br> */
573 				didP = 1;
574 				/* clear all font attributes before paragraph */
575 				emitchar(' ' | (attr & ~(0xFFFF|((1<<Italic)|(1<<Bold)|(1<<CW)))));
576 				emitstr("<P>");
577 				/* next emittec char will turn font attributes back on */
578 			}else if(nfld>2 && strcmp(fld[2], "<H4>")==0)
579 				attr |= (1<<Heading);
580 			else if(nfld>2 && strcmp(fld[2], "</H4>")==0)
581 				attr &= ~(1<<Heading);
582 			else if(debug)
583 				fprint(2, "unknown in-line html %s... at %s:%#d\n",
584 					fld[2], filename, cno);
585 		}
586 		return;
587 	}
588 	if(debug)
589 		fprint(2, "unknown or badly formatted x command %s\n", fld[0]);
590 }
591 
592 int
lookup(int c,Htmlchar tab[],int ntab)593 lookup(int c, Htmlchar tab[], int ntab)
594 {
595 	int low, high, mid;
596 
597 	low = 0;
598 	high = ntab - 1;
599 	while(low <= high){
600 		mid = (low+high)/2;
601 		if(c < tab[mid].value)
602 			high = mid - 1;
603 		else if(c > tab[mid].value)
604 			low = mid + 1;
605 		else
606 			return mid;
607 	}
608 	return -1;	/* no match */
609 }
610 
611 void
emithtmlchar(int r)612 emithtmlchar(int r)
613 {
614 	static char buf[10];
615 	int i;
616 
617 	i = lookup(r, htmlchars, nelem(htmlchars));
618 	if(i >= 0)
619 		emitstr(htmlchars[i].name);
620 	else
621 		emit(r);
622 }
623 
624 char*
troffchar(char * s)625 troffchar(char *s)
626 {
627 	int i;
628 
629 	for(i=0; troffchars[i].name!=nil; i++)
630 		if(strcmp(s, troffchars[i].name) == 0)
631 			return troffchars[i].value;
632 	return "??";
633 }
634 
635 void
indent(void)636 indent(void)
637 {
638 	int nind;
639 
640 	didP = 0;
641 	if(atnewline){
642 		if(hp != prevlineH){
643 			prevlineH = hp;
644 			/* these most peculiar numbers appear in the troff -man output */
645 			nind = ((prevlineH-1*res)+323)/324;
646 			attr &= ~((1<<Indent1)|(1<<Indent2)|(1<<Indent3));
647 			if(nind >= 1)
648 				attr |= (1<<Indent1);
649 			if(nind >= 2)
650 				attr |= (1<<Indent2);
651 			if(nind >= 3)
652 				attr |= (1<<Indent3);
653 		}
654 		atnewline = 0;
655 	}
656 }
657 
658 void
process(Biobuf * b,char * name)659 process(Biobuf *b, char *name)
660 {
661 	int c, r, v, i;
662 	char *p;
663 
664 	cno = 0;
665 	prevlineH = res;
666 	filename = name;
667 	for(;;){
668 		c = getc(b);
669 		switch(c){
670 		case Beof:
671 			/* go to ground state */
672 			attr = 0;
673 			emit('\n');
674 			return;
675 		case '\n':
676 			break;
677 		case '0': case '1': case '2': case '3': case '4':
678 		case '5': case '6': case '7': case '8': case '9':
679 			v = c-'0';
680 			c = getc(b);
681 			if(c<'0' || '9'<c)
682 				sysfatal("illegal character motion at %s:#%d", filename, cno);
683 			v = v*10 + (c-'0');
684 			hp += v;
685 			/* fall through to character case */
686 		case 'c':
687 			indent();
688 			r = getc(b);
689 			emithtmlchar(r);
690 			break;
691 		case 'D':
692 			/* draw line; ignore */
693 			do
694 				c = getc(b);
695 			while(c!='\n' && c!= Beof);
696 			break;
697 		case 'f':
698 			v = setnum(b, "font", 0, Nfont);
699 			switchfont(v);
700 			break;
701 		case 'h':
702 			v = setnum(b, "hpos", -20000, 20000);
703 			/* generate spaces if motion is large and within a line */
704 			if(!atnewline && v>2*72)
705 				for(i=0; i<v; i+=72)
706 					emitstr("&nbsp;");
707 			hp += v;
708 			break;
709 		case 'n':
710 			setnum(b, "n1", -10000, 10000);
711 			//Bprint(&bout, " N1=%d", v);
712 			getc(b);	/* space separates */
713 			setnum(b, "n2", -10000, 10000);
714 			atnewline = 1;
715 			if(!didP && hp < (Wid-1)*res)	/* if line is less than 19" long, probably need a line break */
716 				emitstr("<br>");
717 			emit('\n');
718 			break;
719 		case 'p':
720 			page = setnum(b, "ps", -10000, 10000);
721 			break;
722 		case 's':
723 			ps = setnum(b, "ps", 1, 1000);
724 			break;
725 		case 'v':
726 			vp += setnum(b, "vpos", -10000, 10000);
727 			/* BUG: ignore motion */
728 			break;
729 		case 'x':
730 			xcmd(b);
731 			break;
732 		case 'w':
733 			emit(' ');
734 			break;
735 		case 'C':
736 			indent();
737 			p = getstr(b);
738 			emitstr(troffchar(p));
739 			break;
740 		case 'H':
741 			hp = setnum(b, "hpos", 0, 20000);
742 			//Bprint(&bout, " H=%d ", hp);
743 			break;
744 		case 'V':
745 			vp = setnum(b, "vpos", 0, 10000);
746 			break;
747 		default:
748 			fprint(2, "dhtml: unknown directive %c(0x%.2ux) at %s:#%d\n", c, c, filename, cno);
749 			return;
750 		}
751 	}
752 }
753 
754 HTMLfont*
htmlfont(char * name)755 htmlfont(char *name)
756 {
757 	int i;
758 
759 	for(i=0; htmlfonts[i].name!=nil; i++)
760 		if(strcmp(name, htmlfonts[i].name) == 0)
761 			return &htmlfonts[i];
762 	return &htmlfonts[0];
763 }
764 
765 void
mountfont(int pos,char * name)766 mountfont(int pos, char *name)
767 {
768 	if(debug)
769 		fprint(2, "mount font %s on %d\n", name, pos);
770 	if(font[pos] != nil){
771 		free(font[pos]->name);
772 		free(font[pos]);
773 	}
774 	font[pos] = emalloc(sizeof(Font));
775 	font[pos]->name = estrdup(name);
776 	font[pos]->htmlfont = htmlfont(name);
777 }
778 
779 void
switchfont(int pos)780 switchfont(int pos)
781 {
782 	HTMLfont *hf;
783 
784 	if(debug)
785 		fprint(2, "font change from %d (%s) to %d (%s)\n", ft, font[ft]->name, pos, font[pos]->name);
786 	if(pos == ft)
787 		return;
788 	hf = font[ft]->htmlfont;
789 	if(hf->bit != 0)
790 		attr &= ~(1<<hf->bit);
791 	ft = pos;
792 	hf = font[ft]->htmlfont;
793 	if(hf->bit != 0)
794 		attr |= (1<<hf->bit);
795 }
796