1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4
/* Sizing constants. */
enum
{
	Nfont	= 11,	/* number of font mount positions (size of font[]) */
	Wid	= 20,	/* tmac.anhtml sets page width to 20" so we can recognize .nf text */
};
9
10 typedef uintptr Char;
11 typedef struct Troffchar Troffchar;
12 typedef struct Htmlchar Htmlchar;
13 typedef struct Font Font;
14 typedef struct HTMLfont HTMLfont;
15
/*
 * A Char is at least 32 bits wide: the low 16 bits hold the rune and
 * the bits above hold attributes.  It must also be able to hold a pointer.
 * The values below are bit positions, used as (1<<Italic) and so on.
 */
enum
{
	Italic	= 16,
	Bold	= 17,
	CW	= 18,
	Indent1	= 19,
	Indent2	= 20,
	Indent3	= 21,
	Heading	= 25,
	Anchor	= 26,	/* must be last */
};
31
/* magic emissions: Char values that flush() treats as commands, not text */
enum
{
	Estring	= 0,		/* the following Char is a char* to print */
	Epp	= 1<<16,	/* print a small spacer table */
};
37
38 int attrorder[] = { Indent1, Indent2, Indent3, Heading, Anchor, Italic, Bold, CW };
39
40 int nest[10];
41 int nnest;
42
/* One entry of the troff special-character table searched by troffchar(). */
struct Troffchar
{
	char	*name;	/* troff character name; nil terminates the table */
	char	*value;	/* replacement text emitted for that name */
};
48
/* One entry of the rune-to-HTML table searched by lookup(). */
struct Htmlchar
{
	char	*utf;	/* UTF-8 text of the character */
	char	*name;	/* string emitted in its place */
	int	value;	/* rune decoded from utf by main(); the sort key */
};
55
56 #include "chars.h"
57
58 struct Font{
59 char *name;
60 HTMLfont *htmlfont;
61 };
62
/* How a troff font name maps onto an HTML attribute. */
struct HTMLfont
{
	char	*name;		/* troff font name matched by htmlfont() */
	char	*htmlname;	/* corresponding HTML element name, if any */
	int	bit;		/* attribute bit position; 0 means none */
};
68
69 /* R must be first; it's the default representation for fonts we don't recognize */
70 HTMLfont htmlfonts[] =
71 {
72 "R", nil, 0,
73 "LucidaSans", nil, 0,
74 "I", "i", Italic,
75 "LucidaSansI", "i", Italic,
76 "CW", "tt", CW,
77 "LucidaCW", "tt", CW,
78 nil, nil,
79 };
80
81 #define TABLE "<table border=0 cellpadding=0 cellspacing=0>"
82
/*
 * Text that turns each attribute on, indexed by attribute bit position.
 * A leading "<+" is an instruction to iputs() to increase the indent level.
 */
char*
onattr[8*sizeof(int)] =
{
	/* 0-7 */	0, 0, 0, 0, 0, 0, 0, 0,
	/* 8-15 */	0, 0, 0, 0, 0, 0, 0, 0,
	/* 16 italic */	"<i>",
	/* 17 bold */	"<b>",
	/* 18 cw */	"<tt><font size=+1>",
	/* 19 indent1 */
	"<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n",
	/* 20 indent2 */
	"<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n",
	/* 21 indent3 */
	"<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n",
	/* 22-24 */	0, 0, 0,
	/* 25 heading */	"<p><font size=+1><b>",
	/* 26 anchor */	"<unused>",	/* setattr() overwrites this slot before use */
};
100
/*
 * Text that turns each attribute off, indexed by attribute bit position.
 * A leading "<-" is an instruction to iputs() to decrease the indent level.
 */
char*
offattr[8*sizeof(int)] =
{
	/* 0-7 */	0, 0, 0, 0, 0, 0, 0, 0,
	/* 8-15 */	0, 0, 0, 0, 0, 0, 0, 0,
	/* 16 italic */	"</i>",
	/* 17 bold */	"</b>",
	/* 18 cw */	"</font></tt>",
	/* 19 indent1 */	"<-/table>",
	/* 20 indent2 */	"<-/table>",
	/* 21 indent3 */	"<-/table>",
	/* 22-24 */	0, 0, 0,
	/* 25 heading */	"</b></font>",
	/* 26 anchor */	"</a>",
};
118
119 Font *font[Nfont];
120
121 Biobuf bout;
122 int debug = 0;
123
124 /* troff state */
125 int page = 1;
126 int ft = 1;
127 int vp = 0;
128 int hp = 0;
129 int ps = 1;
130 int res = 720;
131
132 int didP = 0;
133 int atnewline = 1;
134 int prevlineH = 0;
135 Char attr = 0; /* or'ed into each Char */
136
137 Char *chars;
138 int nchars;
139 int nalloc;
140 char** anchors; /* allocated in order */
141 int nanchors;
142
143 char *filename;
144 int cno;
145 char buf[8192];
146 char *title = "Plan 9 man page";
147
148 void process(Biobuf*, char*);
149 void mountfont(int, char*);
150 void switchfont(int);
151 void header(char*);
152 void flush(void);
153 void trailer(void);
154
155 void*
emalloc(ulong n)156 emalloc(ulong n)
157 {
158 void *p;
159
160 p = malloc(n);
161 if(p == nil)
162 sysfatal("malloc failed: %r");
163 return p;
164 }
165
166 void*
erealloc(void * p,ulong n)167 erealloc(void *p, ulong n)
168 {
169
170 p = realloc(p, n);
171 if(p == nil)
172 sysfatal("realloc failed: %r");
173 return p;
174 }
175
176 char*
estrdup(char * s)177 estrdup(char *s)
178 {
179 char *t;
180
181 t = strdup(s);
182 if(t == nil)
183 sysfatal("strdup failed: %r");
184 return t;
185 }
186
/* Print the command synopsis on file descriptor 2 and exit with status "usage". */
void
usage(void)
{
	static char synopsis[] = "usage: troff2html [-d] [-t title] [file ...]\n";

	fprint(2, "%s", synopsis);
	exits("usage");
}
193
194 int
hccmp(const void * va,const void * vb)195 hccmp(const void *va, const void *vb)
196 {
197 Htmlchar *a, *b;
198
199 a = (Htmlchar*)va;
200 b = (Htmlchar*)vb;
201 return a->value - b->value;
202 }
203
204 void
main(int argc,char * argv[])205 main(int argc, char *argv[])
206 {
207 int i;
208 Biobuf in, *inp;
209 Rune r;
210
211 for(i=0; i<nelem(htmlchars); i++){
212 chartorune(&r, htmlchars[i].utf);
213 htmlchars[i].value = r;
214 }
215 qsort(htmlchars, nelem(htmlchars), sizeof(htmlchars[0]), hccmp);
216
217 ARGBEGIN{
218 case 't':
219 title = ARGF();
220 if(title == nil)
221 usage();
222 break;
223 case 'd':
224 debug++;
225 break;
226 default:
227 usage();
228 }ARGEND
229
230 Binit(&bout, 1, OWRITE);
231 if(argc == 0){
232 header(title);
233 Binit(&in, 0, OREAD);
234 process(&in, "<stdin>");
235 }else{
236 header(title);
237 for(i=0; i<argc; i++){
238 inp = Bopen(argv[i], OREAD);
239 if(inp == nil)
240 sysfatal("can't open %s: %r", argv[i]);
241 process(inp, argv[i]);
242 Bterm(inp);
243 }
244 }
245 flush();
246 trailer();
247 exits(nil);
248 }
249
250 void
emitchar(Char c)251 emitchar(Char c)
252 {
253 if(nalloc == nchars){
254 nalloc += 10000;
255 chars = realloc(chars, nalloc*sizeof(chars[0]));
256 if(chars == nil)
257 sysfatal("malloc failed: %r");
258 }
259 chars[nchars++] = c;
260 }
261
262 void
emit(Rune r)263 emit(Rune r)
264 {
265 emitchar(r | attr);
266 /*
267 * Close man page references early, so that
268 * .IR proof (1),
269 * doesn't make the comma part of the link.
270 */
271 if(r == ')')
272 attr &= ~(1<<Anchor);
273 }
274
275 void
emitstr(char * s)276 emitstr(char *s)
277 {
278 emitchar(Estring);
279 emitchar((Char)s);
280 }
281
282 int indentlevel;
283 int linelen;
284
285 void
iputrune(Biobuf * b,Rune r)286 iputrune(Biobuf *b, Rune r)
287 {
288 int i;
289
290 if(linelen++ > 60 && r == ' ')
291 r = '\n';
292 Bputrune(b, r);
293 if(r == '\n'){
294 for(i=0; i<indentlevel; i++)
295 Bprint(b, " ");
296 linelen = 0;
297 }
298 }
299
300 void
iputs(Biobuf * b,char * s)301 iputs(Biobuf *b, char *s)
302 {
303 if(s[0]=='<' && s[1]=='+'){
304 iputrune(b, '\n');
305 Bprint(b, "<%s", s+2);
306 indentlevel++;
307 iputrune(b, '\n');
308 }else if(s[0]=='<' && s[1]=='-'){
309 indentlevel--;
310 iputrune(b, '\n');
311 Bprint(b, "<%s", s+2);
312 iputrune(b, '\n');
313 }else
314 Bprint(b, "%s", s);
315 }
316
317 void
setattr(Char a)318 setattr(Char a)
319 {
320 Char on, off;
321 int i, j;
322
323 on = a & ~attr;
324 off = attr & ~a;
325
326 /* walk up the nest stack until we reach something we need to turn off. */
327 for(i=0; i<nnest; i++)
328 if(off&(1<<nest[i]))
329 break;
330
331 /* turn off everything above that */
332 for(j=nnest-1; j>=i; j--)
333 iputs(&bout, offattr[nest[j]]);
334
335 /* turn on everything we just turned off but didn't want to */
336 for(j=i; j<nnest; j++)
337 if(a&(1<<nest[j]))
338 iputs(&bout, onattr[nest[j]]);
339 else
340 nest[j] = 0;
341
342 /* shift the zeros (turned off things) up */
343 for(i=j=0; i<nnest; i++)
344 if(nest[i] != 0)
345 nest[j++] = nest[i];
346 nnest = j;
347
348 /* now turn on the new attributes */
349 for(i=0; i<nelem(attrorder); i++){
350 j = attrorder[i];
351 if(on&(1<<j)){
352 if(j == Anchor)
353 onattr[j] = anchors[nanchors++];
354 iputs(&bout, onattr[j]);
355 if(nnest >= nelem(nest))
356 sysfatal("nesting too deep");
357 nest[nnest++] = j;
358 }
359 }
360 attr = a;
361 }
362
363 void
flush(void)364 flush(void)
365 {
366 int i;
367 Char c, a;
368
369 nanchors = 0;
370 for(i=0; i<nchars; i++){
371 c = chars[i];
372 if(c == Estring){
373 /* next word is string to print */
374 iputs(&bout, (char*)chars[++i]);
375 continue;
376 }
377 if(c == Epp){
378 iputrune(&bout, '\n');
379 iputs(&bout, TABLE "<tr height=5><td></table>");
380 iputrune(&bout, '\n');
381 continue;
382 }
383 a = c & ~0xFFFF;
384 c &= 0xFFFF;
385 /*
386 * If we're going to something off after a space,
387 * let's just turn it off before.
388 */
389 if(c == ' ' && i<nchars-1 && (chars[i+1]&0xFFFF) >= 32)
390 a ^= a & ~chars[i+1];
391 setattr(a);
392 iputrune(&bout, c & 0xFFFF);
393 }
394 }
395
396 void
header(char * s)397 header(char *s)
398 {
399 Bprint(&bout, "<head>\n");
400 Bprint(&bout, "<title>%s</title>\n", s);
401 Bprint(&bout, "<meta content=\"text/html; charset=utf-8\" http-equiv=Content-Type>\n");
402 Bprint(&bout, "</head>\n");
403 Bprint(&bout, "<body bgcolor=#ffffff>\n");
404 }
405
406 void
trailer(void)407 trailer(void)
408 {
409
410 #ifdef LUCENT
411 Tm *t;
412 t = localtime(time(nil));
413 Bprint(&bout, TABLE "<tr height=20><td></table>\n");
414 Bprint(&bout, "<font size=-1><a href=\"http://www.lucent.com/copyright.html\">\n");
415 Bprint(&bout, "Copyright</A> © %d Alcatel-Lucent. All rights reserved.</font>\n", t->year+1900);
416 #endif
417 Bprint(&bout, "</body></html>\n");
418 }
419
420 int
getc(Biobuf * b)421 getc(Biobuf *b)
422 {
423 cno++;
424 return Bgetrune(b);
425 }
426
427 void
ungetc(Biobuf * b)428 ungetc(Biobuf *b)
429 {
430 cno--;
431 Bungetrune(b);
432 }
433
434 char*
getline(Biobuf * b)435 getline(Biobuf *b)
436 {
437 int i, c;
438
439 for(i=0; i<sizeof buf; i++){
440 c = getc(b);
441 if(c == Beof)
442 return nil;
443 buf[i] = c;
444 if(c == '\n'){
445 buf[i] = '\0';
446 break;
447 }
448 }
449 return buf;
450 }
451
452 int
getnum(Biobuf * b)453 getnum(Biobuf *b)
454 {
455 int i, c;
456
457 i = 0;
458 for(;;){
459 c = getc(b);
460 if(c<'0' || '9'<c){
461 ungetc(b);
462 break;
463 }
464 i = i*10 + (c-'0');
465 }
466 return i;
467 }
468
469 char*
getstr(Biobuf * b)470 getstr(Biobuf *b)
471 {
472 int i, c;
473
474 for(i=0; i<sizeof buf; i++){
475 /* must get bytes not runes */
476 cno++;
477 c = Bgetc(b);
478 if(c == Beof)
479 return nil;
480 buf[i] = c;
481 if(c == '\n' || c==' ' || c=='\t'){
482 ungetc(b);
483 buf[i] = '\0';
484 break;
485 }
486 }
487 return buf;
488 }
489
490 int
setnum(Biobuf * b,char * name,int min,int max)491 setnum(Biobuf *b, char *name, int min, int max)
492 {
493 int i;
494
495 i = getnum(b);
496 if(debug > 2)
497 fprint(2, "set %s = %d\n", name, i);
498 if(min<=i && i<max)
499 return i;
500 sysfatal("value of %s is %d; min %d max %d at %s:#%d", name, i, min, max, filename, cno);
501 return i;
502 }
503
504 void
xcmd(Biobuf * b)505 xcmd(Biobuf *b)
506 {
507 char *p, *fld[16], buf[1024];
508
509 int i, nfld;
510
511 p = getline(b);
512 if(p == nil)
513 sysfatal("xcmd error: %r");
514 if(debug)
515 fprint(2, "x command '%s'\n", p);
516 nfld = tokenize(p, fld, nelem(fld));
517 if(nfld == 0)
518 return;
519 switch(fld[0][0]){
520 case 'f':
521 /* mount font */
522 if(nfld != 3)
523 break;
524 i = atoi(fld[1]);
525 if(i<0 || Nfont<=i)
526 sysfatal("font %d out of range at %s:#%d", i, filename, cno);
527 mountfont(i, fld[2]);
528 return;
529 case 'i':
530 /* init */
531 return;
532 case 'r':
533 if(nfld<2 || atoi(fld[1])!=res)
534 sysfatal("typesetter has unexpected resolution %s", fld[1]? fld[1] : "<unspecified>");
535 return;
536 case 's':
537 /* stop */
538 return;
539 case 't':
540 /* trailer */
541 return;
542 case 'T':
543 if(nfld!=2 || strcmp(fld[1], "utf")!=0)
544 sysfatal("output for unknown typesetter type %s", fld[1]);
545 return;
546 case 'X':
547 if(nfld<3 || strcmp(fld[1], "html")!=0)
548 break;
549 /* is it a man reference of the form cp(1)? */
550 /* X manref start/end cp (1) */
551 if(nfld==6 && strcmp(fld[2], "manref")==0){
552 /* was the right macro; is it the right form? */
553 if(strlen(fld[5])>=3 &&
554 fld[5][0]=='(' && fld[5][2]==')' &&
555 '0'<=fld[5][1] && fld[5][1]<='9'){
556 if(strcmp(fld[3], "start") == 0){
557 /* set anchor attribute and remember string */
558 attr |= (1<<Anchor);
559 snprint(buf, sizeof buf,
560 "<a href=\"/magic/man2html/%c/%s\">",
561 fld[5][1], fld[4]);
562 nanchors++;
563 anchors = erealloc(anchors, nanchors*sizeof(char*));
564 anchors[nanchors-1] = estrdup(buf);
565 }else if(strcmp(fld[3], "end") == 0)
566 attr &= ~(1<<Anchor);
567 }
568 }else if(strcmp(fld[2], "manPP") == 0){
569 didP = 1;
570 emitchar(Epp);
571 }else if(nfld<4 || strcmp(fld[2], "manref")!=0){
572 if(nfld>2 && strcmp(fld[2], "<P>")==0){ /* avoid triggering extra <br> */
573 didP = 1;
574 /* clear all font attributes before paragraph */
575 emitchar(' ' | (attr & ~(0xFFFF|((1<<Italic)|(1<<Bold)|(1<<CW)))));
576 emitstr("<P>");
577 /* next emittec char will turn font attributes back on */
578 }else if(nfld>2 && strcmp(fld[2], "<H4>")==0)
579 attr |= (1<<Heading);
580 else if(nfld>2 && strcmp(fld[2], "</H4>")==0)
581 attr &= ~(1<<Heading);
582 else if(debug)
583 fprint(2, "unknown in-line html %s... at %s:%#d\n",
584 fld[2], filename, cno);
585 }
586 return;
587 }
588 if(debug)
589 fprint(2, "unknown or badly formatted x command %s\n", fld[0]);
590 }
591
592 int
lookup(int c,Htmlchar tab[],int ntab)593 lookup(int c, Htmlchar tab[], int ntab)
594 {
595 int low, high, mid;
596
597 low = 0;
598 high = ntab - 1;
599 while(low <= high){
600 mid = (low+high)/2;
601 if(c < tab[mid].value)
602 high = mid - 1;
603 else if(c > tab[mid].value)
604 low = mid + 1;
605 else
606 return mid;
607 }
608 return -1; /* no match */
609 }
610
611 void
emithtmlchar(int r)612 emithtmlchar(int r)
613 {
614 static char buf[10];
615 int i;
616
617 i = lookup(r, htmlchars, nelem(htmlchars));
618 if(i >= 0)
619 emitstr(htmlchars[i].name);
620 else
621 emit(r);
622 }
623
624 char*
troffchar(char * s)625 troffchar(char *s)
626 {
627 int i;
628
629 for(i=0; troffchars[i].name!=nil; i++)
630 if(strcmp(s, troffchars[i].name) == 0)
631 return troffchars[i].value;
632 return "??";
633 }
634
635 void
indent(void)636 indent(void)
637 {
638 int nind;
639
640 didP = 0;
641 if(atnewline){
642 if(hp != prevlineH){
643 prevlineH = hp;
644 /* these most peculiar numbers appear in the troff -man output */
645 nind = ((prevlineH-1*res)+323)/324;
646 attr &= ~((1<<Indent1)|(1<<Indent2)|(1<<Indent3));
647 if(nind >= 1)
648 attr |= (1<<Indent1);
649 if(nind >= 2)
650 attr |= (1<<Indent2);
651 if(nind >= 3)
652 attr |= (1<<Indent3);
653 }
654 atnewline = 0;
655 }
656 }
657
658 void
process(Biobuf * b,char * name)659 process(Biobuf *b, char *name)
660 {
661 int c, r, v, i;
662 char *p;
663
664 cno = 0;
665 prevlineH = res;
666 filename = name;
667 for(;;){
668 c = getc(b);
669 switch(c){
670 case Beof:
671 /* go to ground state */
672 attr = 0;
673 emit('\n');
674 return;
675 case '\n':
676 break;
677 case '0': case '1': case '2': case '3': case '4':
678 case '5': case '6': case '7': case '8': case '9':
679 v = c-'0';
680 c = getc(b);
681 if(c<'0' || '9'<c)
682 sysfatal("illegal character motion at %s:#%d", filename, cno);
683 v = v*10 + (c-'0');
684 hp += v;
685 /* fall through to character case */
686 case 'c':
687 indent();
688 r = getc(b);
689 emithtmlchar(r);
690 break;
691 case 'D':
692 /* draw line; ignore */
693 do
694 c = getc(b);
695 while(c!='\n' && c!= Beof);
696 break;
697 case 'f':
698 v = setnum(b, "font", 0, Nfont);
699 switchfont(v);
700 break;
701 case 'h':
702 v = setnum(b, "hpos", -20000, 20000);
703 /* generate spaces if motion is large and within a line */
704 if(!atnewline && v>2*72)
705 for(i=0; i<v; i+=72)
706 emitstr(" ");
707 hp += v;
708 break;
709 case 'n':
710 setnum(b, "n1", -10000, 10000);
711 //Bprint(&bout, " N1=%d", v);
712 getc(b); /* space separates */
713 setnum(b, "n2", -10000, 10000);
714 atnewline = 1;
715 if(!didP && hp < (Wid-1)*res) /* if line is less than 19" long, probably need a line break */
716 emitstr("<br>");
717 emit('\n');
718 break;
719 case 'p':
720 page = setnum(b, "ps", -10000, 10000);
721 break;
722 case 's':
723 ps = setnum(b, "ps", 1, 1000);
724 break;
725 case 'v':
726 vp += setnum(b, "vpos", -10000, 10000);
727 /* BUG: ignore motion */
728 break;
729 case 'x':
730 xcmd(b);
731 break;
732 case 'w':
733 emit(' ');
734 break;
735 case 'C':
736 indent();
737 p = getstr(b);
738 emitstr(troffchar(p));
739 break;
740 case 'H':
741 hp = setnum(b, "hpos", 0, 20000);
742 //Bprint(&bout, " H=%d ", hp);
743 break;
744 case 'V':
745 vp = setnum(b, "vpos", 0, 10000);
746 break;
747 default:
748 fprint(2, "dhtml: unknown directive %c(0x%.2ux) at %s:#%d\n", c, c, filename, cno);
749 return;
750 }
751 }
752 }
753
754 HTMLfont*
htmlfont(char * name)755 htmlfont(char *name)
756 {
757 int i;
758
759 for(i=0; htmlfonts[i].name!=nil; i++)
760 if(strcmp(name, htmlfonts[i].name) == 0)
761 return &htmlfonts[i];
762 return &htmlfonts[0];
763 }
764
765 void
mountfont(int pos,char * name)766 mountfont(int pos, char *name)
767 {
768 if(debug)
769 fprint(2, "mount font %s on %d\n", name, pos);
770 if(font[pos] != nil){
771 free(font[pos]->name);
772 free(font[pos]);
773 }
774 font[pos] = emalloc(sizeof(Font));
775 font[pos]->name = estrdup(name);
776 font[pos]->htmlfont = htmlfont(name);
777 }
778
779 void
switchfont(int pos)780 switchfont(int pos)
781 {
782 HTMLfont *hf;
783
784 if(debug)
785 fprint(2, "font change from %d (%s) to %d (%s)\n", ft, font[ft]->name, pos, font[pos]->name);
786 if(pos == ft)
787 return;
788 hf = font[ft]->htmlfont;
789 if(hf->bit != 0)
790 attr &= ~(1<<hf->bit);
791 ft = pos;
792 hf = font[ft]->htmlfont;
793 if(hf->bit != 0)
794 attr |= (1<<hf->bit);
795 }
796