xref: /openbsd-src/usr.bin/mandoc/html.c (revision ac9b4aacc1da35008afea06a5d23c2f2dea9b93e)
1 /*	$Id: html.c,v 1.30 2012/05/28 13:00:51 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011, 2012 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include <sys/types.h>
19 
20 #include <assert.h>
21 #include <ctype.h>
22 #include <stdarg.h>
23 #include <stdio.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28 
29 #include "mandoc.h"
30 #include "libmandoc.h"
31 #include "out.h"
32 #include "html.h"
33 #include "main.h"
34 
/*
 * Per-element behaviour flags stored in htmldata.flags.
 */
#define	HTML_CLRLINE	 (1 << 0) /* Newline follows the tag. */
#define	HTML_NOSTACK	 (1 << 1) /* Never pushed on the open-tag stack. */
#define	HTML_AUTOCLOSE	 (1 << 2) /* Tag has auto-closure. */

/*
 * Static description of one HTML element: the name written to the
 * output and a bit mask of the HTML_* flags defined above.
 */
struct	htmldata {
	const char	 *name;		/* Element name, e.g. "div". */
	int		  flags;	/* Bitwise OR of HTML_* flags. */
};
42 
43 static	const struct htmldata htmltags[TAG_MAX] = {
44 	{"html",	HTML_CLRLINE}, /* TAG_HTML */
45 	{"head",	HTML_CLRLINE}, /* TAG_HEAD */
46 	{"body",	HTML_CLRLINE}, /* TAG_BODY */
47 	{"meta",	HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */
48 	{"title",	HTML_CLRLINE}, /* TAG_TITLE */
49 	{"div",		HTML_CLRLINE}, /* TAG_DIV */
50 	{"h1",		0}, /* TAG_H1 */
51 	{"h2",		0}, /* TAG_H2 */
52 	{"span",	0}, /* TAG_SPAN */
53 	{"link",	HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */
54 	{"br",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */
55 	{"a",		0}, /* TAG_A */
56 	{"table",	HTML_CLRLINE}, /* TAG_TABLE */
57 	{"tbody",	HTML_CLRLINE}, /* TAG_TBODY */
58 	{"col",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */
59 	{"tr",		HTML_CLRLINE}, /* TAG_TR */
60 	{"td",		HTML_CLRLINE}, /* TAG_TD */
61 	{"li",		HTML_CLRLINE}, /* TAG_LI */
62 	{"ul",		HTML_CLRLINE}, /* TAG_UL */
63 	{"ol",		HTML_CLRLINE}, /* TAG_OL */
64 	{"dl",		HTML_CLRLINE}, /* TAG_DL */
65 	{"dt",		HTML_CLRLINE}, /* TAG_DT */
66 	{"dd",		HTML_CLRLINE}, /* TAG_DD */
67 	{"blockquote",	HTML_CLRLINE}, /* TAG_BLOCKQUOTE */
68 	{"p",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */
69 	{"pre",		HTML_CLRLINE }, /* TAG_PRE */
70 	{"b",		0 }, /* TAG_B */
71 	{"i",		0 }, /* TAG_I */
72 	{"code",	0 }, /* TAG_CODE */
73 	{"small",	0 }, /* TAG_SMALL */
74 };
75 
76 static	const char	*const htmlattrs[ATTR_MAX] = {
77 	"http-equiv", /* ATTR_HTTPEQUIV */
78 	"content", /* ATTR_CONTENT */
79 	"name", /* ATTR_NAME */
80 	"rel", /* ATTR_REL */
81 	"href", /* ATTR_HREF */
82 	"type", /* ATTR_TYPE */
83 	"media", /* ATTR_MEDIA */
84 	"class", /* ATTR_CLASS */
85 	"style", /* ATTR_STYLE */
86 	"width", /* ATTR_WIDTH */
87 	"id", /* ATTR_ID */
88 	"summary", /* ATTR_SUMMARY */
89 	"align", /* ATTR_ALIGN */
90 	"colspan", /* ATTR_COLSPAN */
91 };
92 
93 static	const char	*const roffscales[SCALE_MAX] = {
94 	"cm", /* SCALE_CM */
95 	"in", /* SCALE_IN */
96 	"pc", /* SCALE_PC */
97 	"pt", /* SCALE_PT */
98 	"em", /* SCALE_EM */
99 	"em", /* SCALE_MM */
100 	"ex", /* SCALE_EN */
101 	"ex", /* SCALE_BU */
102 	"em", /* SCALE_VS */
103 	"ex", /* SCALE_FS */
104 };
105 
/* Forward declarations of the helpers private to this file. */
static	void	 bufncat(struct html *h, const char *p, size_t sz);
static	void	 print_ctag(struct html *h, enum htmltag tag);
static	int	 print_encode(struct html *h, const char *p, int norecurse);
static	void	 print_metaf(struct html *h, enum mandoc_esc deco);
static	void	 print_attr(struct html *h, const char *key, const char *val);
static	void	*ml_alloc(char *outopts, enum htmltype type);
112 
113 static void *
114 ml_alloc(char *outopts, enum htmltype type)
115 {
116 	struct html	*h;
117 	const char	*toks[5];
118 	char		*v;
119 
120 	toks[0] = "style";
121 	toks[1] = "man";
122 	toks[2] = "includes";
123 	toks[3] = "fragment";
124 	toks[4] = NULL;
125 
126 	h = mandoc_calloc(1, sizeof(struct html));
127 
128 	h->type = type;
129 	h->tags.head = NULL;
130 	h->symtab = mchars_alloc();
131 
132 	while (outopts && *outopts)
133 		switch (getsubopt(&outopts, UNCONST(toks), &v)) {
134 		case (0):
135 			h->style = v;
136 			break;
137 		case (1):
138 			h->base_man = v;
139 			break;
140 		case (2):
141 			h->base_includes = v;
142 			break;
143 		case (3):
144 			h->oflags |= HTML_FRAGMENT;
145 			break;
146 		default:
147 			break;
148 		}
149 
150 	return(h);
151 }
152 
153 void *
154 html_alloc(char *outopts)
155 {
156 
157 	return(ml_alloc(outopts, HTML_HTML_4_01_STRICT));
158 }
159 
160 
161 void *
162 xhtml_alloc(char *outopts)
163 {
164 
165 	return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT));
166 }
167 
168 
169 void
170 html_free(void *p)
171 {
172 	struct tag	*tag;
173 	struct html	*h;
174 
175 	h = (struct html *)p;
176 
177 	while ((tag = h->tags.head) != NULL) {
178 		h->tags.head = tag->next;
179 		free(tag);
180 	}
181 
182 	if (h->symtab)
183 		mchars_free(h->symtab);
184 
185 	free(h);
186 }
187 
188 
189 void
190 print_gen_head(struct html *h)
191 {
192 	struct htmlpair	 tag[4];
193 
194 	tag[0].key = ATTR_HTTPEQUIV;
195 	tag[0].val = "Content-Type";
196 	tag[1].key = ATTR_CONTENT;
197 	tag[1].val = "text/html; charset=utf-8";
198 	print_otag(h, TAG_META, 2, tag);
199 
200 	tag[0].key = ATTR_NAME;
201 	tag[0].val = "resource-type";
202 	tag[1].key = ATTR_CONTENT;
203 	tag[1].val = "document";
204 	print_otag(h, TAG_META, 2, tag);
205 
206 	if (h->style) {
207 		tag[0].key = ATTR_REL;
208 		tag[0].val = "stylesheet";
209 		tag[1].key = ATTR_HREF;
210 		tag[1].val = h->style;
211 		tag[2].key = ATTR_TYPE;
212 		tag[2].val = "text/css";
213 		tag[3].key = ATTR_MEDIA;
214 		tag[3].val = "all";
215 		print_otag(h, TAG_LINK, 4, tag);
216 	}
217 }
218 
219 static void
220 print_metaf(struct html *h, enum mandoc_esc deco)
221 {
222 	enum htmlfont	 font;
223 
224 	switch (deco) {
225 	case (ESCAPE_FONTPREV):
226 		font = h->metal;
227 		break;
228 	case (ESCAPE_FONTITALIC):
229 		font = HTMLFONT_ITALIC;
230 		break;
231 	case (ESCAPE_FONTBOLD):
232 		font = HTMLFONT_BOLD;
233 		break;
234 	case (ESCAPE_FONT):
235 		/* FALLTHROUGH */
236 	case (ESCAPE_FONTROMAN):
237 		font = HTMLFONT_NONE;
238 		break;
239 	default:
240 		abort();
241 		/* NOTREACHED */
242 	}
243 
244 	if (h->metaf) {
245 		print_tagq(h, h->metaf);
246 		h->metaf = NULL;
247 	}
248 
249 	h->metal = h->metac;
250 	h->metac = font;
251 
252 	if (HTMLFONT_NONE != font)
253 		h->metaf = HTMLFONT_BOLD == font ?
254 			print_otag(h, TAG_B, 0, NULL) :
255 			print_otag(h, TAG_I, 0, NULL);
256 }
257 
258 int
259 html_strlen(const char *cp)
260 {
261 	size_t		 rsz;
262 	int		 skip, sz;
263 
264 	/*
265 	 * Account for escaped sequences within string length
266 	 * calculations.  This follows the logic in term_strlen() as we
267 	 * must calculate the width of produced strings.
268 	 * Assume that characters are always width of "1".  This is
269 	 * hacky, but it gets the job done for approximation of widths.
270 	 */
271 
272 	sz = 0;
273 	skip = 0;
274 	while (1) {
275 		rsz = strcspn(cp, "\\");
276 		if (rsz) {
277 			cp += rsz;
278 			if (skip) {
279 				skip = 0;
280 				rsz--;
281 			}
282 			sz += rsz;
283 		}
284 		if ('\0' == *cp)
285 			break;
286 		cp++;
287 		switch (mandoc_escape(&cp, NULL, NULL)) {
288 		case (ESCAPE_ERROR):
289 			return(sz);
290 		case (ESCAPE_UNICODE):
291 			/* FALLTHROUGH */
292 		case (ESCAPE_NUMBERED):
293 			/* FALLTHROUGH */
294 		case (ESCAPE_SPECIAL):
295 			if (skip)
296 				skip = 0;
297 			else
298 				sz++;
299 			break;
300 		case (ESCAPE_SKIPCHAR):
301 			skip = 1;
302 			break;
303 		default:
304 			break;
305 		}
306 	}
307 	return(sz);
308 }
309 
310 static int
311 print_encode(struct html *h, const char *p, int norecurse)
312 {
313 	size_t		 sz;
314 	int		 c, len, nospace;
315 	const char	*seq;
316 	enum mandoc_esc	 esc;
317 	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };
318 
319 	nospace = 0;
320 
321 	while ('\0' != *p) {
322 		if (HTML_SKIPCHAR & h->flags && '\\' != *p) {
323 			h->flags &= ~HTML_SKIPCHAR;
324 			p++;
325 			continue;
326 		}
327 
328 		sz = strcspn(p, rejs);
329 
330 		fwrite(p, 1, sz, stdout);
331 		p += (int)sz;
332 
333 		if ('\0' == *p)
334 			break;
335 
336 		switch (*p++) {
337 		case ('<'):
338 			printf("&lt;");
339 			continue;
340 		case ('>'):
341 			printf("&gt;");
342 			continue;
343 		case ('&'):
344 			printf("&amp;");
345 			continue;
346 		case (ASCII_HYPH):
347 			putchar('-');
348 			continue;
349 		default:
350 			break;
351 		}
352 
353 		esc = mandoc_escape(&p, &seq, &len);
354 		if (ESCAPE_ERROR == esc)
355 			break;
356 
357 		switch (esc) {
358 		case (ESCAPE_FONT):
359 			/* FALLTHROUGH */
360 		case (ESCAPE_FONTPREV):
361 			/* FALLTHROUGH */
362 		case (ESCAPE_FONTBOLD):
363 			/* FALLTHROUGH */
364 		case (ESCAPE_FONTITALIC):
365 			/* FALLTHROUGH */
366 		case (ESCAPE_FONTROMAN):
367 			if (0 == norecurse)
368 				print_metaf(h, esc);
369 			continue;
370 		case (ESCAPE_SKIPCHAR):
371 			h->flags |= HTML_SKIPCHAR;
372 			continue;
373 		default:
374 			break;
375 		}
376 
377 		if (h->flags & HTML_SKIPCHAR) {
378 			h->flags &= ~HTML_SKIPCHAR;
379 			continue;
380 		}
381 
382 		switch (esc) {
383 		case (ESCAPE_UNICODE):
384 			/* Skip passed "u" header. */
385 			c = mchars_num2uc(seq + 1, len - 1);
386 			if ('\0' != c)
387 				printf("&#x%x;", c);
388 			break;
389 		case (ESCAPE_NUMBERED):
390 			c = mchars_num2char(seq, len);
391 			if ('\0' != c)
392 				putchar(c);
393 			break;
394 		case (ESCAPE_SPECIAL):
395 			c = mchars_spec2cp(h->symtab, seq, len);
396 			if (c > 0)
397 				printf("&#%d;", c);
398 			else if (-1 == c && 1 == len)
399 				putchar((int)*seq);
400 			break;
401 		case (ESCAPE_NOSPACE):
402 			if ('\0' == *p)
403 				nospace = 1;
404 			break;
405 		default:
406 			break;
407 		}
408 	}
409 
410 	return(nospace);
411 }
412 
413 
/*
 * Print one attribute as ` key="val"`.  The value is passed through
 * print_encode() with recursion disabled, so font escapes inside it
 * do not open any elements; the key is written as given.
 */
static void
print_attr(struct html *h, const char *key, const char *val)
{

	putchar(' ');
	fputs(key, stdout);
	fputs("=\"", stdout);
	(void)print_encode(h, val, 1);
	putchar('"');
}
421 
422 
423 struct tag *
424 print_otag(struct html *h, enum htmltag tag,
425 		int sz, const struct htmlpair *p)
426 {
427 	int		 i;
428 	struct tag	*t;
429 
430 	/* Push this tags onto the stack of open scopes. */
431 
432 	if ( ! (HTML_NOSTACK & htmltags[tag].flags)) {
433 		t = mandoc_malloc(sizeof(struct tag));
434 		t->tag = tag;
435 		t->next = h->tags.head;
436 		h->tags.head = t;
437 	} else
438 		t = NULL;
439 
440 	if ( ! (HTML_NOSPACE & h->flags))
441 		if ( ! (HTML_CLRLINE & htmltags[tag].flags)) {
442 			/* Manage keeps! */
443 			if ( ! (HTML_KEEP & h->flags)) {
444 				if (HTML_PREKEEP & h->flags)
445 					h->flags |= HTML_KEEP;
446 				putchar(' ');
447 			} else
448 				printf("&#160;");
449 		}
450 
451 	if ( ! (h->flags & HTML_NONOSPACE))
452 		h->flags &= ~HTML_NOSPACE;
453 	else
454 		h->flags |= HTML_NOSPACE;
455 
456 	/* Print out the tag name and attributes. */
457 
458 	printf("<%s", htmltags[tag].name);
459 	for (i = 0; i < sz; i++)
460 		print_attr(h, htmlattrs[p[i].key], p[i].val);
461 
462 	/* Add non-overridable attributes. */
463 
464 	if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) {
465 		print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml");
466 		print_attr(h, "xml:lang", "en");
467 		print_attr(h, "lang", "en");
468 	}
469 
470 	/* Accommodate for XML "well-formed" singleton escaping. */
471 
472 	if (HTML_AUTOCLOSE & htmltags[tag].flags)
473 		switch (h->type) {
474 		case (HTML_XHTML_1_0_STRICT):
475 			putchar('/');
476 			break;
477 		default:
478 			break;
479 		}
480 
481 	putchar('>');
482 
483 	h->flags |= HTML_NOSPACE;
484 
485 	if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags)
486 		putchar('\n');
487 
488 	return(t);
489 }
490 
491 
492 static void
493 print_ctag(struct html *h, enum htmltag tag)
494 {
495 
496 	printf("</%s>", htmltags[tag].name);
497 	if (HTML_CLRLINE & htmltags[tag].flags) {
498 		h->flags |= HTML_NOSPACE;
499 		putchar('\n');
500 	}
501 }
502 
503 void
504 print_gen_decls(struct html *h)
505 {
506 	const char	*doctype;
507 	const char	*dtd;
508 	const char	*name;
509 
510 	switch (h->type) {
511 	case (HTML_HTML_4_01_STRICT):
512 		name = "HTML";
513 		doctype = "-//W3C//DTD HTML 4.01//EN";
514 		dtd = "http://www.w3.org/TR/html4/strict.dtd";
515 		break;
516 	default:
517 		puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
518 		name = "html";
519 		doctype = "-//W3C//DTD XHTML 1.0 Strict//EN";
520 		dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
521 		break;
522 	}
523 
524 	printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n",
525 			name, doctype, dtd);
526 }
527 
528 void
529 print_text(struct html *h, const char *word)
530 {
531 
532 	if ( ! (HTML_NOSPACE & h->flags)) {
533 		/* Manage keeps! */
534 		if ( ! (HTML_KEEP & h->flags)) {
535 			if (HTML_PREKEEP & h->flags)
536 				h->flags |= HTML_KEEP;
537 			putchar(' ');
538 		} else
539 			printf("&#160;");
540 	}
541 
542 	assert(NULL == h->metaf);
543 	if (HTMLFONT_NONE != h->metac)
544 		h->metaf = HTMLFONT_BOLD == h->metac ?
545 			print_otag(h, TAG_B, 0, NULL) :
546 			print_otag(h, TAG_I, 0, NULL);
547 
548 	assert(word);
549 	if ( ! print_encode(h, word, 0)) {
550 		if ( ! (h->flags & HTML_NONOSPACE))
551 			h->flags &= ~HTML_NOSPACE;
552 	} else
553 		h->flags |= HTML_NOSPACE;
554 
555 	if (h->metaf) {
556 		print_tagq(h, h->metaf);
557 		h->metaf = NULL;
558 	}
559 
560 	h->flags &= ~HTML_IGNDELIM;
561 }
562 
563 
564 void
565 print_tagq(struct html *h, const struct tag *until)
566 {
567 	struct tag	*tag;
568 
569 	while ((tag = h->tags.head) != NULL) {
570 		/*
571 		 * Remember to close out and nullify the current
572 		 * meta-font and table, if applicable.
573 		 */
574 		if (tag == h->metaf)
575 			h->metaf = NULL;
576 		if (tag == h->tblt)
577 			h->tblt = NULL;
578 		print_ctag(h, tag->tag);
579 		h->tags.head = tag->next;
580 		free(tag);
581 		if (until && tag == until)
582 			return;
583 	}
584 }
585 
586 
587 void
588 print_stagq(struct html *h, const struct tag *suntil)
589 {
590 	struct tag	*tag;
591 
592 	while ((tag = h->tags.head) != NULL) {
593 		if (suntil && tag == suntil)
594 			return;
595 		/*
596 		 * Remember to close out and nullify the current
597 		 * meta-font and table, if applicable.
598 		 */
599 		if (tag == h->metaf)
600 			h->metaf = NULL;
601 		if (tag == h->tblt)
602 			h->tblt = NULL;
603 		print_ctag(h, tag->tag);
604 		h->tags.head = tag->next;
605 		free(tag);
606 	}
607 }
608 
609 void
610 bufinit(struct html *h)
611 {
612 
613 	h->buf[0] = '\0';
614 	h->buflen = 0;
615 }
616 
/*
 * Append one style declaration of the form "key:val;" to the
 * scratch buffer.
 */
void
bufcat_style(struct html *h, const char *key, const char *val)
{
	const char	*parts[4];
	int		 i;

	parts[0] = key;
	parts[1] = ":";
	parts[2] = val;
	parts[3] = ";";

	for (i = 0; i < 4; i++)
		bufcat(h, parts[i]);
}
626 
627 void
628 bufcat(struct html *h, const char *p)
629 {
630 
631 	h->buflen = strlcat(h->buf, p, BUFSIZ);
632 	assert(h->buflen < BUFSIZ);
633 }
634 
635 void
636 bufcat_fmt(struct html *h, const char *fmt, ...)
637 {
638 	va_list		 ap;
639 
640 	va_start(ap, fmt);
641 	(void)vsnprintf(h->buf + (int)h->buflen,
642 			BUFSIZ - h->buflen - 1, fmt, ap);
643 	va_end(ap);
644 	h->buflen = strlen(h->buf);
645 }
646 
647 static void
648 bufncat(struct html *h, const char *p, size_t sz)
649 {
650 
651 	assert(h->buflen + sz + 1 < BUFSIZ);
652 	strncat(h->buf, p, sz);
653 	h->buflen += sz;
654 }
655 
656 void
657 buffmt_includes(struct html *h, const char *name)
658 {
659 	const char	*p, *pp;
660 
661 	pp = h->base_includes;
662 
663 	bufinit(h);
664 	while (NULL != (p = strchr(pp, '%'))) {
665 		bufncat(h, pp, (size_t)(p - pp));
666 		switch (*(p + 1)) {
667 		case('I'):
668 			bufcat(h, name);
669 			break;
670 		default:
671 			bufncat(h, p, 2);
672 			break;
673 		}
674 		pp = p + 2;
675 	}
676 	if (pp)
677 		bufcat(h, pp);
678 }
679 
680 void
681 buffmt_man(struct html *h,
682 		const char *name, const char *sec)
683 {
684 	const char	*p, *pp;
685 
686 	pp = h->base_man;
687 
688 	bufinit(h);
689 	while (NULL != (p = strchr(pp, '%'))) {
690 		bufncat(h, pp, (size_t)(p - pp));
691 		switch (*(p + 1)) {
692 		case('S'):
693 			bufcat(h, sec ? sec : "1");
694 			break;
695 		case('N'):
696 			bufcat_fmt(h, name);
697 			break;
698 		default:
699 			bufncat(h, p, 2);
700 			break;
701 		}
702 		pp = p + 2;
703 	}
704 	if (pp)
705 		bufcat(h, pp);
706 }
707 
708 void
709 bufcat_su(struct html *h, const char *p, const struct roffsu *su)
710 {
711 	double		 v;
712 
713 	v = su->scale;
714 	if (SCALE_MM == su->unit && 0.0 == (v /= 100.0))
715 		v = 1.0;
716 
717 	bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]);
718 }
719 
/*
 * Append "src" to the scratch buffer as an identifier, writing each
 * byte as two lower-case hexadecimal digits.
 */
void
bufcat_id(struct html *h, const char *src)
{

	/* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */

	/*
	 * Go through unsigned char: where plain char is signed, a byte
	 * of 0x80 or above would be sign-extended and printed as eight
	 * digits instead of two.
	 */
	while ('\0' != *src)
		bufcat_fmt(h, "%.2x", (unsigned int)(unsigned char)*src++);
}
729