xref: /netbsd-src/external/bsd/mdocml/dist/html.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$Vendor-Id: html.c,v 1.110 2010/07/26 22:26:05 kristaps Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include <sys/types.h>
22 
23 #include <assert.h>
24 #include <ctype.h>
25 #include <stdarg.h>
26 #include <stdio.h>
27 #include <stdint.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include "mandoc.h"
33 #include "out.h"
34 #include "chars.h"
35 #include "html.h"
36 #include "main.h"
37 
/*
 * Static description of one HTML element: the name emitted in markup
 * plus flags steering how print_otag() and print_ctag() handle it.
 */
struct	htmldata {
	const char	 *name; /* Element name printed after '<' or "</". */
	int		  flags; /* Bitwise OR of the HTML_* values below. */
#define	HTML_CLRLINE	 (1 << 0) /* No space before open tag; newline after close tag. */
#define	HTML_NOSTACK	 (1 << 1) /* Tag is not pushed onto the open-tag stack. */
#define	HTML_AUTOCLOSE	 (1 << 2) /* Tag has auto-closure. */
};
45 
/*
 * Per-element properties, indexed by enum htmltag (see print_otag() and
 * print_ctag()).  The trailing comment on each row names the enumerator
 * the row is expected to correspond to; the enum itself is declared
 * elsewhere, so keep both in the same order.
 */
static	const struct htmldata htmltags[TAG_MAX] = {
	{"html",	HTML_CLRLINE}, /* TAG_HTML */
	{"head",	HTML_CLRLINE}, /* TAG_HEAD */
	{"body",	HTML_CLRLINE}, /* TAG_BODY */
	{"meta",	HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */
	{"title",	HTML_CLRLINE}, /* TAG_TITLE */
	{"div",		HTML_CLRLINE}, /* TAG_DIV */
	{"h1",		0}, /* TAG_H1 */
	{"h2",		0}, /* TAG_H2 */
	{"span",	0}, /* TAG_SPAN */
	{"link",	HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */
	{"br",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */
	{"a",		0}, /* TAG_A */
	{"table",	HTML_CLRLINE}, /* TAG_TABLE */
	{"col",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */
	{"tr",		HTML_CLRLINE}, /* TAG_TR */
	{"td",		HTML_CLRLINE}, /* TAG_TD */
	{"li",		HTML_CLRLINE}, /* TAG_LI */
	{"ul",		HTML_CLRLINE}, /* TAG_UL */
	{"ol",		HTML_CLRLINE}, /* TAG_OL */
};
67 
/*
 * CSS class names for the font spans opened by print_ofont(), indexed
 * by enum htmlfont.
 */
static	const char	*const htmlfonts[HTMLFONT_MAX] = {
	"roman",
	"bold",
	"italic"
};
73 
/*
 * Attribute names as written into markup, indexed by the key of a
 * struct htmlpair (see the attribute loop in print_otag()).
 */
static	const char	*const htmlattrs[ATTR_MAX] = {
	"http-equiv",
	"content",
	"name",
	"rel",
	"href",
	"type",
	"media",
	"class",
	"style",
	"width",
	"valign",
	"target",
	"id",
	"summary",
};
90 
/* Forward declarations of the helpers that are private to this file. */
static	void		  print_spec(struct html *, enum roffdeco,
				const char *, size_t);
static	void		  print_res(struct html *, const char *, size_t);
static	void		  print_ctag(struct html *, enum htmltag);
static	void		  print_doctype(struct html *);
static	void		  print_xmltype(struct html *);
static	int		  print_encode(struct html *, const char *, int);
static	void		  print_metaf(struct html *, enum roffdeco);
static	void		  print_attr(struct html *,
				const char *, const char *);
static	void		 *ml_alloc(char *, enum htmltype);
102 
103 
104 static void *
105 ml_alloc(char *outopts, enum htmltype type)
106 {
107 	struct html	*h;
108 	const char	*toks[4];
109 	char		*v;
110 
111 	toks[0] = "style";
112 	toks[1] = "man";
113 	toks[2] = "includes";
114 	toks[3] = NULL;
115 
116 	h = calloc(1, sizeof(struct html));
117 	if (NULL == h) {
118 		perror(NULL);
119 		exit(EXIT_FAILURE);
120 	}
121 
122 	h->type = type;
123 	h->tags.head = NULL;
124 	h->ords.head = NULL;
125 	h->symtab = chars_init(CHARS_HTML);
126 
127 	while (outopts && *outopts)
128 		switch (getsubopt(&outopts, UNCONST(toks), &v)) {
129 		case (0):
130 			h->style = v;
131 			break;
132 		case (1):
133 			h->base_man = v;
134 			break;
135 		case (2):
136 			h->base_includes = v;
137 			break;
138 		default:
139 			break;
140 		}
141 
142 	return(h);
143 }
144 
145 void *
146 html_alloc(char *outopts)
147 {
148 
149 	return(ml_alloc(outopts, HTML_HTML_4_01_STRICT));
150 }
151 
152 
153 void *
154 xhtml_alloc(char *outopts)
155 {
156 
157 	return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT));
158 }
159 
160 
161 void
162 html_free(void *p)
163 {
164 	struct tag	*tag;
165 	struct ord	*ord;
166 	struct html	*h;
167 
168 	h = (struct html *)p;
169 
170 	while ((ord = h->ords.head) != NULL) {
171 		h->ords.head = ord->next;
172 		free(ord);
173 	}
174 
175 	while ((tag = h->tags.head) != NULL) {
176 		h->tags.head = tag->next;
177 		free(tag);
178 	}
179 
180 	if (h->symtab)
181 		chars_free(h->symtab);
182 
183 	free(h);
184 }
185 
186 
187 void
188 print_gen_head(struct html *h)
189 {
190 	struct htmlpair	 tag[4];
191 
192 	tag[0].key = ATTR_HTTPEQUIV;
193 	tag[0].val = "Content-Type";
194 	tag[1].key = ATTR_CONTENT;
195 	tag[1].val = "text/html; charset=utf-8";
196 	print_otag(h, TAG_META, 2, tag);
197 
198 	tag[0].key = ATTR_NAME;
199 	tag[0].val = "resource-type";
200 	tag[1].key = ATTR_CONTENT;
201 	tag[1].val = "document";
202 	print_otag(h, TAG_META, 2, tag);
203 
204 	if (h->style) {
205 		tag[0].key = ATTR_REL;
206 		tag[0].val = "stylesheet";
207 		tag[1].key = ATTR_HREF;
208 		tag[1].val = h->style;
209 		tag[2].key = ATTR_TYPE;
210 		tag[2].val = "text/css";
211 		tag[3].key = ATTR_MEDIA;
212 		tag[3].val = "all";
213 		print_otag(h, TAG_LINK, 4, tag);
214 	}
215 }
216 
217 
218 static void
219 print_spec(struct html *h, enum roffdeco d, const char *p, size_t len)
220 {
221 	int		 cp;
222 	const char	*rhs;
223 	size_t		 sz;
224 
225 	if ((cp = chars_spec2cp(h->symtab, p, len)) > 0) {
226 		printf("&#%d;", cp);
227 		return;
228 	} else if (-1 == cp && DECO_SSPECIAL == d) {
229 		fwrite(p, 1, len, stdout);
230 		return;
231 	} else if (-1 == cp)
232 		return;
233 
234 	if (NULL != (rhs = chars_spec2str(h->symtab, p, len, &sz)))
235 		fwrite(rhs, 1, sz, stdout);
236 }
237 
238 
239 static void
240 print_res(struct html *h, const char *p, size_t len)
241 {
242 	int		 cp;
243 	const char	*rhs;
244 	size_t		 sz;
245 
246 	if ((cp = chars_res2cp(h->symtab, p, len)) > 0) {
247 		printf("&#%d;", cp);
248 		return;
249 	} else if (-1 == cp)
250 		return;
251 
252 	if (NULL != (rhs = chars_res2str(h->symtab, p, len, &sz)))
253 		fwrite(rhs, 1, sz, stdout);
254 }
255 
256 
257 struct tag *
258 print_ofont(struct html *h, enum htmlfont font)
259 {
260 	struct htmlpair	 tag;
261 
262 	h->metal = h->metac;
263 	h->metac = font;
264 
265 	/* FIXME: DECO_ROMAN should just close out preexisting. */
266 
267 	if (h->metaf && h->tags.head == h->metaf)
268 		print_tagq(h, h->metaf);
269 
270 	PAIR_CLASS_INIT(&tag, htmlfonts[font]);
271 	h->metaf = print_otag(h, TAG_SPAN, 1, &tag);
272 	return(h->metaf);
273 }
274 
275 
276 static void
277 print_metaf(struct html *h, enum roffdeco deco)
278 {
279 	enum htmlfont	 font;
280 
281 	switch (deco) {
282 	case (DECO_PREVIOUS):
283 		font = h->metal;
284 		break;
285 	case (DECO_ITALIC):
286 		font = HTMLFONT_ITALIC;
287 		break;
288 	case (DECO_BOLD):
289 		font = HTMLFONT_BOLD;
290 		break;
291 	case (DECO_ROMAN):
292 		font = HTMLFONT_NONE;
293 		break;
294 	default:
295 		abort();
296 		/* NOTREACHED */
297 	}
298 
299 	(void)print_ofont(h, font);
300 }
301 
302 
303 static int
304 print_encode(struct html *h, const char *p, int norecurse)
305 {
306 	size_t		 sz;
307 	int		 len, nospace;
308 	const char	*seq;
309 	enum roffdeco	 deco;
310 	static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };
311 
312 	nospace = 0;
313 
314 	for (; *p; p++) {
315 		sz = strcspn(p, rejs);
316 
317 		fwrite(p, 1, sz, stdout);
318 		p += /* LINTED */
319 			sz;
320 
321 		if ('<' == *p) {
322 			printf("&lt;");
323 			continue;
324 		} else if ('>' == *p) {
325 			printf("&gt;");
326 			continue;
327 		} else if ('&' == *p) {
328 			printf("&amp;");
329 			continue;
330 		} else if (ASCII_HYPH == *p) {
331 			/*
332 			 * Note: "soft hyphens" aren't graphically
333 			 * displayed when not breaking the text; we want
334 			 * them to be displayed.
335 			 */
336 			/*printf("&#173;");*/
337 			putchar('-');
338 			continue;
339 		} else if ('\0' == *p)
340 			break;
341 
342 		seq = ++p;
343 		len = a2roffdeco(&deco, &seq, &sz);
344 
345 		switch (deco) {
346 		case (DECO_RESERVED):
347 			print_res(h, seq, sz);
348 			break;
349 		case (DECO_SSPECIAL):
350 			/* FALLTHROUGH */
351 		case (DECO_SPECIAL):
352 			print_spec(h, deco, seq, sz);
353 			break;
354 		case (DECO_PREVIOUS):
355 			/* FALLTHROUGH */
356 		case (DECO_BOLD):
357 			/* FALLTHROUGH */
358 		case (DECO_ITALIC):
359 			/* FALLTHROUGH */
360 		case (DECO_ROMAN):
361 			if (norecurse)
362 				break;
363 			print_metaf(h, deco);
364 			break;
365 		default:
366 			break;
367 		}
368 
369 		p += len - 1;
370 
371 		if (DECO_NOSPACE == deco && '\0' == *(p + 1))
372 			nospace = 1;
373 	}
374 
375 	return(nospace);
376 }
377 
378 
/*
 * Print one attribute as ` key="val"`.  The key is written as is; the
 * value goes through print_encode() with font escapes disabled.
 */
static void
print_attr(struct html *h, const char *key, const char *val)
{
	putchar(' ');
	fputs(key, stdout);
	fputs("=\"", stdout);
	(void)print_encode(h, val, 1);
	putchar('"');
}
386 
387 
388 struct tag *
389 print_otag(struct html *h, enum htmltag tag,
390 		int sz, const struct htmlpair *p)
391 {
392 	int		 i;
393 	struct tag	*t;
394 
395 	/* Push this tags onto the stack of open scopes. */
396 
397 	if ( ! (HTML_NOSTACK & htmltags[tag].flags)) {
398 		t = malloc(sizeof(struct tag));
399 		if (NULL == t) {
400 			perror(NULL);
401 			exit(EXIT_FAILURE);
402 		}
403 		t->tag = tag;
404 		t->next = h->tags.head;
405 		h->tags.head = t;
406 	} else
407 		t = NULL;
408 
409 	if ( ! (HTML_NOSPACE & h->flags))
410 		if ( ! (HTML_CLRLINE & htmltags[tag].flags)) {
411 			/* Manage keeps! */
412 			if ( ! (HTML_KEEP & h->flags)) {
413 				if (HTML_PREKEEP & h->flags)
414 					h->flags |= HTML_KEEP;
415 				putchar(' ');
416 			} else
417 				printf("&#160;");
418 		}
419 
420 	if ( ! (h->flags & HTML_NONOSPACE))
421 		h->flags &= ~HTML_NOSPACE;
422 	else
423 		h->flags |= HTML_NOSPACE;
424 
425 	/* Print out the tag name and attributes. */
426 
427 	printf("<%s", htmltags[tag].name);
428 	for (i = 0; i < sz; i++)
429 		print_attr(h, htmlattrs[p[i].key], p[i].val);
430 
431 	/* Add non-overridable attributes. */
432 
433 	if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) {
434 		print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml");
435 		print_attr(h, "xml:lang", "en");
436 		print_attr(h, "lang", "en");
437 	}
438 
439 	/* Accomodate for XML "well-formed" singleton escaping. */
440 
441 	if (HTML_AUTOCLOSE & htmltags[tag].flags)
442 		switch (h->type) {
443 		case (HTML_XHTML_1_0_STRICT):
444 			putchar('/');
445 			break;
446 		default:
447 			break;
448 		}
449 
450 	putchar('>');
451 
452 	h->flags |= HTML_NOSPACE;
453 	return(t);
454 }
455 
456 
457 static void
458 print_ctag(struct html *h, enum htmltag tag)
459 {
460 
461 	printf("</%s>", htmltags[tag].name);
462 	if (HTML_CLRLINE & htmltags[tag].flags) {
463 		h->flags |= HTML_NOSPACE;
464 		putchar('\n');
465 	}
466 }
467 
468 
/*
 * Emit the declarations that precede the root element: the XML
 * declaration (XHTML output only) followed by the DOCTYPE.
 */
void
print_gen_decls(struct html *h)
{
	print_xmltype(h);
	print_doctype(h);
}
476 
477 
478 static void
479 print_xmltype(struct html *h)
480 {
481 
482 	if (HTML_XHTML_1_0_STRICT == h->type)
483 		printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
484 }
485 
486 
487 static void
488 print_doctype(struct html *h)
489 {
490 	const char	*doctype;
491 	const char	*dtd;
492 	const char	*name;
493 
494 	switch (h->type) {
495 	case (HTML_HTML_4_01_STRICT):
496 		name = "HTML";
497 		doctype = "-//W3C//DTD HTML 4.01//EN";
498 		dtd = "http://www.w3.org/TR/html4/strict.dtd";
499 		break;
500 	default:
501 		name = "html";
502 		doctype = "-//W3C//DTD XHTML 1.0 Strict//EN";
503 		dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
504 		break;
505 	}
506 
507 	printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n",
508 			name, doctype, dtd);
509 }
510 
511 
512 void
513 print_text(struct html *h, const char *word)
514 {
515 
516 	if (word[0] && '\0' == word[1])
517 		switch (word[0]) {
518 		case('.'):
519 			/* FALLTHROUGH */
520 		case(','):
521 			/* FALLTHROUGH */
522 		case(';'):
523 			/* FALLTHROUGH */
524 		case(':'):
525 			/* FALLTHROUGH */
526 		case('?'):
527 			/* FALLTHROUGH */
528 		case('!'):
529 			/* FALLTHROUGH */
530 		case(')'):
531 			/* FALLTHROUGH */
532 		case(']'):
533 			if ( ! (HTML_IGNDELIM & h->flags))
534 				h->flags |= HTML_NOSPACE;
535 			break;
536 		default:
537 			break;
538 		}
539 
540 	if ( ! (HTML_NOSPACE & h->flags)) {
541 		/* Manage keeps! */
542 		if ( ! (HTML_KEEP & h->flags)) {
543 			if (HTML_PREKEEP & h->flags)
544 				h->flags |= HTML_KEEP;
545 			putchar(' ');
546 		} else
547 			printf("&#160;");
548 	}
549 
550 	assert(word);
551 	if ( ! print_encode(h, word, 0))
552 		if ( ! (h->flags & HTML_NONOSPACE))
553 			h->flags &= ~HTML_NOSPACE;
554 
555 	/*
556 	 * Note that we don't process the pipe: the parser sees it as
557 	 * punctuation, but we don't in terms of typography.
558 	 */
559 	if (word[0] && '\0' == word[1])
560 		switch (word[0]) {
561 		case('('):
562 			/* FALLTHROUGH */
563 		case('['):
564 			h->flags |= HTML_NOSPACE;
565 			break;
566 		default:
567 			break;
568 		}
569 }
570 
571 
572 void
573 print_tagq(struct html *h, const struct tag *until)
574 {
575 	struct tag	*tag;
576 
577 	while ((tag = h->tags.head) != NULL) {
578 		if (tag == h->metaf)
579 			h->metaf = NULL;
580 		print_ctag(h, tag->tag);
581 		h->tags.head = tag->next;
582 		free(tag);
583 		if (until && tag == until)
584 			return;
585 	}
586 }
587 
588 
589 void
590 print_stagq(struct html *h, const struct tag *suntil)
591 {
592 	struct tag	*tag;
593 
594 	while ((tag = h->tags.head) != NULL) {
595 		if (suntil && tag == suntil)
596 			return;
597 		if (tag == h->metaf)
598 			h->metaf = NULL;
599 		print_ctag(h, tag->tag);
600 		h->tags.head = tag->next;
601 		free(tag);
602 	}
603 }
604 
605 
606 void
607 bufinit(struct html *h)
608 {
609 
610 	h->buf[0] = '\0';
611 	h->buflen = 0;
612 }
613 
614 
/* Append the CSS declaration "key:val;" to the scratch buffer. */
void
bufcat_style(struct html *h, const char *key, const char *val)
{
	bufncat(h, key, strlen(key));
	bufncat(h, ":", 1);
	bufncat(h, val, strlen(val));
	bufncat(h, ";", 1);
}
624 
625 
/* Append the NUL-terminated string p to the scratch buffer. */
void
bufcat(struct html *h, const char *p)
{
	size_t		 len;

	len = strlen(p);
	bufncat(h, p, len);
}
632 
633 
634 void
635 buffmt(struct html *h, const char *fmt, ...)
636 {
637 	va_list		 ap;
638 
639 	va_start(ap, fmt);
640 	(void)vsnprintf(h->buf + (int)h->buflen,
641 			BUFSIZ - h->buflen - 1, fmt, ap);
642 	va_end(ap);
643 	h->buflen = strlen(h->buf);
644 }
645 
646 
647 void
648 bufncat(struct html *h, const char *p, size_t sz)
649 {
650 
651 	if (h->buflen + sz > BUFSIZ - 1)
652 		sz = BUFSIZ - 1 - h->buflen;
653 
654 	(void)strncat(h->buf, p, sz);
655 	h->buflen += sz;
656 }
657 
658 
659 void
660 buffmt_includes(struct html *h, const char *name)
661 {
662 	const char	*p, *pp;
663 
664 	pp = h->base_includes;
665 
666 	while (NULL != (p = strchr(pp, '%'))) {
667 		bufncat(h, pp, (size_t)(p - pp));
668 		switch (*(p + 1)) {
669 		case('I'):
670 			bufcat(h, name);
671 			break;
672 		default:
673 			bufncat(h, p, 2);
674 			break;
675 		}
676 		pp = p + 2;
677 	}
678 	if (pp)
679 		bufcat(h, pp);
680 }
681 
682 
683 void
684 buffmt_man(struct html *h,
685 		const char *name, const char *sec)
686 {
687 	const char	*p, *pp;
688 
689 	pp = h->base_man;
690 
691 	/* LINTED */
692 	while (NULL != (p = strchr(pp, '%'))) {
693 		bufncat(h, pp, (size_t)(p - pp));
694 		switch (*(p + 1)) {
695 		case('S'):
696 			bufcat(h, sec ? sec : "1");
697 			break;
698 		case('N'):
699 			buffmt(h, name);
700 			break;
701 		default:
702 			bufncat(h, p, 2);
703 			break;
704 		}
705 		pp = p + 2;
706 	}
707 	if (pp)
708 		bufcat(h, pp);
709 }
710 
711 
712 void
713 bufcat_su(struct html *h, const char *p, const struct roffsu *su)
714 {
715 	double		 v;
716 	const char	*u;
717 
718 	v = su->scale;
719 
720 	switch (su->unit) {
721 	case (SCALE_CM):
722 		u = "cm";
723 		break;
724 	case (SCALE_IN):
725 		u = "in";
726 		break;
727 	case (SCALE_PC):
728 		u = "pc";
729 		break;
730 	case (SCALE_PT):
731 		u = "pt";
732 		break;
733 	case (SCALE_EM):
734 		u = "em";
735 		break;
736 	case (SCALE_MM):
737 		if (0 == (v /= 100))
738 			v = 1;
739 		u = "em";
740 		break;
741 	case (SCALE_EN):
742 		u = "ex";
743 		break;
744 	case (SCALE_BU):
745 		u = "ex";
746 		break;
747 	case (SCALE_VS):
748 		u = "em";
749 		break;
750 	default:
751 		u = "ex";
752 		break;
753 	}
754 
755 	/*
756 	 * XXX: the CSS spec isn't clear as to which types accept
757 	 * integer or real numbers, so we just make them all decimals.
758 	 */
759 	buffmt(h, "%s: %.2f%s;", p, v, u);
760 }
761 
762 
/*
 * Append an identifier derived from src to the string in dst.
 *
 * sz is the total size of the dst buffer.  The result is the letter
 * 'x' followed by each byte of src as two lower-case hexadecimal
 * digits, truncated to fit.  More than two bytes must be free after
 * the existing contents of dst; this is asserted.
 *
 * Every byte is converted through unsigned char so that values of
 * 0x80 and above yield two digits even where plain char is signed.
 */
void
html_idcat(char *dst, const char *src, int sz)
{
	int		 ssz;

	assert(sz);

	/* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */

	for ( ; *dst != '\0' && sz; dst++, sz--)
		/* Jump to end. */ ;

	assert(sz > 2);

	/* We can't start with a number (bah). */

	*dst++ = 'x';
	*dst = '\0';
	sz--;

	for ( ; *src != '\0' && sz > 1; src++) {
		ssz = snprintf(dst, (size_t)sz, "%.2x",
				(unsigned int)(unsigned char)*src);
		sz -= ssz;
		dst += ssz;
	}
}
789