1 /* $Id: html.c,v 1.34 2014/03/21 22:17:01 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <stdarg.h> 23 #include <stdio.h> 24 #include <stdint.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #include <unistd.h> 28 29 #include "mandoc.h" 30 #include "mandoc_aux.h" 31 #include "libmandoc.h" 32 #include "out.h" 33 #include "html.h" 34 #include "main.h" 35 36 struct htmldata { 37 const char *name; 38 int flags; 39 #define HTML_CLRLINE (1 << 0) 40 #define HTML_NOSTACK (1 << 1) 41 #define HTML_AUTOCLOSE (1 << 2) /* Tag has auto-closure. */ 42 }; 43 44 static const struct htmldata htmltags[TAG_MAX] = { 45 {"html", HTML_CLRLINE}, /* TAG_HTML */ 46 {"head", HTML_CLRLINE}, /* TAG_HEAD */ 47 {"body", HTML_CLRLINE}, /* TAG_BODY */ 48 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */ 49 {"title", HTML_CLRLINE}, /* TAG_TITLE */ 50 {"div", HTML_CLRLINE}, /* TAG_DIV */ 51 {"h1", 0}, /* TAG_H1 */ 52 {"h2", 0}, /* TAG_H2 */ 53 {"span", 0}, /* TAG_SPAN */ 54 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */ 55 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */ 56 {"a", 0}, /* TAG_A */ 57 {"table", HTML_CLRLINE}, /* TAG_TABLE */ 58 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */ 59 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */ 60 {"tr", HTML_CLRLINE}, /* TAG_TR */ 61 {"td", HTML_CLRLINE}, /* TAG_TD */ 62 {"li", HTML_CLRLINE}, /* TAG_LI */ 63 {"ul", HTML_CLRLINE}, /* TAG_UL */ 64 {"ol", HTML_CLRLINE}, /* TAG_OL */ 65 {"dl", HTML_CLRLINE}, /* TAG_DL */ 66 {"dt", HTML_CLRLINE}, /* TAG_DT */ 67 {"dd", HTML_CLRLINE}, /* TAG_DD */ 68 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */ 69 {"p", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */ 70 {"pre", HTML_CLRLINE }, /* TAG_PRE */ 71 {"b", 0 }, /* TAG_B */ 72 {"i", 0 }, /* TAG_I */ 73 {"code", 0 }, /* TAG_CODE */ 74 {"small", 0 }, /* TAG_SMALL */ 75 }; 76 77 static const char *const htmlattrs[ATTR_MAX] = { 78 "http-equiv", /* ATTR_HTTPEQUIV */ 79 "content", /* ATTR_CONTENT */ 80 "name", /* ATTR_NAME */ 81 "rel", /* ATTR_REL */ 82 "href", /* ATTR_HREF */ 83 "type", /* ATTR_TYPE */ 84 "media", /* ATTR_MEDIA */ 85 "class", /* ATTR_CLASS */ 86 "style", /* ATTR_STYLE */ 87 "width", /* ATTR_WIDTH */ 88 "id", /* ATTR_ID */ 89 "summary", /* ATTR_SUMMARY */ 90 "align", /* ATTR_ALIGN */ 91 "colspan", /* ATTR_COLSPAN */ 92 }; 93 94 static const char *const roffscales[SCALE_MAX] = { 95 "cm", /* SCALE_CM */ 96 "in", /* SCALE_IN */ 97 "pc", /* SCALE_PC */ 98 "pt", /* SCALE_PT */ 99 "em", /* SCALE_EM */ 100 "em", /* SCALE_MM */ 101 "ex", /* SCALE_EN */ 102 "ex", /* SCALE_BU */ 103 "em", /* SCALE_VS */ 104 "ex", /* SCALE_FS */ 105 }; 106 107 static void bufncat(struct html *, const char *, size_t); 108 static void print_ctag(struct html *, enum htmltag); 109 static int print_encode(struct html *, const char *, int); 110 static void print_metaf(struct html *, enum mandoc_esc); 111 static void print_attr(struct html *, const char *, const char *); 112 static void *ml_alloc(char *, enum htmltype); 113 114 static void * 115 ml_alloc(char *outopts, enum htmltype type) 116 { 117 struct html *h; 118 const char *toks[5]; 119 char *v; 120 121 toks[0] = "style"; 122 toks[1] = "man"; 123 toks[2] = "includes"; 124 toks[3] = "fragment"; 125 toks[4] = NULL; 126 127 h = mandoc_calloc(1, sizeof(struct html)); 128 129 h->type = type; 130 h->tags.head = NULL; 131 h->symtab = mchars_alloc(); 132 133 while (outopts && *outopts) 134 switch (getsubopt(&outopts, UNCONST(toks), &v)) { 135 case (0): 136 h->style = v; 137 break; 138 case (1): 139 h->base_man = v; 140 break; 141 case (2): 142 h->base_includes = v; 143 break; 144 case (3): 145 h->oflags |= HTML_FRAGMENT; 146 break; 147 default: 148 break; 149 } 150 151 return(h); 152 } 153 154 void * 155 html_alloc(char *outopts) 156 { 157 158 return(ml_alloc(outopts, HTML_HTML_4_01_STRICT)); 159 } 160 161 162 void * 163 xhtml_alloc(char *outopts) 164 { 165 166 return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT)); 167 } 168 169 170 void 171 html_free(void *p) 172 { 173 struct tag *tag; 174 struct html *h; 175 176 h = (struct html *)p; 177 178 while ((tag = h->tags.head) != NULL) { 179 h->tags.head = tag->next; 180 free(tag); 181 } 182 183 if (h->symtab) 184 mchars_free(h->symtab); 185 186 free(h); 187 } 188 189 190 void 191 print_gen_head(struct html *h) 192 { 193 struct htmlpair tag[4]; 194 195 tag[0].key = ATTR_HTTPEQUIV; 196 tag[0].val = "Content-Type"; 197 tag[1].key = ATTR_CONTENT; 198 tag[1].val = "text/html; charset=utf-8"; 199 print_otag(h, TAG_META, 2, tag); 200 201 tag[0].key = ATTR_NAME; 202 tag[0].val = "resource-type"; 203 tag[1].key = ATTR_CONTENT; 204 tag[1].val = "document"; 205 print_otag(h, TAG_META, 2, tag); 206 207 if (h->style) { 208 tag[0].key = ATTR_REL; 209 tag[0].val = "stylesheet"; 210 tag[1].key = ATTR_HREF; 211 tag[1].val = h->style; 212 tag[2].key = ATTR_TYPE; 213 tag[2].val = "text/css"; 214 tag[3].key = ATTR_MEDIA; 215 tag[3].val = "all"; 216 print_otag(h, TAG_LINK, 4, tag); 217 } 218 } 219 220 static void 221 print_metaf(struct html *h, enum mandoc_esc deco) 222 { 223 enum htmlfont font; 224 225 switch (deco) { 226 case (ESCAPE_FONTPREV): 227 font = h->metal; 228 break; 229 case (ESCAPE_FONTITALIC): 230 font = HTMLFONT_ITALIC; 231 break; 232 case (ESCAPE_FONTBOLD): 233 font = HTMLFONT_BOLD; 234 break; 235 case (ESCAPE_FONTBI): 236 font = HTMLFONT_BI; 237 break; 238 case (ESCAPE_FONT): 239 /* FALLTHROUGH */ 240 case (ESCAPE_FONTROMAN): 241 font = HTMLFONT_NONE; 242 break; 243 default: 244 abort(); 245 /* NOTREACHED */ 246 } 247 248 if (h->metaf) { 249 print_tagq(h, h->metaf); 250 h->metaf = NULL; 251 } 252 253 h->metal = h->metac; 254 h->metac = font; 255 256 switch (font) { 257 case (HTMLFONT_ITALIC): 258 h->metaf = print_otag(h, TAG_I, 0, NULL); 259 break; 260 case (HTMLFONT_BOLD): 261 h->metaf = print_otag(h, TAG_B, 0, NULL); 262 break; 263 case (HTMLFONT_BI): 264 h->metaf = print_otag(h, TAG_B, 0, NULL); 265 print_otag(h, TAG_I, 0, NULL); 266 break; 267 default: 268 break; 269 } 270 } 271 272 int 273 html_strlen(const char *cp) 274 { 275 size_t rsz; 276 int skip, sz; 277 278 /* 279 * Account for escaped sequences within string length 280 * calculations. This follows the logic in term_strlen() as we 281 * must calculate the width of produced strings. 282 * Assume that characters are always width of "1". This is 283 * hacky, but it gets the job done for approximation of widths. 284 */ 285 286 sz = 0; 287 skip = 0; 288 while (1) { 289 rsz = strcspn(cp, "\\"); 290 if (rsz) { 291 cp += rsz; 292 if (skip) { 293 skip = 0; 294 rsz--; 295 } 296 sz += rsz; 297 } 298 if ('\0' == *cp) 299 break; 300 cp++; 301 switch (mandoc_escape(&cp, NULL, NULL)) { 302 case (ESCAPE_ERROR): 303 return(sz); 304 case (ESCAPE_UNICODE): 305 /* FALLTHROUGH */ 306 case (ESCAPE_NUMBERED): 307 /* FALLTHROUGH */ 308 case (ESCAPE_SPECIAL): 309 if (skip) 310 skip = 0; 311 else 312 sz++; 313 break; 314 case (ESCAPE_SKIPCHAR): 315 skip = 1; 316 break; 317 default: 318 break; 319 } 320 } 321 return(sz); 322 } 323 324 static int 325 print_encode(struct html *h, const char *p, int norecurse) 326 { 327 size_t sz; 328 int c, len, nospace; 329 const char *seq; 330 enum mandoc_esc esc; 331 static const char rejs[8] = { '\\', '<', '>', '&', 332 ASCII_NBRSP, ASCII_HYPH, ASCII_BREAK, '\0' }; 333 334 nospace = 0; 335 336 while ('\0' != *p) { 337 if (HTML_SKIPCHAR & h->flags && '\\' != *p) { 338 h->flags &= ~HTML_SKIPCHAR; 339 p++; 340 continue; 341 } 342 343 sz = strcspn(p, rejs); 344 345 fwrite(p, 1, sz, stdout); 346 p += (int)sz; 347 348 if ('\0' == *p) 349 break; 350 351 switch (*p++) { 352 case ('<'): 353 printf("<"); 354 continue; 355 case ('>'): 356 printf(">"); 357 continue; 358 case ('&'): 359 printf("&"); 360 continue; 361 case (ASCII_NBRSP): 362 putchar('-'); 363 continue; 364 case (ASCII_HYPH): 365 putchar('-'); 366 /* FALLTHROUGH */ 367 case (ASCII_BREAK): 368 continue; 369 default: 370 break; 371 } 372 373 esc = mandoc_escape(&p, &seq, &len); 374 if (ESCAPE_ERROR == esc) 375 break; 376 377 switch (esc) { 378 case (ESCAPE_FONT): 379 /* FALLTHROUGH */ 380 case (ESCAPE_FONTPREV): 381 /* FALLTHROUGH */ 382 case (ESCAPE_FONTBOLD): 383 /* FALLTHROUGH */ 384 case (ESCAPE_FONTITALIC): 385 /* FALLTHROUGH */ 386 case (ESCAPE_FONTBI): 387 /* FALLTHROUGH */ 388 case (ESCAPE_FONTROMAN): 389 if (0 == norecurse) 390 print_metaf(h, esc); 391 continue; 392 case (ESCAPE_SKIPCHAR): 393 h->flags |= HTML_SKIPCHAR; 394 continue; 395 default: 396 break; 397 } 398 399 if (h->flags & HTML_SKIPCHAR) { 400 h->flags &= ~HTML_SKIPCHAR; 401 continue; 402 } 403 404 switch (esc) { 405 case (ESCAPE_UNICODE): 406 /* Skip passed "u" header. */ 407 c = mchars_num2uc(seq + 1, len - 1); 408 if ('\0' != c) 409 printf("&#x%x;", c); 410 break; 411 case (ESCAPE_NUMBERED): 412 c = mchars_num2char(seq, len); 413 if ('\0' != c) 414 putchar(c); 415 break; 416 case (ESCAPE_SPECIAL): 417 c = mchars_spec2cp(h->symtab, seq, len); 418 if (c > 0) 419 printf("&#%d;", c); 420 else if (-1 == c && 1 == len) 421 putchar((int)*seq); 422 break; 423 case (ESCAPE_NOSPACE): 424 if ('\0' == *p) 425 nospace = 1; 426 break; 427 default: 428 break; 429 } 430 } 431 432 return(nospace); 433 } 434 435 436 static void 437 print_attr(struct html *h, const char *key, const char *val) 438 { 439 printf(" %s=\"", key); 440 (void)print_encode(h, val, 1); 441 putchar('\"'); 442 } 443 444 445 struct tag * 446 print_otag(struct html *h, enum htmltag tag, 447 int sz, const struct htmlpair *p) 448 { 449 int i; 450 struct tag *t; 451 452 /* Push this tags onto the stack of open scopes. */ 453 454 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) { 455 t = mandoc_malloc(sizeof(struct tag)); 456 t->tag = tag; 457 t->next = h->tags.head; 458 h->tags.head = t; 459 } else 460 t = NULL; 461 462 if ( ! (HTML_NOSPACE & h->flags)) 463 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) { 464 /* Manage keeps! */ 465 if ( ! (HTML_KEEP & h->flags)) { 466 if (HTML_PREKEEP & h->flags) 467 h->flags |= HTML_KEEP; 468 putchar(' '); 469 } else 470 printf(" "); 471 } 472 473 if ( ! (h->flags & HTML_NONOSPACE)) 474 h->flags &= ~HTML_NOSPACE; 475 else 476 h->flags |= HTML_NOSPACE; 477 478 /* Print out the tag name and attributes. */ 479 480 printf("<%s", htmltags[tag].name); 481 for (i = 0; i < sz; i++) 482 print_attr(h, htmlattrs[p[i].key], p[i].val); 483 484 /* Add non-overridable attributes. */ 485 486 if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) { 487 print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml"); 488 print_attr(h, "xml:lang", "en"); 489 print_attr(h, "lang", "en"); 490 } 491 492 /* Accommodate for XML "well-formed" singleton escaping. */ 493 494 if (HTML_AUTOCLOSE & htmltags[tag].flags) 495 switch (h->type) { 496 case (HTML_XHTML_1_0_STRICT): 497 putchar('/'); 498 break; 499 default: 500 break; 501 } 502 503 putchar('>'); 504 505 h->flags |= HTML_NOSPACE; 506 507 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags) 508 putchar('\n'); 509 510 return(t); 511 } 512 513 514 static void 515 print_ctag(struct html *h, enum htmltag tag) 516 { 517 518 printf("</%s>", htmltags[tag].name); 519 if (HTML_CLRLINE & htmltags[tag].flags) { 520 h->flags |= HTML_NOSPACE; 521 putchar('\n'); 522 } 523 } 524 525 void 526 print_gen_decls(struct html *h) 527 { 528 const char *doctype; 529 const char *dtd; 530 const char *name; 531 532 switch (h->type) { 533 case (HTML_HTML_4_01_STRICT): 534 name = "HTML"; 535 doctype = "-//W3C//DTD HTML 4.01//EN"; 536 dtd = "http://www.w3.org/TR/html4/strict.dtd"; 537 break; 538 default: 539 puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); 540 name = "html"; 541 doctype = "-//W3C//DTD XHTML 1.0 Strict//EN"; 542 dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 543 break; 544 } 545 546 printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n", 547 name, doctype, dtd); 548 } 549 550 void 551 print_text(struct html *h, const char *word) 552 { 553 554 if ( ! (HTML_NOSPACE & h->flags)) { 555 /* Manage keeps! */ 556 if ( ! (HTML_KEEP & h->flags)) { 557 if (HTML_PREKEEP & h->flags) 558 h->flags |= HTML_KEEP; 559 putchar(' '); 560 } else 561 printf(" "); 562 } 563 564 assert(NULL == h->metaf); 565 switch (h->metac) { 566 case (HTMLFONT_ITALIC): 567 h->metaf = print_otag(h, TAG_I, 0, NULL); 568 break; 569 case (HTMLFONT_BOLD): 570 h->metaf = print_otag(h, TAG_B, 0, NULL); 571 break; 572 case (HTMLFONT_BI): 573 h->metaf = print_otag(h, TAG_B, 0, NULL); 574 print_otag(h, TAG_I, 0, NULL); 575 break; 576 default: 577 break; 578 } 579 580 assert(word); 581 if ( ! print_encode(h, word, 0)) { 582 if ( ! (h->flags & HTML_NONOSPACE)) 583 h->flags &= ~HTML_NOSPACE; 584 } else 585 h->flags |= HTML_NOSPACE; 586 587 if (h->metaf) { 588 print_tagq(h, h->metaf); 589 h->metaf = NULL; 590 } 591 592 h->flags &= ~HTML_IGNDELIM; 593 } 594 595 596 void 597 print_tagq(struct html *h, const struct tag *until) 598 { 599 struct tag *tag; 600 601 while ((tag = h->tags.head) != NULL) { 602 /* 603 * Remember to close out and nullify the current 604 * meta-font and table, if applicable. 605 */ 606 if (tag == h->metaf) 607 h->metaf = NULL; 608 if (tag == h->tblt) 609 h->tblt = NULL; 610 print_ctag(h, tag->tag); 611 h->tags.head = tag->next; 612 free(tag); 613 if (until && tag == until) 614 return; 615 } 616 } 617 618 619 void 620 print_stagq(struct html *h, const struct tag *suntil) 621 { 622 struct tag *tag; 623 624 while ((tag = h->tags.head) != NULL) { 625 if (suntil && tag == suntil) 626 return; 627 /* 628 * Remember to close out and nullify the current 629 * meta-font and table, if applicable. 630 */ 631 if (tag == h->metaf) 632 h->metaf = NULL; 633 if (tag == h->tblt) 634 h->tblt = NULL; 635 print_ctag(h, tag->tag); 636 h->tags.head = tag->next; 637 free(tag); 638 } 639 } 640 641 void 642 bufinit(struct html *h) 643 { 644 645 h->buf[0] = '\0'; 646 h->buflen = 0; 647 } 648 649 void 650 bufcat_style(struct html *h, const char *key, const char *val) 651 { 652 653 bufcat(h, key); 654 bufcat(h, ":"); 655 bufcat(h, val); 656 bufcat(h, ";"); 657 } 658 659 void 660 bufcat(struct html *h, const char *p) 661 { 662 663 h->buflen = strlcat(h->buf, p, BUFSIZ); 664 assert(h->buflen < BUFSIZ); 665 } 666 667 void 668 bufcat_fmt(struct html *h, const char *fmt, ...) 669 { 670 va_list ap; 671 672 va_start(ap, fmt); 673 (void)vsnprintf(h->buf + (int)h->buflen, 674 BUFSIZ - h->buflen - 1, fmt, ap); 675 va_end(ap); 676 h->buflen = strlen(h->buf); 677 } 678 679 static void 680 bufncat(struct html *h, const char *p, size_t sz) 681 { 682 683 assert(h->buflen + sz + 1 < BUFSIZ); 684 strncat(h->buf, p, sz); 685 h->buflen += sz; 686 } 687 688 void 689 buffmt_includes(struct html *h, const char *name) 690 { 691 const char *p, *pp; 692 693 pp = h->base_includes; 694 695 bufinit(h); 696 while (NULL != (p = strchr(pp, '%'))) { 697 bufncat(h, pp, (size_t)(p - pp)); 698 switch (*(p + 1)) { 699 case('I'): 700 bufcat(h, name); 701 break; 702 default: 703 bufncat(h, p, 2); 704 break; 705 } 706 pp = p + 2; 707 } 708 if (pp) 709 bufcat(h, pp); 710 } 711 712 void 713 buffmt_man(struct html *h, 714 const char *name, const char *sec) 715 { 716 const char *p, *pp; 717 718 pp = h->base_man; 719 720 bufinit(h); 721 while (NULL != (p = strchr(pp, '%'))) { 722 bufncat(h, pp, (size_t)(p - pp)); 723 switch (*(p + 1)) { 724 case('S'): 725 bufcat(h, sec ? sec : "1"); 726 break; 727 case('N'): 728 bufcat_fmt(h, "%s", name); 729 break; 730 default: 731 bufncat(h, p, 2); 732 break; 733 } 734 pp = p + 2; 735 } 736 if (pp) 737 bufcat(h, pp); 738 } 739 740 void 741 bufcat_su(struct html *h, const char *p, const struct roffsu *su) 742 { 743 double v; 744 745 v = su->scale; 746 if (SCALE_MM == su->unit && 0.0 == (v /= 100.0)) 747 v = 1.0; 748 749 bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]); 750 } 751 752 void 753 bufcat_id(struct html *h, const char *src) 754 { 755 756 /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */ 757 758 while ('\0' != *src) 759 bufcat_fmt(h, "%.2x", *src++); 760 } 761