1 /* $OpenBSD: html.c,v 1.53 2014/12/02 10:07:17 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <stdarg.h> 23 #include <stdio.h> 24 #include <stdint.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #include <unistd.h> 28 29 #include "mandoc.h" 30 #include "mandoc_aux.h" 31 #include "out.h" 32 #include "html.h" 33 #include "main.h" 34 35 struct htmldata { 36 const char *name; 37 int flags; 38 #define HTML_CLRLINE (1 << 0) 39 #define HTML_NOSTACK (1 << 1) 40 #define HTML_AUTOCLOSE (1 << 2) /* Tag has auto-closure. */ 41 }; 42 43 static const struct htmldata htmltags[TAG_MAX] = { 44 {"html", HTML_CLRLINE}, /* TAG_HTML */ 45 {"head", HTML_CLRLINE}, /* TAG_HEAD */ 46 {"body", HTML_CLRLINE}, /* TAG_BODY */ 47 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */ 48 {"title", HTML_CLRLINE}, /* TAG_TITLE */ 49 {"div", HTML_CLRLINE}, /* TAG_DIV */ 50 {"h1", 0}, /* TAG_H1 */ 51 {"h2", 0}, /* TAG_H2 */ 52 {"span", 0}, /* TAG_SPAN */ 53 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */ 54 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */ 55 {"a", 0}, /* TAG_A */ 56 {"table", HTML_CLRLINE}, /* TAG_TABLE */ 57 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */ 58 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */ 59 {"tr", HTML_CLRLINE}, /* TAG_TR */ 60 {"td", HTML_CLRLINE}, /* TAG_TD */ 61 {"li", HTML_CLRLINE}, /* TAG_LI */ 62 {"ul", HTML_CLRLINE}, /* TAG_UL */ 63 {"ol", HTML_CLRLINE}, /* TAG_OL */ 64 {"dl", HTML_CLRLINE}, /* TAG_DL */ 65 {"dt", HTML_CLRLINE}, /* TAG_DT */ 66 {"dd", HTML_CLRLINE}, /* TAG_DD */ 67 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */ 68 {"pre", HTML_CLRLINE }, /* TAG_PRE */ 69 {"b", 0 }, /* TAG_B */ 70 {"i", 0 }, /* TAG_I */ 71 {"code", 0 }, /* TAG_CODE */ 72 {"small", 0 }, /* TAG_SMALL */ 73 {"style", HTML_CLRLINE}, /* TAG_STYLE */ 74 {"math", HTML_CLRLINE}, /* TAG_MATH */ 75 {"mrow", 0}, /* TAG_MROW */ 76 {"mi", 0}, /* TAG_MI */ 77 {"mo", 0}, /* TAG_MO */ 78 {"msup", 0}, /* TAG_MSUP */ 79 {"msub", 0}, /* TAG_MSUB */ 80 {"msubsup", 0}, /* TAG_MSUBSUP */ 81 {"mfrac", 0}, /* TAG_MFRAC */ 82 {"msqrt", 0}, /* TAG_MSQRT */ 83 {"mfenced", 0}, /* TAG_MFENCED */ 84 {"mtable", 0}, /* TAG_MTABLE */ 85 {"mtr", 0}, /* TAG_MTR */ 86 {"mtd", 0}, /* TAG_MTD */ 87 {"munderover", 0}, /* TAG_MUNDEROVER */ 88 {"munder", 0}, /* TAG_MUNDER*/ 89 {"mover", 0}, /* TAG_MOVER*/ 90 }; 91 92 static const char *const htmlattrs[ATTR_MAX] = { 93 "name", /* ATTR_NAME */ 94 "rel", /* ATTR_REL */ 95 "href", /* ATTR_HREF */ 96 "type", /* ATTR_TYPE */ 97 "media", /* ATTR_MEDIA */ 98 "class", /* ATTR_CLASS */ 99 "style", /* ATTR_STYLE */ 100 "id", /* ATTR_ID */ 101 "colspan", /* ATTR_COLSPAN */ 102 "charset", /* ATTR_CHARSET */ 103 "open", /* ATTR_OPEN */ 104 "close", /* ATTR_CLOSE */ 105 "mathvariant", /* ATTR_MATHVARIANT */ 106 }; 107 108 static const char *const roffscales[SCALE_MAX] = { 109 "cm", /* SCALE_CM */ 110 "in", /* SCALE_IN */ 111 "pc", /* SCALE_PC */ 112 "pt", /* SCALE_PT */ 113 "em", /* SCALE_EM */ 114 "em", /* SCALE_MM */ 115 "ex", /* SCALE_EN */ 116 "ex", /* SCALE_BU */ 117 "em", /* SCALE_VS */ 118 "ex", /* SCALE_FS */ 119 }; 120 121 static void bufncat(struct html *, const char *, size_t); 122 static void print_ctag(struct html *, enum htmltag); 123 static int print_escape(char); 124 static int print_encode(struct html *, const char *, int); 125 static void print_metaf(struct html *, enum mandoc_esc); 126 static void print_attr(struct html *, const char *, const char *); 127 128 129 void * 130 html_alloc(const struct mchars *mchars, char *outopts) 131 { 132 struct html *h; 133 const char *toks[5]; 134 char *v; 135 136 toks[0] = "style"; 137 toks[1] = "man"; 138 toks[2] = "includes"; 139 toks[3] = "fragment"; 140 toks[4] = NULL; 141 142 h = mandoc_calloc(1, sizeof(struct html)); 143 144 h->tags.head = NULL; 145 h->symtab = mchars; 146 147 while (outopts && *outopts) 148 switch (getsubopt(&outopts, UNCONST(toks), &v)) { 149 case 0: 150 h->style = v; 151 break; 152 case 1: 153 h->base_man = v; 154 break; 155 case 2: 156 h->base_includes = v; 157 break; 158 case 3: 159 h->oflags |= HTML_FRAGMENT; 160 break; 161 default: 162 break; 163 } 164 165 return(h); 166 } 167 168 void 169 html_free(void *p) 170 { 171 struct tag *tag; 172 struct html *h; 173 174 h = (struct html *)p; 175 176 while ((tag = h->tags.head) != NULL) { 177 h->tags.head = tag->next; 178 free(tag); 179 } 180 181 free(h); 182 } 183 184 void 185 print_gen_head(struct html *h) 186 { 187 struct htmlpair tag[4]; 188 struct tag *t; 189 190 tag[0].key = ATTR_CHARSET; 191 tag[0].val = "utf-8"; 192 print_otag(h, TAG_META, 1, tag); 193 194 /* 195 * Print a default style-sheet. 196 */ 197 t = print_otag(h, TAG_STYLE, 0, NULL); 198 print_text(h, "table.head, table.foot { width: 100%; }\n" 199 "td.head-rtitle, td.foot-os { text-align: right; }\n" 200 "td.head-vol { text-align: center; }\n" 201 "table.foot td { width: 50%; }\n" 202 "table.head td { width: 33%; }\n" 203 "div.spacer { margin: 1em 0; }\n"); 204 print_tagq(h, t); 205 206 if (h->style) { 207 tag[0].key = ATTR_REL; 208 tag[0].val = "stylesheet"; 209 tag[1].key = ATTR_HREF; 210 tag[1].val = h->style; 211 tag[2].key = ATTR_TYPE; 212 tag[2].val = "text/css"; 213 tag[3].key = ATTR_MEDIA; 214 tag[3].val = "all"; 215 print_otag(h, TAG_LINK, 4, tag); 216 } 217 } 218 219 static void 220 print_metaf(struct html *h, enum mandoc_esc deco) 221 { 222 enum htmlfont font; 223 224 switch (deco) { 225 case ESCAPE_FONTPREV: 226 font = h->metal; 227 break; 228 case ESCAPE_FONTITALIC: 229 font = HTMLFONT_ITALIC; 230 break; 231 case ESCAPE_FONTBOLD: 232 font = HTMLFONT_BOLD; 233 break; 234 case ESCAPE_FONTBI: 235 font = HTMLFONT_BI; 236 break; 237 case ESCAPE_FONT: 238 /* FALLTHROUGH */ 239 case ESCAPE_FONTROMAN: 240 font = HTMLFONT_NONE; 241 break; 242 default: 243 abort(); 244 /* NOTREACHED */ 245 } 246 247 if (h->metaf) { 248 print_tagq(h, h->metaf); 249 h->metaf = NULL; 250 } 251 252 h->metal = h->metac; 253 h->metac = font; 254 255 switch (font) { 256 case HTMLFONT_ITALIC: 257 h->metaf = print_otag(h, TAG_I, 0, NULL); 258 break; 259 case HTMLFONT_BOLD: 260 h->metaf = print_otag(h, TAG_B, 0, NULL); 261 break; 262 case HTMLFONT_BI: 263 h->metaf = print_otag(h, TAG_B, 0, NULL); 264 print_otag(h, TAG_I, 0, NULL); 265 break; 266 default: 267 break; 268 } 269 } 270 271 int 272 html_strlen(const char *cp) 273 { 274 size_t rsz; 275 int skip, sz; 276 277 /* 278 * Account for escaped sequences within string length 279 * calculations. This follows the logic in term_strlen() as we 280 * must calculate the width of produced strings. 281 * Assume that characters are always width of "1". This is 282 * hacky, but it gets the job done for approximation of widths. 283 */ 284 285 sz = 0; 286 skip = 0; 287 while (1) { 288 rsz = strcspn(cp, "\\"); 289 if (rsz) { 290 cp += rsz; 291 if (skip) { 292 skip = 0; 293 rsz--; 294 } 295 sz += rsz; 296 } 297 if ('\0' == *cp) 298 break; 299 cp++; 300 switch (mandoc_escape(&cp, NULL, NULL)) { 301 case ESCAPE_ERROR: 302 return(sz); 303 case ESCAPE_UNICODE: 304 /* FALLTHROUGH */ 305 case ESCAPE_NUMBERED: 306 /* FALLTHROUGH */ 307 case ESCAPE_SPECIAL: 308 if (skip) 309 skip = 0; 310 else 311 sz++; 312 break; 313 case ESCAPE_SKIPCHAR: 314 skip = 1; 315 break; 316 default: 317 break; 318 } 319 } 320 return(sz); 321 } 322 323 static int 324 print_escape(char c) 325 { 326 327 switch (c) { 328 case '<': 329 printf("<"); 330 break; 331 case '>': 332 printf(">"); 333 break; 334 case '&': 335 printf("&"); 336 break; 337 case '"': 338 printf("""); 339 break; 340 case ASCII_NBRSP: 341 putchar('-'); 342 break; 343 case ASCII_HYPH: 344 putchar('-'); 345 /* FALLTHROUGH */ 346 case ASCII_BREAK: 347 break; 348 default: 349 return(0); 350 } 351 return(1); 352 } 353 354 static int 355 print_encode(struct html *h, const char *p, int norecurse) 356 { 357 size_t sz; 358 int c, len, nospace; 359 const char *seq; 360 enum mandoc_esc esc; 361 static const char rejs[9] = { '\\', '<', '>', '&', '"', 362 ASCII_NBRSP, ASCII_HYPH, ASCII_BREAK, '\0' }; 363 364 nospace = 0; 365 366 while ('\0' != *p) { 367 if (HTML_SKIPCHAR & h->flags && '\\' != *p) { 368 h->flags &= ~HTML_SKIPCHAR; 369 p++; 370 continue; 371 } 372 373 sz = strcspn(p, rejs); 374 375 fwrite(p, 1, sz, stdout); 376 p += (int)sz; 377 378 if ('\0' == *p) 379 break; 380 381 if (print_escape(*p++)) 382 continue; 383 384 esc = mandoc_escape(&p, &seq, &len); 385 if (ESCAPE_ERROR == esc) 386 break; 387 388 switch (esc) { 389 case ESCAPE_FONT: 390 /* FALLTHROUGH */ 391 case ESCAPE_FONTPREV: 392 /* FALLTHROUGH */ 393 case ESCAPE_FONTBOLD: 394 /* FALLTHROUGH */ 395 case ESCAPE_FONTITALIC: 396 /* FALLTHROUGH */ 397 case ESCAPE_FONTBI: 398 /* FALLTHROUGH */ 399 case ESCAPE_FONTROMAN: 400 if (0 == norecurse) 401 print_metaf(h, esc); 402 continue; 403 case ESCAPE_SKIPCHAR: 404 h->flags |= HTML_SKIPCHAR; 405 continue; 406 default: 407 break; 408 } 409 410 if (h->flags & HTML_SKIPCHAR) { 411 h->flags &= ~HTML_SKIPCHAR; 412 continue; 413 } 414 415 switch (esc) { 416 case ESCAPE_UNICODE: 417 /* Skip past "u" header. */ 418 c = mchars_num2uc(seq + 1, len - 1); 419 break; 420 case ESCAPE_NUMBERED: 421 c = mchars_num2char(seq, len); 422 if (c < 0) 423 continue; 424 break; 425 case ESCAPE_SPECIAL: 426 c = mchars_spec2cp(h->symtab, seq, len); 427 if (c <= 0) 428 continue; 429 break; 430 case ESCAPE_NOSPACE: 431 if ('\0' == *p) 432 nospace = 1; 433 continue; 434 default: 435 continue; 436 } 437 if ((c < 0x20 && c != 0x09) || 438 (c > 0x7E && c < 0xA0)) 439 c = 0xFFFD; 440 if (c > 0x7E) 441 printf("&#%d;", c); 442 else if ( ! print_escape(c)) 443 putchar(c); 444 } 445 446 return(nospace); 447 } 448 449 static void 450 print_attr(struct html *h, const char *key, const char *val) 451 { 452 printf(" %s=\"", key); 453 (void)print_encode(h, val, 1); 454 putchar('\"'); 455 } 456 457 struct tag * 458 print_otag(struct html *h, enum htmltag tag, 459 int sz, const struct htmlpair *p) 460 { 461 int i; 462 struct tag *t; 463 464 /* Push this tags onto the stack of open scopes. */ 465 466 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) { 467 t = mandoc_malloc(sizeof(struct tag)); 468 t->tag = tag; 469 t->next = h->tags.head; 470 h->tags.head = t; 471 } else 472 t = NULL; 473 474 if ( ! (HTML_NOSPACE & h->flags)) 475 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) { 476 /* Manage keeps! */ 477 if ( ! (HTML_KEEP & h->flags)) { 478 if (HTML_PREKEEP & h->flags) 479 h->flags |= HTML_KEEP; 480 putchar(' '); 481 } else 482 printf(" "); 483 } 484 485 if ( ! (h->flags & HTML_NONOSPACE)) 486 h->flags &= ~HTML_NOSPACE; 487 else 488 h->flags |= HTML_NOSPACE; 489 490 /* Print out the tag name and attributes. */ 491 492 printf("<%s", htmltags[tag].name); 493 for (i = 0; i < sz; i++) 494 print_attr(h, htmlattrs[p[i].key], p[i].val); 495 496 /* Accommodate for "well-formed" singleton escaping. */ 497 498 if (HTML_AUTOCLOSE & htmltags[tag].flags) 499 putchar('/'); 500 501 putchar('>'); 502 503 h->flags |= HTML_NOSPACE; 504 505 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags) 506 putchar('\n'); 507 508 return(t); 509 } 510 511 static void 512 print_ctag(struct html *h, enum htmltag tag) 513 { 514 515 printf("</%s>", htmltags[tag].name); 516 if (HTML_CLRLINE & htmltags[tag].flags) { 517 h->flags |= HTML_NOSPACE; 518 putchar('\n'); 519 } 520 } 521 522 void 523 print_gen_decls(struct html *h) 524 { 525 526 puts("<!DOCTYPE html>"); 527 } 528 529 void 530 print_text(struct html *h, const char *word) 531 { 532 533 if ( ! (HTML_NOSPACE & h->flags)) { 534 /* Manage keeps! */ 535 if ( ! (HTML_KEEP & h->flags)) { 536 if (HTML_PREKEEP & h->flags) 537 h->flags |= HTML_KEEP; 538 putchar(' '); 539 } else 540 printf(" "); 541 } 542 543 assert(NULL == h->metaf); 544 switch (h->metac) { 545 case HTMLFONT_ITALIC: 546 h->metaf = print_otag(h, TAG_I, 0, NULL); 547 break; 548 case HTMLFONT_BOLD: 549 h->metaf = print_otag(h, TAG_B, 0, NULL); 550 break; 551 case HTMLFONT_BI: 552 h->metaf = print_otag(h, TAG_B, 0, NULL); 553 print_otag(h, TAG_I, 0, NULL); 554 break; 555 default: 556 break; 557 } 558 559 assert(word); 560 if ( ! print_encode(h, word, 0)) { 561 if ( ! (h->flags & HTML_NONOSPACE)) 562 h->flags &= ~HTML_NOSPACE; 563 h->flags &= ~HTML_NONEWLINE; 564 } else 565 h->flags |= HTML_NOSPACE | HTML_NONEWLINE; 566 567 if (h->metaf) { 568 print_tagq(h, h->metaf); 569 h->metaf = NULL; 570 } 571 572 h->flags &= ~HTML_IGNDELIM; 573 } 574 575 void 576 print_tagq(struct html *h, const struct tag *until) 577 { 578 struct tag *tag; 579 580 while ((tag = h->tags.head) != NULL) { 581 /* 582 * Remember to close out and nullify the current 583 * meta-font and table, if applicable. 584 */ 585 if (tag == h->metaf) 586 h->metaf = NULL; 587 if (tag == h->tblt) 588 h->tblt = NULL; 589 print_ctag(h, tag->tag); 590 h->tags.head = tag->next; 591 free(tag); 592 if (until && tag == until) 593 return; 594 } 595 } 596 597 void 598 print_stagq(struct html *h, const struct tag *suntil) 599 { 600 struct tag *tag; 601 602 while ((tag = h->tags.head) != NULL) { 603 if (suntil && tag == suntil) 604 return; 605 /* 606 * Remember to close out and nullify the current 607 * meta-font and table, if applicable. 608 */ 609 if (tag == h->metaf) 610 h->metaf = NULL; 611 if (tag == h->tblt) 612 h->tblt = NULL; 613 print_ctag(h, tag->tag); 614 h->tags.head = tag->next; 615 free(tag); 616 } 617 } 618 619 void 620 print_paragraph(struct html *h) 621 { 622 struct tag *t; 623 struct htmlpair tag; 624 625 PAIR_CLASS_INIT(&tag, "spacer"); 626 t = print_otag(h, TAG_DIV, 1, &tag); 627 print_tagq(h, t); 628 } 629 630 631 void 632 bufinit(struct html *h) 633 { 634 635 h->buf[0] = '\0'; 636 h->buflen = 0; 637 } 638 639 void 640 bufcat_style(struct html *h, const char *key, const char *val) 641 { 642 643 bufcat(h, key); 644 bufcat(h, ":"); 645 bufcat(h, val); 646 bufcat(h, ";"); 647 } 648 649 void 650 bufcat(struct html *h, const char *p) 651 { 652 653 /* 654 * XXX This is broken and not easy to fix. 655 * When using the -Oincludes option, buffmt_includes() 656 * may pass in strings overrunning BUFSIZ, causing a crash. 657 */ 658 659 h->buflen = strlcat(h->buf, p, BUFSIZ); 660 assert(h->buflen < BUFSIZ); 661 } 662 663 void 664 bufcat_fmt(struct html *h, const char *fmt, ...) 665 { 666 va_list ap; 667 668 va_start(ap, fmt); 669 (void)vsnprintf(h->buf + (int)h->buflen, 670 BUFSIZ - h->buflen - 1, fmt, ap); 671 va_end(ap); 672 h->buflen = strlen(h->buf); 673 } 674 675 static void 676 bufncat(struct html *h, const char *p, size_t sz) 677 { 678 679 assert(h->buflen + sz + 1 < BUFSIZ); 680 strncat(h->buf, p, sz); 681 h->buflen += sz; 682 } 683 684 void 685 buffmt_includes(struct html *h, const char *name) 686 { 687 const char *p, *pp; 688 689 pp = h->base_includes; 690 691 bufinit(h); 692 while (NULL != (p = strchr(pp, '%'))) { 693 bufncat(h, pp, (size_t)(p - pp)); 694 switch (*(p + 1)) { 695 case'I': 696 bufcat(h, name); 697 break; 698 default: 699 bufncat(h, p, 2); 700 break; 701 } 702 pp = p + 2; 703 } 704 if (pp) 705 bufcat(h, pp); 706 } 707 708 void 709 buffmt_man(struct html *h, const char *name, const char *sec) 710 { 711 const char *p, *pp; 712 713 pp = h->base_man; 714 715 bufinit(h); 716 while (NULL != (p = strchr(pp, '%'))) { 717 bufncat(h, pp, (size_t)(p - pp)); 718 switch (*(p + 1)) { 719 case 'S': 720 bufcat(h, sec ? sec : "1"); 721 break; 722 case 'N': 723 bufcat_fmt(h, "%s", name); 724 break; 725 default: 726 bufncat(h, p, 2); 727 break; 728 } 729 pp = p + 2; 730 } 731 if (pp) 732 bufcat(h, pp); 733 } 734 735 void 736 bufcat_su(struct html *h, const char *p, const struct roffsu *su) 737 { 738 double v; 739 740 v = su->scale; 741 if (SCALE_MM == su->unit && 0.0 == (v /= 100.0)) 742 v = 1.0; 743 else if (SCALE_BU == su->unit) 744 v /= 24.0; 745 746 bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]); 747 } 748 749 void 750 bufcat_id(struct html *h, const char *src) 751 { 752 753 /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */ 754 755 while ('\0' != *src) 756 bufcat_fmt(h, "%.2x", *src++); 757 } 758