1 /* $Id: html.c,v 1.36 2014/04/23 16:07:06 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <stdarg.h> 23 #include <stdio.h> 24 #include <stdint.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #include <unistd.h> 28 29 #include "mandoc.h" 30 #include "mandoc_aux.h" 31 #include "libmandoc.h" 32 #include "out.h" 33 #include "html.h" 34 #include "main.h" 35 36 struct htmldata { 37 const char *name; 38 int flags; 39 #define HTML_CLRLINE (1 << 0) 40 #define HTML_NOSTACK (1 << 1) 41 #define HTML_AUTOCLOSE (1 << 2) /* Tag has auto-closure. */ 42 }; 43 44 static const struct htmldata htmltags[TAG_MAX] = { 45 {"html", HTML_CLRLINE}, /* TAG_HTML */ 46 {"head", HTML_CLRLINE}, /* TAG_HEAD */ 47 {"body", HTML_CLRLINE}, /* TAG_BODY */ 48 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */ 49 {"title", HTML_CLRLINE}, /* TAG_TITLE */ 50 {"div", HTML_CLRLINE}, /* TAG_DIV */ 51 {"h1", 0}, /* TAG_H1 */ 52 {"h2", 0}, /* TAG_H2 */ 53 {"span", 0}, /* TAG_SPAN */ 54 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */ 55 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */ 56 {"a", 0}, /* TAG_A */ 57 {"table", HTML_CLRLINE}, /* TAG_TABLE */ 58 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */ 59 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */ 60 {"tr", HTML_CLRLINE}, /* TAG_TR */ 61 {"td", HTML_CLRLINE}, /* TAG_TD */ 62 {"li", HTML_CLRLINE}, /* TAG_LI */ 63 {"ul", HTML_CLRLINE}, /* TAG_UL */ 64 {"ol", HTML_CLRLINE}, /* TAG_OL */ 65 {"dl", HTML_CLRLINE}, /* TAG_DL */ 66 {"dt", HTML_CLRLINE}, /* TAG_DT */ 67 {"dd", HTML_CLRLINE}, /* TAG_DD */ 68 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */ 69 {"p", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */ 70 {"pre", HTML_CLRLINE }, /* TAG_PRE */ 71 {"b", 0 }, /* TAG_B */ 72 {"i", 0 }, /* TAG_I */ 73 {"code", 0 }, /* TAG_CODE */ 74 {"small", 0 }, /* TAG_SMALL */ 75 }; 76 77 static const char *const htmlattrs[ATTR_MAX] = { 78 "http-equiv", /* ATTR_HTTPEQUIV */ 79 "content", /* ATTR_CONTENT */ 80 "name", /* ATTR_NAME */ 81 "rel", /* ATTR_REL */ 82 "href", /* ATTR_HREF */ 83 "type", /* ATTR_TYPE */ 84 "media", /* ATTR_MEDIA */ 85 "class", /* ATTR_CLASS */ 86 "style", /* ATTR_STYLE */ 87 "width", /* ATTR_WIDTH */ 88 "id", /* ATTR_ID */ 89 "summary", /* ATTR_SUMMARY */ 90 "align", /* ATTR_ALIGN */ 91 "colspan", /* ATTR_COLSPAN */ 92 }; 93 94 static const char *const roffscales[SCALE_MAX] = { 95 "cm", /* SCALE_CM */ 96 "in", /* SCALE_IN */ 97 "pc", /* SCALE_PC */ 98 "pt", /* SCALE_PT */ 99 "em", /* SCALE_EM */ 100 "em", /* SCALE_MM */ 101 "ex", /* SCALE_EN */ 102 "ex", /* SCALE_BU */ 103 "em", /* SCALE_VS */ 104 "ex", /* SCALE_FS */ 105 }; 106 107 static void bufncat(struct html *, const char *, size_t); 108 static void print_ctag(struct html *, enum htmltag); 109 static int print_encode(struct html *, const char *, int); 110 static void print_metaf(struct html *, enum mandoc_esc); 111 static void print_attr(struct html *, const char *, const char *); 112 static void *ml_alloc(char *, enum htmltype); 113 114 115 static void * 116 ml_alloc(char *outopts, enum htmltype type) 117 { 118 struct html *h; 119 const char *toks[5]; 120 char *v; 121 122 toks[0] = "style"; 123 toks[1] = "man"; 124 toks[2] = "includes"; 125 toks[3] = "fragment"; 126 toks[4] = NULL; 127 128 h = mandoc_calloc(1, sizeof(struct html)); 129 130 h->type = type; 131 h->tags.head = NULL; 132 h->symtab = mchars_alloc(); 133 134 while (outopts && *outopts) 135 switch (getsubopt(&outopts, UNCONST(toks), &v)) { 136 case 0: 137 h->style = v; 138 break; 139 case 1: 140 h->base_man = v; 141 break; 142 case 2: 143 h->base_includes = v; 144 break; 145 case 3: 146 h->oflags |= HTML_FRAGMENT; 147 break; 148 default: 149 break; 150 } 151 152 return(h); 153 } 154 155 void * 156 html_alloc(char *outopts) 157 { 158 159 return(ml_alloc(outopts, HTML_HTML_4_01_STRICT)); 160 } 161 162 void * 163 xhtml_alloc(char *outopts) 164 { 165 166 return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT)); 167 } 168 169 void 170 html_free(void *p) 171 { 172 struct tag *tag; 173 struct html *h; 174 175 h = (struct html *)p; 176 177 while ((tag = h->tags.head) != NULL) { 178 h->tags.head = tag->next; 179 free(tag); 180 } 181 182 if (h->symtab) 183 mchars_free(h->symtab); 184 185 free(h); 186 } 187 188 void 189 print_gen_head(struct html *h) 190 { 191 struct htmlpair tag[4]; 192 193 tag[0].key = ATTR_HTTPEQUIV; 194 tag[0].val = "Content-Type"; 195 tag[1].key = ATTR_CONTENT; 196 tag[1].val = "text/html; charset=utf-8"; 197 print_otag(h, TAG_META, 2, tag); 198 199 tag[0].key = ATTR_NAME; 200 tag[0].val = "resource-type"; 201 tag[1].key = ATTR_CONTENT; 202 tag[1].val = "document"; 203 print_otag(h, TAG_META, 2, tag); 204 205 if (h->style) { 206 tag[0].key = ATTR_REL; 207 tag[0].val = "stylesheet"; 208 tag[1].key = ATTR_HREF; 209 tag[1].val = h->style; 210 tag[2].key = ATTR_TYPE; 211 tag[2].val = "text/css"; 212 tag[3].key = ATTR_MEDIA; 213 tag[3].val = "all"; 214 print_otag(h, TAG_LINK, 4, tag); 215 } 216 } 217 218 static void 219 print_metaf(struct html *h, enum mandoc_esc deco) 220 { 221 enum htmlfont font; 222 223 switch (deco) { 224 case ESCAPE_FONTPREV: 225 font = h->metal; 226 break; 227 case ESCAPE_FONTITALIC: 228 font = HTMLFONT_ITALIC; 229 break; 230 case ESCAPE_FONTBOLD: 231 font = HTMLFONT_BOLD; 232 break; 233 case ESCAPE_FONTBI: 234 font = HTMLFONT_BI; 235 break; 236 case ESCAPE_FONT: 237 /* FALLTHROUGH */ 238 case ESCAPE_FONTROMAN: 239 font = HTMLFONT_NONE; 240 break; 241 default: 242 abort(); 243 /* NOTREACHED */ 244 } 245 246 if (h->metaf) { 247 print_tagq(h, h->metaf); 248 h->metaf = NULL; 249 } 250 251 h->metal = h->metac; 252 h->metac = font; 253 254 switch (font) { 255 case HTMLFONT_ITALIC: 256 h->metaf = print_otag(h, TAG_I, 0, NULL); 257 break; 258 case HTMLFONT_BOLD: 259 h->metaf = print_otag(h, TAG_B, 0, NULL); 260 break; 261 case HTMLFONT_BI: 262 h->metaf = print_otag(h, TAG_B, 0, NULL); 263 print_otag(h, TAG_I, 0, NULL); 264 break; 265 default: 266 break; 267 } 268 } 269 270 int 271 html_strlen(const char *cp) 272 { 273 size_t rsz; 274 int skip, sz; 275 276 /* 277 * Account for escaped sequences within string length 278 * calculations. This follows the logic in term_strlen() as we 279 * must calculate the width of produced strings. 280 * Assume that characters are always width of "1". This is 281 * hacky, but it gets the job done for approximation of widths. 282 */ 283 284 sz = 0; 285 skip = 0; 286 while (1) { 287 rsz = strcspn(cp, "\\"); 288 if (rsz) { 289 cp += rsz; 290 if (skip) { 291 skip = 0; 292 rsz--; 293 } 294 sz += rsz; 295 } 296 if ('\0' == *cp) 297 break; 298 cp++; 299 switch (mandoc_escape(&cp, NULL, NULL)) { 300 case ESCAPE_ERROR: 301 return(sz); 302 case ESCAPE_UNICODE: 303 /* FALLTHROUGH */ 304 case ESCAPE_NUMBERED: 305 /* FALLTHROUGH */ 306 case ESCAPE_SPECIAL: 307 if (skip) 308 skip = 0; 309 else 310 sz++; 311 break; 312 case ESCAPE_SKIPCHAR: 313 skip = 1; 314 break; 315 default: 316 break; 317 } 318 } 319 return(sz); 320 } 321 322 static int 323 print_encode(struct html *h, const char *p, int norecurse) 324 { 325 size_t sz; 326 int c, len, nospace; 327 const char *seq; 328 enum mandoc_esc esc; 329 static const char rejs[8] = { '\\', '<', '>', '&', 330 ASCII_NBRSP, ASCII_HYPH, ASCII_BREAK, '\0' }; 331 332 nospace = 0; 333 334 while ('\0' != *p) { 335 if (HTML_SKIPCHAR & h->flags && '\\' != *p) { 336 h->flags &= ~HTML_SKIPCHAR; 337 p++; 338 continue; 339 } 340 341 sz = strcspn(p, rejs); 342 343 fwrite(p, 1, sz, stdout); 344 p += (int)sz; 345 346 if ('\0' == *p) 347 break; 348 349 switch (*p++) { 350 case '<': 351 printf("<"); 352 continue; 353 case '>': 354 printf(">"); 355 continue; 356 case '&': 357 printf("&"); 358 continue; 359 case ASCII_NBRSP: 360 putchar('-'); 361 continue; 362 case ASCII_HYPH: 363 putchar('-'); 364 /* FALLTHROUGH */ 365 case ASCII_BREAK: 366 continue; 367 default: 368 break; 369 } 370 371 esc = mandoc_escape(&p, &seq, &len); 372 if (ESCAPE_ERROR == esc) 373 break; 374 375 switch (esc) { 376 case ESCAPE_FONT: 377 /* FALLTHROUGH */ 378 case ESCAPE_FONTPREV: 379 /* FALLTHROUGH */ 380 case ESCAPE_FONTBOLD: 381 /* FALLTHROUGH */ 382 case ESCAPE_FONTITALIC: 383 /* FALLTHROUGH */ 384 case ESCAPE_FONTBI: 385 /* FALLTHROUGH */ 386 case ESCAPE_FONTROMAN: 387 if (0 == norecurse) 388 print_metaf(h, esc); 389 continue; 390 case ESCAPE_SKIPCHAR: 391 h->flags |= HTML_SKIPCHAR; 392 continue; 393 default: 394 break; 395 } 396 397 if (h->flags & HTML_SKIPCHAR) { 398 h->flags &= ~HTML_SKIPCHAR; 399 continue; 400 } 401 402 switch (esc) { 403 case ESCAPE_UNICODE: 404 /* Skip passed "u" header. */ 405 c = mchars_num2uc(seq + 1, len - 1); 406 if ('\0' != c) 407 printf("&#x%x;", c); 408 break; 409 case ESCAPE_NUMBERED: 410 c = mchars_num2char(seq, len); 411 if ('\0' != c) 412 putchar(c); 413 break; 414 case ESCAPE_SPECIAL: 415 c = mchars_spec2cp(h->symtab, seq, len); 416 if (c > 0) 417 printf("&#%d;", c); 418 else if (-1 == c && 1 == len) 419 putchar((int)*seq); 420 break; 421 case ESCAPE_NOSPACE: 422 if ('\0' == *p) 423 nospace = 1; 424 break; 425 default: 426 break; 427 } 428 } 429 430 return(nospace); 431 } 432 433 static void 434 print_attr(struct html *h, const char *key, const char *val) 435 { 436 printf(" %s=\"", key); 437 (void)print_encode(h, val, 1); 438 putchar('\"'); 439 } 440 441 struct tag * 442 print_otag(struct html *h, enum htmltag tag, 443 int sz, const struct htmlpair *p) 444 { 445 int i; 446 struct tag *t; 447 448 /* Push this tags onto the stack of open scopes. */ 449 450 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) { 451 t = mandoc_malloc(sizeof(struct tag)); 452 t->tag = tag; 453 t->next = h->tags.head; 454 h->tags.head = t; 455 } else 456 t = NULL; 457 458 if ( ! (HTML_NOSPACE & h->flags)) 459 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) { 460 /* Manage keeps! */ 461 if ( ! (HTML_KEEP & h->flags)) { 462 if (HTML_PREKEEP & h->flags) 463 h->flags |= HTML_KEEP; 464 putchar(' '); 465 } else 466 printf(" "); 467 } 468 469 if ( ! (h->flags & HTML_NONOSPACE)) 470 h->flags &= ~HTML_NOSPACE; 471 else 472 h->flags |= HTML_NOSPACE; 473 474 /* Print out the tag name and attributes. */ 475 476 printf("<%s", htmltags[tag].name); 477 for (i = 0; i < sz; i++) 478 print_attr(h, htmlattrs[p[i].key], p[i].val); 479 480 /* Add non-overridable attributes. */ 481 482 if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) { 483 print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml"); 484 print_attr(h, "xml:lang", "en"); 485 print_attr(h, "lang", "en"); 486 } 487 488 /* Accommodate for XML "well-formed" singleton escaping. */ 489 490 if (HTML_AUTOCLOSE & htmltags[tag].flags) 491 switch (h->type) { 492 case HTML_XHTML_1_0_STRICT: 493 putchar('/'); 494 break; 495 default: 496 break; 497 } 498 499 putchar('>'); 500 501 h->flags |= HTML_NOSPACE; 502 503 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags) 504 putchar('\n'); 505 506 return(t); 507 } 508 509 static void 510 print_ctag(struct html *h, enum htmltag tag) 511 { 512 513 printf("</%s>", htmltags[tag].name); 514 if (HTML_CLRLINE & htmltags[tag].flags) { 515 h->flags |= HTML_NOSPACE; 516 putchar('\n'); 517 } 518 } 519 520 void 521 print_gen_decls(struct html *h) 522 { 523 const char *doctype; 524 const char *dtd; 525 const char *name; 526 527 switch (h->type) { 528 case HTML_HTML_4_01_STRICT: 529 name = "HTML"; 530 doctype = "-//W3C//DTD HTML 4.01//EN"; 531 dtd = "http://www.w3.org/TR/html4/strict.dtd"; 532 break; 533 default: 534 puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); 535 name = "html"; 536 doctype = "-//W3C//DTD XHTML 1.0 Strict//EN"; 537 dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 538 break; 539 } 540 541 printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n", 542 name, doctype, dtd); 543 } 544 545 void 546 print_text(struct html *h, const char *word) 547 { 548 549 if ( ! (HTML_NOSPACE & h->flags)) { 550 /* Manage keeps! */ 551 if ( ! (HTML_KEEP & h->flags)) { 552 if (HTML_PREKEEP & h->flags) 553 h->flags |= HTML_KEEP; 554 putchar(' '); 555 } else 556 printf(" "); 557 } 558 559 assert(NULL == h->metaf); 560 switch (h->metac) { 561 case HTMLFONT_ITALIC: 562 h->metaf = print_otag(h, TAG_I, 0, NULL); 563 break; 564 case HTMLFONT_BOLD: 565 h->metaf = print_otag(h, TAG_B, 0, NULL); 566 break; 567 case HTMLFONT_BI: 568 h->metaf = print_otag(h, TAG_B, 0, NULL); 569 print_otag(h, TAG_I, 0, NULL); 570 break; 571 default: 572 break; 573 } 574 575 assert(word); 576 if ( ! print_encode(h, word, 0)) { 577 if ( ! (h->flags & HTML_NONOSPACE)) 578 h->flags &= ~HTML_NOSPACE; 579 } else 580 h->flags |= HTML_NOSPACE; 581 582 if (h->metaf) { 583 print_tagq(h, h->metaf); 584 h->metaf = NULL; 585 } 586 587 h->flags &= ~HTML_IGNDELIM; 588 } 589 590 void 591 print_tagq(struct html *h, const struct tag *until) 592 { 593 struct tag *tag; 594 595 while ((tag = h->tags.head) != NULL) { 596 /* 597 * Remember to close out and nullify the current 598 * meta-font and table, if applicable. 599 */ 600 if (tag == h->metaf) 601 h->metaf = NULL; 602 if (tag == h->tblt) 603 h->tblt = NULL; 604 print_ctag(h, tag->tag); 605 h->tags.head = tag->next; 606 free(tag); 607 if (until && tag == until) 608 return; 609 } 610 } 611 612 void 613 print_stagq(struct html *h, const struct tag *suntil) 614 { 615 struct tag *tag; 616 617 while ((tag = h->tags.head) != NULL) { 618 if (suntil && tag == suntil) 619 return; 620 /* 621 * Remember to close out and nullify the current 622 * meta-font and table, if applicable. 623 */ 624 if (tag == h->metaf) 625 h->metaf = NULL; 626 if (tag == h->tblt) 627 h->tblt = NULL; 628 print_ctag(h, tag->tag); 629 h->tags.head = tag->next; 630 free(tag); 631 } 632 } 633 634 void 635 bufinit(struct html *h) 636 { 637 638 h->buf[0] = '\0'; 639 h->buflen = 0; 640 } 641 642 void 643 bufcat_style(struct html *h, const char *key, const char *val) 644 { 645 646 bufcat(h, key); 647 bufcat(h, ":"); 648 bufcat(h, val); 649 bufcat(h, ";"); 650 } 651 652 void 653 bufcat(struct html *h, const char *p) 654 { 655 656 /* 657 * XXX This is broken and not easy to fix. 658 * When using the -Oincludes option, buffmt_includes() 659 * may pass in strings overrunning BUFSIZ, causing a crash. 660 */ 661 662 h->buflen = strlcat(h->buf, p, BUFSIZ); 663 assert(h->buflen < BUFSIZ); 664 } 665 666 void 667 bufcat_fmt(struct html *h, const char *fmt, ...) 668 { 669 va_list ap; 670 671 va_start(ap, fmt); 672 (void)vsnprintf(h->buf + (int)h->buflen, 673 BUFSIZ - h->buflen - 1, fmt, ap); 674 va_end(ap); 675 h->buflen = strlen(h->buf); 676 } 677 678 static void 679 bufncat(struct html *h, const char *p, size_t sz) 680 { 681 682 assert(h->buflen + sz + 1 < BUFSIZ); 683 strncat(h->buf, p, sz); 684 h->buflen += sz; 685 } 686 687 void 688 buffmt_includes(struct html *h, const char *name) 689 { 690 const char *p, *pp; 691 692 pp = h->base_includes; 693 694 bufinit(h); 695 while (NULL != (p = strchr(pp, '%'))) { 696 bufncat(h, pp, (size_t)(p - pp)); 697 switch (*(p + 1)) { 698 case'I': 699 bufcat(h, name); 700 break; 701 default: 702 bufncat(h, p, 2); 703 break; 704 } 705 pp = p + 2; 706 } 707 if (pp) 708 bufcat(h, pp); 709 } 710 711 void 712 buffmt_man(struct html *h, const char *name, const char *sec) 713 { 714 const char *p, *pp; 715 716 pp = h->base_man; 717 718 bufinit(h); 719 while (NULL != (p = strchr(pp, '%'))) { 720 bufncat(h, pp, (size_t)(p - pp)); 721 switch (*(p + 1)) { 722 case 'S': 723 bufcat(h, sec ? sec : "1"); 724 break; 725 case 'N': 726 bufcat_fmt(h, "%s", name); 727 break; 728 default: 729 bufncat(h, p, 2); 730 break; 731 } 732 pp = p + 2; 733 } 734 if (pp) 735 bufcat(h, pp); 736 } 737 738 void 739 bufcat_su(struct html *h, const char *p, const struct roffsu *su) 740 { 741 double v; 742 743 v = su->scale; 744 if (SCALE_MM == su->unit && 0.0 == (v /= 100.0)) 745 v = 1.0; 746 747 bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]); 748 } 749 750 void 751 bufcat_id(struct html *h, const char *src) 752 { 753 754 /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */ 755 756 while ('\0' != *src) 757 bufcat_fmt(h, "%.2x", *src++); 758 } 759