1 /* $OpenBSD: html.c,v 1.62 2015/12/25 20:43:04 bentley Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <stdarg.h> 23 #include <stdio.h> 24 #include <stdint.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #include <unistd.h> 28 29 #include "mandoc.h" 30 #include "mandoc_aux.h" 31 #include "out.h" 32 #include "html.h" 33 #include "manconf.h" 34 #include "main.h" 35 36 struct htmldata { 37 const char *name; 38 int flags; 39 #define HTML_CLRLINE (1 << 0) 40 #define HTML_NOSTACK (1 << 1) 41 #define HTML_AUTOCLOSE (1 << 2) /* Tag has auto-closure. */ 42 }; 43 44 static const struct htmldata htmltags[TAG_MAX] = { 45 {"html", HTML_CLRLINE}, /* TAG_HTML */ 46 {"head", HTML_CLRLINE}, /* TAG_HEAD */ 47 {"body", HTML_CLRLINE}, /* TAG_BODY */ 48 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */ 49 {"title", HTML_CLRLINE}, /* TAG_TITLE */ 50 {"div", HTML_CLRLINE}, /* TAG_DIV */ 51 {"h1", 0}, /* TAG_H1 */ 52 {"h2", 0}, /* TAG_H2 */ 53 {"span", 0}, /* TAG_SPAN */ 54 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */ 55 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */ 56 {"a", 0}, /* TAG_A */ 57 {"table", HTML_CLRLINE}, /* TAG_TABLE */ 58 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */ 59 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */ 60 {"tr", HTML_CLRLINE}, /* TAG_TR */ 61 {"td", HTML_CLRLINE}, /* TAG_TD */ 62 {"li", HTML_CLRLINE}, /* TAG_LI */ 63 {"ul", HTML_CLRLINE}, /* TAG_UL */ 64 {"ol", HTML_CLRLINE}, /* TAG_OL */ 65 {"dl", HTML_CLRLINE}, /* TAG_DL */ 66 {"dt", HTML_CLRLINE}, /* TAG_DT */ 67 {"dd", HTML_CLRLINE}, /* TAG_DD */ 68 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */ 69 {"pre", HTML_CLRLINE }, /* TAG_PRE */ 70 {"b", 0 }, /* TAG_B */ 71 {"i", 0 }, /* TAG_I */ 72 {"code", 0 }, /* TAG_CODE */ 73 {"small", 0 }, /* TAG_SMALL */ 74 {"style", HTML_CLRLINE}, /* TAG_STYLE */ 75 {"math", HTML_CLRLINE}, /* TAG_MATH */ 76 {"mrow", 0}, /* TAG_MROW */ 77 {"mi", 0}, /* TAG_MI */ 78 {"mo", 0}, /* TAG_MO */ 79 {"msup", 0}, /* TAG_MSUP */ 80 {"msub", 0}, /* TAG_MSUB */ 81 {"msubsup", 0}, /* TAG_MSUBSUP */ 82 {"mfrac", 0}, /* TAG_MFRAC */ 83 {"msqrt", 0}, /* TAG_MSQRT */ 84 {"mfenced", 0}, /* TAG_MFENCED */ 85 {"mtable", 0}, /* TAG_MTABLE */ 86 {"mtr", 0}, /* TAG_MTR */ 87 {"mtd", 0}, /* TAG_MTD */ 88 {"munderover", 0}, /* TAG_MUNDEROVER */ 89 {"munder", 0}, /* TAG_MUNDER*/ 90 {"mover", 0}, /* TAG_MOVER*/ 91 }; 92 93 static const char *const htmlattrs[ATTR_MAX] = { 94 "name", /* ATTR_NAME */ 95 "rel", /* ATTR_REL */ 96 "href", /* ATTR_HREF */ 97 "type", /* ATTR_TYPE */ 98 "media", /* ATTR_MEDIA */ 99 "class", /* ATTR_CLASS */ 100 "style", /* ATTR_STYLE */ 101 "id", /* ATTR_ID */ 102 "colspan", /* ATTR_COLSPAN */ 103 "charset", /* ATTR_CHARSET */ 104 "open", /* ATTR_OPEN */ 105 "close", /* ATTR_CLOSE */ 106 "mathvariant", /* ATTR_MATHVARIANT */ 107 }; 108 109 static const char *const roffscales[SCALE_MAX] = { 110 "cm", /* SCALE_CM */ 111 "in", /* SCALE_IN */ 112 "pc", /* SCALE_PC */ 113 "pt", /* SCALE_PT */ 114 "em", /* SCALE_EM */ 115 "em", /* SCALE_MM */ 116 "ex", /* SCALE_EN */ 117 "ex", /* SCALE_BU */ 118 "em", /* SCALE_VS */ 119 "ex", /* SCALE_FS */ 120 }; 121 122 static void bufncat(struct html *, const char *, size_t); 123 static void print_ctag(struct html *, struct tag *); 124 static int print_escape(char); 125 static int print_encode(struct html *, const char *, int); 126 static void print_metaf(struct html *, enum mandoc_esc); 127 static void print_attr(struct html *, const char *, const char *); 128 129 130 void * 131 html_alloc(const struct manoutput *outopts) 132 { 133 struct html *h; 134 135 h = mandoc_calloc(1, sizeof(struct html)); 136 137 h->tags.head = NULL; 138 h->style = outopts->style; 139 h->base_man = outopts->man; 140 h->base_includes = outopts->includes; 141 if (outopts->fragment) 142 h->oflags |= HTML_FRAGMENT; 143 144 return h; 145 } 146 147 void 148 html_free(void *p) 149 { 150 struct tag *tag; 151 struct html *h; 152 153 h = (struct html *)p; 154 155 while ((tag = h->tags.head) != NULL) { 156 h->tags.head = tag->next; 157 free(tag); 158 } 159 160 free(h); 161 } 162 163 void 164 print_gen_head(struct html *h) 165 { 166 struct htmlpair tag[4]; 167 struct tag *t; 168 169 tag[0].key = ATTR_CHARSET; 170 tag[0].val = "utf-8"; 171 print_otag(h, TAG_META, 1, tag); 172 173 /* 174 * Print a default style-sheet. 175 */ 176 t = print_otag(h, TAG_STYLE, 0, NULL); 177 print_text(h, "table.head, table.foot { width: 100%; }\n" 178 "td.head-rtitle, td.foot-os { text-align: right; }\n" 179 "td.head-vol { text-align: center; }\n" 180 "table.foot td { width: 50%; }\n" 181 "table.head td { width: 33%; }\n" 182 "div.spacer { margin: 1em 0; }\n"); 183 print_tagq(h, t); 184 185 if (h->style) { 186 tag[0].key = ATTR_REL; 187 tag[0].val = "stylesheet"; 188 tag[1].key = ATTR_HREF; 189 tag[1].val = h->style; 190 tag[2].key = ATTR_TYPE; 191 tag[2].val = "text/css"; 192 tag[3].key = ATTR_MEDIA; 193 tag[3].val = "all"; 194 print_otag(h, TAG_LINK, 4, tag); 195 } 196 } 197 198 static void 199 print_metaf(struct html *h, enum mandoc_esc deco) 200 { 201 enum htmlfont font; 202 203 switch (deco) { 204 case ESCAPE_FONTPREV: 205 font = h->metal; 206 break; 207 case ESCAPE_FONTITALIC: 208 font = HTMLFONT_ITALIC; 209 break; 210 case ESCAPE_FONTBOLD: 211 font = HTMLFONT_BOLD; 212 break; 213 case ESCAPE_FONTBI: 214 font = HTMLFONT_BI; 215 break; 216 case ESCAPE_FONT: 217 case ESCAPE_FONTROMAN: 218 font = HTMLFONT_NONE; 219 break; 220 default: 221 abort(); 222 } 223 224 if (h->metaf) { 225 print_tagq(h, h->metaf); 226 h->metaf = NULL; 227 } 228 229 h->metal = h->metac; 230 h->metac = font; 231 232 switch (font) { 233 case HTMLFONT_ITALIC: 234 h->metaf = print_otag(h, TAG_I, 0, NULL); 235 break; 236 case HTMLFONT_BOLD: 237 h->metaf = print_otag(h, TAG_B, 0, NULL); 238 break; 239 case HTMLFONT_BI: 240 h->metaf = print_otag(h, TAG_B, 0, NULL); 241 print_otag(h, TAG_I, 0, NULL); 242 break; 243 default: 244 break; 245 } 246 } 247 248 int 249 html_strlen(const char *cp) 250 { 251 size_t rsz; 252 int skip, sz; 253 254 /* 255 * Account for escaped sequences within string length 256 * calculations. This follows the logic in term_strlen() as we 257 * must calculate the width of produced strings. 258 * Assume that characters are always width of "1". This is 259 * hacky, but it gets the job done for approximation of widths. 260 */ 261 262 sz = 0; 263 skip = 0; 264 while (1) { 265 rsz = strcspn(cp, "\\"); 266 if (rsz) { 267 cp += rsz; 268 if (skip) { 269 skip = 0; 270 rsz--; 271 } 272 sz += rsz; 273 } 274 if ('\0' == *cp) 275 break; 276 cp++; 277 switch (mandoc_escape(&cp, NULL, NULL)) { 278 case ESCAPE_ERROR: 279 return sz; 280 case ESCAPE_UNICODE: 281 case ESCAPE_NUMBERED: 282 case ESCAPE_SPECIAL: 283 case ESCAPE_OVERSTRIKE: 284 if (skip) 285 skip = 0; 286 else 287 sz++; 288 break; 289 case ESCAPE_SKIPCHAR: 290 skip = 1; 291 break; 292 default: 293 break; 294 } 295 } 296 return sz; 297 } 298 299 static int 300 print_escape(char c) 301 { 302 303 switch (c) { 304 case '<': 305 printf("<"); 306 break; 307 case '>': 308 printf(">"); 309 break; 310 case '&': 311 printf("&"); 312 break; 313 case '"': 314 printf("""); 315 break; 316 case ASCII_NBRSP: 317 printf(" "); 318 break; 319 case ASCII_HYPH: 320 putchar('-'); 321 break; 322 case ASCII_BREAK: 323 break; 324 default: 325 return 0; 326 } 327 return 1; 328 } 329 330 static int 331 print_encode(struct html *h, const char *p, int norecurse) 332 { 333 size_t sz; 334 int c, len, nospace; 335 const char *seq; 336 enum mandoc_esc esc; 337 static const char rejs[9] = { '\\', '<', '>', '&', '"', 338 ASCII_NBRSP, ASCII_HYPH, ASCII_BREAK, '\0' }; 339 340 nospace = 0; 341 342 while ('\0' != *p) { 343 if (HTML_SKIPCHAR & h->flags && '\\' != *p) { 344 h->flags &= ~HTML_SKIPCHAR; 345 p++; 346 continue; 347 } 348 349 sz = strcspn(p, rejs); 350 351 fwrite(p, 1, sz, stdout); 352 p += (int)sz; 353 354 if ('\0' == *p) 355 break; 356 357 if (print_escape(*p++)) 358 continue; 359 360 esc = mandoc_escape(&p, &seq, &len); 361 if (ESCAPE_ERROR == esc) 362 break; 363 364 switch (esc) { 365 case ESCAPE_FONT: 366 case ESCAPE_FONTPREV: 367 case ESCAPE_FONTBOLD: 368 case ESCAPE_FONTITALIC: 369 case ESCAPE_FONTBI: 370 case ESCAPE_FONTROMAN: 371 if (0 == norecurse) 372 print_metaf(h, esc); 373 continue; 374 case ESCAPE_SKIPCHAR: 375 h->flags |= HTML_SKIPCHAR; 376 continue; 377 default: 378 break; 379 } 380 381 if (h->flags & HTML_SKIPCHAR) { 382 h->flags &= ~HTML_SKIPCHAR; 383 continue; 384 } 385 386 switch (esc) { 387 case ESCAPE_UNICODE: 388 /* Skip past "u" header. */ 389 c = mchars_num2uc(seq + 1, len - 1); 390 break; 391 case ESCAPE_NUMBERED: 392 c = mchars_num2char(seq, len); 393 if (c < 0) 394 continue; 395 break; 396 case ESCAPE_SPECIAL: 397 c = mchars_spec2cp(seq, len); 398 if (c <= 0) 399 continue; 400 break; 401 case ESCAPE_NOSPACE: 402 if ('\0' == *p) 403 nospace = 1; 404 continue; 405 case ESCAPE_OVERSTRIKE: 406 if (len == 0) 407 continue; 408 c = seq[len - 1]; 409 break; 410 default: 411 continue; 412 } 413 if ((c < 0x20 && c != 0x09) || 414 (c > 0x7E && c < 0xA0)) 415 c = 0xFFFD; 416 if (c > 0x7E) 417 printf("&#%d;", c); 418 else if ( ! print_escape(c)) 419 putchar(c); 420 } 421 422 return nospace; 423 } 424 425 static void 426 print_attr(struct html *h, const char *key, const char *val) 427 { 428 printf(" %s=\"", key); 429 (void)print_encode(h, val, 1); 430 putchar('\"'); 431 } 432 433 struct tag * 434 print_otag(struct html *h, enum htmltag tag, 435 int sz, const struct htmlpair *p) 436 { 437 int i; 438 struct tag *t; 439 440 /* Push this tags onto the stack of open scopes. */ 441 442 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) { 443 t = mandoc_malloc(sizeof(struct tag)); 444 t->tag = tag; 445 t->next = h->tags.head; 446 h->tags.head = t; 447 } else 448 t = NULL; 449 450 if ( ! (HTML_NOSPACE & h->flags)) 451 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) { 452 /* Manage keeps! */ 453 if ( ! (HTML_KEEP & h->flags)) { 454 if (HTML_PREKEEP & h->flags) 455 h->flags |= HTML_KEEP; 456 putchar(' '); 457 } else 458 printf(" "); 459 } 460 461 if ( ! (h->flags & HTML_NONOSPACE)) 462 h->flags &= ~HTML_NOSPACE; 463 else 464 h->flags |= HTML_NOSPACE; 465 466 /* Print out the tag name and attributes. */ 467 468 printf("<%s", htmltags[tag].name); 469 for (i = 0; i < sz; i++) 470 print_attr(h, htmlattrs[p[i].key], p[i].val); 471 472 /* Accommodate for "well-formed" singleton escaping. */ 473 474 if (HTML_AUTOCLOSE & htmltags[tag].flags) 475 putchar('/'); 476 477 putchar('>'); 478 479 h->flags |= HTML_NOSPACE; 480 481 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags) 482 putchar('\n'); 483 484 return t; 485 } 486 487 static void 488 print_ctag(struct html *h, struct tag *tag) 489 { 490 491 /* 492 * Remember to close out and nullify the current 493 * meta-font and table, if applicable. 494 */ 495 if (tag == h->metaf) 496 h->metaf = NULL; 497 if (tag == h->tblt) 498 h->tblt = NULL; 499 500 printf("</%s>", htmltags[tag->tag].name); 501 if (HTML_CLRLINE & htmltags[tag->tag].flags) { 502 h->flags |= HTML_NOSPACE; 503 putchar('\n'); 504 } 505 506 h->tags.head = tag->next; 507 free(tag); 508 } 509 510 void 511 print_gen_decls(struct html *h) 512 { 513 514 puts("<!DOCTYPE html>"); 515 } 516 517 void 518 print_text(struct html *h, const char *word) 519 { 520 521 if ( ! (HTML_NOSPACE & h->flags)) { 522 /* Manage keeps! */ 523 if ( ! (HTML_KEEP & h->flags)) { 524 if (HTML_PREKEEP & h->flags) 525 h->flags |= HTML_KEEP; 526 putchar(' '); 527 } else 528 printf(" "); 529 } 530 531 assert(NULL == h->metaf); 532 switch (h->metac) { 533 case HTMLFONT_ITALIC: 534 h->metaf = print_otag(h, TAG_I, 0, NULL); 535 break; 536 case HTMLFONT_BOLD: 537 h->metaf = print_otag(h, TAG_B, 0, NULL); 538 break; 539 case HTMLFONT_BI: 540 h->metaf = print_otag(h, TAG_B, 0, NULL); 541 print_otag(h, TAG_I, 0, NULL); 542 break; 543 default: 544 break; 545 } 546 547 assert(word); 548 if ( ! print_encode(h, word, 0)) { 549 if ( ! (h->flags & HTML_NONOSPACE)) 550 h->flags &= ~HTML_NOSPACE; 551 h->flags &= ~HTML_NONEWLINE; 552 } else 553 h->flags |= HTML_NOSPACE | HTML_NONEWLINE; 554 555 if (h->metaf) { 556 print_tagq(h, h->metaf); 557 h->metaf = NULL; 558 } 559 560 h->flags &= ~HTML_IGNDELIM; 561 } 562 563 void 564 print_tagq(struct html *h, const struct tag *until) 565 { 566 struct tag *tag; 567 568 while ((tag = h->tags.head) != NULL) { 569 print_ctag(h, tag); 570 if (until && tag == until) 571 return; 572 } 573 } 574 575 void 576 print_stagq(struct html *h, const struct tag *suntil) 577 { 578 struct tag *tag; 579 580 while ((tag = h->tags.head) != NULL) { 581 if (suntil && tag == suntil) 582 return; 583 print_ctag(h, tag); 584 } 585 } 586 587 void 588 print_paragraph(struct html *h) 589 { 590 struct tag *t; 591 struct htmlpair tag; 592 593 PAIR_CLASS_INIT(&tag, "spacer"); 594 t = print_otag(h, TAG_DIV, 1, &tag); 595 print_tagq(h, t); 596 } 597 598 599 void 600 bufinit(struct html *h) 601 { 602 603 h->buf[0] = '\0'; 604 h->buflen = 0; 605 } 606 607 void 608 bufcat_style(struct html *h, const char *key, const char *val) 609 { 610 611 bufcat(h, key); 612 bufcat(h, ":"); 613 bufcat(h, val); 614 bufcat(h, ";"); 615 } 616 617 void 618 bufcat(struct html *h, const char *p) 619 { 620 621 /* 622 * XXX This is broken and not easy to fix. 623 * When using the -Oincludes option, buffmt_includes() 624 * may pass in strings overrunning BUFSIZ, causing a crash. 625 */ 626 627 h->buflen = strlcat(h->buf, p, BUFSIZ); 628 assert(h->buflen < BUFSIZ); 629 } 630 631 void 632 bufcat_fmt(struct html *h, const char *fmt, ...) 633 { 634 va_list ap; 635 636 va_start(ap, fmt); 637 (void)vsnprintf(h->buf + (int)h->buflen, 638 BUFSIZ - h->buflen - 1, fmt, ap); 639 va_end(ap); 640 h->buflen = strlen(h->buf); 641 } 642 643 static void 644 bufncat(struct html *h, const char *p, size_t sz) 645 { 646 647 assert(h->buflen + sz + 1 < BUFSIZ); 648 strncat(h->buf, p, sz); 649 h->buflen += sz; 650 } 651 652 void 653 buffmt_includes(struct html *h, const char *name) 654 { 655 const char *p, *pp; 656 657 pp = h->base_includes; 658 659 bufinit(h); 660 while (NULL != (p = strchr(pp, '%'))) { 661 bufncat(h, pp, (size_t)(p - pp)); 662 switch (*(p + 1)) { 663 case'I': 664 bufcat(h, name); 665 break; 666 default: 667 bufncat(h, p, 2); 668 break; 669 } 670 pp = p + 2; 671 } 672 if (pp) 673 bufcat(h, pp); 674 } 675 676 void 677 buffmt_man(struct html *h, const char *name, const char *sec) 678 { 679 const char *p, *pp; 680 681 pp = h->base_man; 682 683 bufinit(h); 684 while (NULL != (p = strchr(pp, '%'))) { 685 bufncat(h, pp, (size_t)(p - pp)); 686 switch (*(p + 1)) { 687 case 'S': 688 bufcat(h, sec ? sec : "1"); 689 break; 690 case 'N': 691 bufcat_fmt(h, "%s", name); 692 break; 693 default: 694 bufncat(h, p, 2); 695 break; 696 } 697 pp = p + 2; 698 } 699 if (pp) 700 bufcat(h, pp); 701 } 702 703 void 704 bufcat_su(struct html *h, const char *p, const struct roffsu *su) 705 { 706 double v; 707 708 v = su->scale; 709 if (SCALE_MM == su->unit && 0.0 == (v /= 100.0)) 710 v = 1.0; 711 else if (SCALE_BU == su->unit) 712 v /= 24.0; 713 714 bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]); 715 } 716 717 void 718 bufcat_id(struct html *h, const char *src) 719 { 720 721 /* Cf. <http://www.w3.org/TR/html5/dom.html#the-id-attribute>. */ 722 723 for (; '\0' != *src; src++) 724 bufncat(h, *src == ' ' ? "_" : src, 1); 725 } 726