1 /* $Id: html.c,v 1.30 2012/05/28 13:00:51 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <stdarg.h> 23 #include <stdio.h> 24 #include <stdint.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #include <unistd.h> 28 29 #include "mandoc.h" 30 #include "libmandoc.h" 31 #include "out.h" 32 #include "html.h" 33 #include "main.h" 34 35 struct htmldata { 36 const char *name; 37 int flags; 38 #define HTML_CLRLINE (1 << 0) 39 #define HTML_NOSTACK (1 << 1) 40 #define HTML_AUTOCLOSE (1 << 2) /* Tag has auto-closure. */ 41 }; 42 43 static const struct htmldata htmltags[TAG_MAX] = { 44 {"html", HTML_CLRLINE}, /* TAG_HTML */ 45 {"head", HTML_CLRLINE}, /* TAG_HEAD */ 46 {"body", HTML_CLRLINE}, /* TAG_BODY */ 47 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */ 48 {"title", HTML_CLRLINE}, /* TAG_TITLE */ 49 {"div", HTML_CLRLINE}, /* TAG_DIV */ 50 {"h1", 0}, /* TAG_H1 */ 51 {"h2", 0}, /* TAG_H2 */ 52 {"span", 0}, /* TAG_SPAN */ 53 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */ 54 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */ 55 {"a", 0}, /* TAG_A */ 56 {"table", HTML_CLRLINE}, /* TAG_TABLE */ 57 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */ 58 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */ 59 {"tr", HTML_CLRLINE}, /* TAG_TR */ 60 {"td", HTML_CLRLINE}, /* TAG_TD */ 61 {"li", HTML_CLRLINE}, /* TAG_LI */ 62 {"ul", HTML_CLRLINE}, /* TAG_UL */ 63 {"ol", HTML_CLRLINE}, /* TAG_OL */ 64 {"dl", HTML_CLRLINE}, /* TAG_DL */ 65 {"dt", HTML_CLRLINE}, /* TAG_DT */ 66 {"dd", HTML_CLRLINE}, /* TAG_DD */ 67 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */ 68 {"p", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */ 69 {"pre", HTML_CLRLINE }, /* TAG_PRE */ 70 {"b", 0 }, /* TAG_B */ 71 {"i", 0 }, /* TAG_I */ 72 {"code", 0 }, /* TAG_CODE */ 73 {"small", 0 }, /* TAG_SMALL */ 74 }; 75 76 static const char *const htmlattrs[ATTR_MAX] = { 77 "http-equiv", /* ATTR_HTTPEQUIV */ 78 "content", /* ATTR_CONTENT */ 79 "name", /* ATTR_NAME */ 80 "rel", /* ATTR_REL */ 81 "href", /* ATTR_HREF */ 82 "type", /* ATTR_TYPE */ 83 "media", /* ATTR_MEDIA */ 84 "class", /* ATTR_CLASS */ 85 "style", /* ATTR_STYLE */ 86 "width", /* ATTR_WIDTH */ 87 "id", /* ATTR_ID */ 88 "summary", /* ATTR_SUMMARY */ 89 "align", /* ATTR_ALIGN */ 90 "colspan", /* ATTR_COLSPAN */ 91 }; 92 93 static const char *const roffscales[SCALE_MAX] = { 94 "cm", /* SCALE_CM */ 95 "in", /* SCALE_IN */ 96 "pc", /* SCALE_PC */ 97 "pt", /* SCALE_PT */ 98 "em", /* SCALE_EM */ 99 "em", /* SCALE_MM */ 100 "ex", /* SCALE_EN */ 101 "ex", /* SCALE_BU */ 102 "em", /* SCALE_VS */ 103 "ex", /* SCALE_FS */ 104 }; 105 106 static void bufncat(struct html *, const char *, size_t); 107 static void print_ctag(struct html *, enum htmltag); 108 static int print_encode(struct html *, const char *, int); 109 static void print_metaf(struct html *, enum mandoc_esc); 110 static void print_attr(struct html *, const char *, const char *); 111 static void *ml_alloc(char *, enum htmltype); 112 113 static void * 114 ml_alloc(char *outopts, enum htmltype type) 115 { 116 struct html *h; 117 const char *toks[5]; 118 char *v; 119 120 toks[0] = "style"; 121 toks[1] = "man"; 122 toks[2] = "includes"; 123 toks[3] = "fragment"; 124 toks[4] = NULL; 125 126 h = mandoc_calloc(1, sizeof(struct html)); 127 128 h->type = type; 129 h->tags.head = NULL; 130 h->symtab = mchars_alloc(); 131 132 while (outopts && *outopts) 133 switch (getsubopt(&outopts, UNCONST(toks), &v)) { 134 case (0): 135 h->style = v; 136 break; 137 case (1): 138 h->base_man = v; 139 break; 140 case (2): 141 h->base_includes = v; 142 break; 143 case (3): 144 h->oflags |= HTML_FRAGMENT; 145 break; 146 default: 147 break; 148 } 149 150 return(h); 151 } 152 153 void * 154 html_alloc(char *outopts) 155 { 156 157 return(ml_alloc(outopts, HTML_HTML_4_01_STRICT)); 158 } 159 160 161 void * 162 xhtml_alloc(char *outopts) 163 { 164 165 return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT)); 166 } 167 168 169 void 170 html_free(void *p) 171 { 172 struct tag *tag; 173 struct html *h; 174 175 h = (struct html *)p; 176 177 while ((tag = h->tags.head) != NULL) { 178 h->tags.head = tag->next; 179 free(tag); 180 } 181 182 if (h->symtab) 183 mchars_free(h->symtab); 184 185 free(h); 186 } 187 188 189 void 190 print_gen_head(struct html *h) 191 { 192 struct htmlpair tag[4]; 193 194 tag[0].key = ATTR_HTTPEQUIV; 195 tag[0].val = "Content-Type"; 196 tag[1].key = ATTR_CONTENT; 197 tag[1].val = "text/html; charset=utf-8"; 198 print_otag(h, TAG_META, 2, tag); 199 200 tag[0].key = ATTR_NAME; 201 tag[0].val = "resource-type"; 202 tag[1].key = ATTR_CONTENT; 203 tag[1].val = "document"; 204 print_otag(h, TAG_META, 2, tag); 205 206 if (h->style) { 207 tag[0].key = ATTR_REL; 208 tag[0].val = "stylesheet"; 209 tag[1].key = ATTR_HREF; 210 tag[1].val = h->style; 211 tag[2].key = ATTR_TYPE; 212 tag[2].val = "text/css"; 213 tag[3].key = ATTR_MEDIA; 214 tag[3].val = "all"; 215 print_otag(h, TAG_LINK, 4, tag); 216 } 217 } 218 219 static void 220 print_metaf(struct html *h, enum mandoc_esc deco) 221 { 222 enum htmlfont font; 223 224 switch (deco) { 225 case (ESCAPE_FONTPREV): 226 font = h->metal; 227 break; 228 case (ESCAPE_FONTITALIC): 229 font = HTMLFONT_ITALIC; 230 break; 231 case (ESCAPE_FONTBOLD): 232 font = HTMLFONT_BOLD; 233 break; 234 case (ESCAPE_FONT): 235 /* FALLTHROUGH */ 236 case (ESCAPE_FONTROMAN): 237 font = HTMLFONT_NONE; 238 break; 239 default: 240 abort(); 241 /* NOTREACHED */ 242 } 243 244 if (h->metaf) { 245 print_tagq(h, h->metaf); 246 h->metaf = NULL; 247 } 248 249 h->metal = h->metac; 250 h->metac = font; 251 252 if (HTMLFONT_NONE != font) 253 h->metaf = HTMLFONT_BOLD == font ? 254 print_otag(h, TAG_B, 0, NULL) : 255 print_otag(h, TAG_I, 0, NULL); 256 } 257 258 int 259 html_strlen(const char *cp) 260 { 261 size_t rsz; 262 int skip, sz; 263 264 /* 265 * Account for escaped sequences within string length 266 * calculations. This follows the logic in term_strlen() as we 267 * must calculate the width of produced strings. 268 * Assume that characters are always width of "1". This is 269 * hacky, but it gets the job done for approximation of widths. 270 */ 271 272 sz = 0; 273 skip = 0; 274 while (1) { 275 rsz = strcspn(cp, "\\"); 276 if (rsz) { 277 cp += rsz; 278 if (skip) { 279 skip = 0; 280 rsz--; 281 } 282 sz += rsz; 283 } 284 if ('\0' == *cp) 285 break; 286 cp++; 287 switch (mandoc_escape(&cp, NULL, NULL)) { 288 case (ESCAPE_ERROR): 289 return(sz); 290 case (ESCAPE_UNICODE): 291 /* FALLTHROUGH */ 292 case (ESCAPE_NUMBERED): 293 /* FALLTHROUGH */ 294 case (ESCAPE_SPECIAL): 295 if (skip) 296 skip = 0; 297 else 298 sz++; 299 break; 300 case (ESCAPE_SKIPCHAR): 301 skip = 1; 302 break; 303 default: 304 break; 305 } 306 } 307 return(sz); 308 } 309 310 static int 311 print_encode(struct html *h, const char *p, int norecurse) 312 { 313 size_t sz; 314 int c, len, nospace; 315 const char *seq; 316 enum mandoc_esc esc; 317 static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' }; 318 319 nospace = 0; 320 321 while ('\0' != *p) { 322 if (HTML_SKIPCHAR & h->flags && '\\' != *p) { 323 h->flags &= ~HTML_SKIPCHAR; 324 p++; 325 continue; 326 } 327 328 sz = strcspn(p, rejs); 329 330 fwrite(p, 1, sz, stdout); 331 p += (int)sz; 332 333 if ('\0' == *p) 334 break; 335 336 switch (*p++) { 337 case ('<'): 338 printf("<"); 339 continue; 340 case ('>'): 341 printf(">"); 342 continue; 343 case ('&'): 344 printf("&"); 345 continue; 346 case (ASCII_HYPH): 347 putchar('-'); 348 continue; 349 default: 350 break; 351 } 352 353 esc = mandoc_escape(&p, &seq, &len); 354 if (ESCAPE_ERROR == esc) 355 break; 356 357 switch (esc) { 358 case (ESCAPE_FONT): 359 /* FALLTHROUGH */ 360 case (ESCAPE_FONTPREV): 361 /* FALLTHROUGH */ 362 case (ESCAPE_FONTBOLD): 363 /* FALLTHROUGH */ 364 case (ESCAPE_FONTITALIC): 365 /* FALLTHROUGH */ 366 case (ESCAPE_FONTROMAN): 367 if (0 == norecurse) 368 print_metaf(h, esc); 369 continue; 370 case (ESCAPE_SKIPCHAR): 371 h->flags |= HTML_SKIPCHAR; 372 continue; 373 default: 374 break; 375 } 376 377 if (h->flags & HTML_SKIPCHAR) { 378 h->flags &= ~HTML_SKIPCHAR; 379 continue; 380 } 381 382 switch (esc) { 383 case (ESCAPE_UNICODE): 384 /* Skip passed "u" header. */ 385 c = mchars_num2uc(seq + 1, len - 1); 386 if ('\0' != c) 387 printf("&#x%x;", c); 388 break; 389 case (ESCAPE_NUMBERED): 390 c = mchars_num2char(seq, len); 391 if ('\0' != c) 392 putchar(c); 393 break; 394 case (ESCAPE_SPECIAL): 395 c = mchars_spec2cp(h->symtab, seq, len); 396 if (c > 0) 397 printf("&#%d;", c); 398 else if (-1 == c && 1 == len) 399 putchar((int)*seq); 400 break; 401 case (ESCAPE_NOSPACE): 402 if ('\0' == *p) 403 nospace = 1; 404 break; 405 default: 406 break; 407 } 408 } 409 410 return(nospace); 411 } 412 413 414 static void 415 print_attr(struct html *h, const char *key, const char *val) 416 { 417 printf(" %s=\"", key); 418 (void)print_encode(h, val, 1); 419 putchar('\"'); 420 } 421 422 423 struct tag * 424 print_otag(struct html *h, enum htmltag tag, 425 int sz, const struct htmlpair *p) 426 { 427 int i; 428 struct tag *t; 429 430 /* Push this tags onto the stack of open scopes. */ 431 432 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) { 433 t = mandoc_malloc(sizeof(struct tag)); 434 t->tag = tag; 435 t->next = h->tags.head; 436 h->tags.head = t; 437 } else 438 t = NULL; 439 440 if ( ! (HTML_NOSPACE & h->flags)) 441 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) { 442 /* Manage keeps! */ 443 if ( ! (HTML_KEEP & h->flags)) { 444 if (HTML_PREKEEP & h->flags) 445 h->flags |= HTML_KEEP; 446 putchar(' '); 447 } else 448 printf(" "); 449 } 450 451 if ( ! (h->flags & HTML_NONOSPACE)) 452 h->flags &= ~HTML_NOSPACE; 453 else 454 h->flags |= HTML_NOSPACE; 455 456 /* Print out the tag name and attributes. */ 457 458 printf("<%s", htmltags[tag].name); 459 for (i = 0; i < sz; i++) 460 print_attr(h, htmlattrs[p[i].key], p[i].val); 461 462 /* Add non-overridable attributes. */ 463 464 if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) { 465 print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml"); 466 print_attr(h, "xml:lang", "en"); 467 print_attr(h, "lang", "en"); 468 } 469 470 /* Accommodate for XML "well-formed" singleton escaping. */ 471 472 if (HTML_AUTOCLOSE & htmltags[tag].flags) 473 switch (h->type) { 474 case (HTML_XHTML_1_0_STRICT): 475 putchar('/'); 476 break; 477 default: 478 break; 479 } 480 481 putchar('>'); 482 483 h->flags |= HTML_NOSPACE; 484 485 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags) 486 putchar('\n'); 487 488 return(t); 489 } 490 491 492 static void 493 print_ctag(struct html *h, enum htmltag tag) 494 { 495 496 printf("</%s>", htmltags[tag].name); 497 if (HTML_CLRLINE & htmltags[tag].flags) { 498 h->flags |= HTML_NOSPACE; 499 putchar('\n'); 500 } 501 } 502 503 void 504 print_gen_decls(struct html *h) 505 { 506 const char *doctype; 507 const char *dtd; 508 const char *name; 509 510 switch (h->type) { 511 case (HTML_HTML_4_01_STRICT): 512 name = "HTML"; 513 doctype = "-//W3C//DTD HTML 4.01//EN"; 514 dtd = "http://www.w3.org/TR/html4/strict.dtd"; 515 break; 516 default: 517 puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); 518 name = "html"; 519 doctype = "-//W3C//DTD XHTML 1.0 Strict//EN"; 520 dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 521 break; 522 } 523 524 printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n", 525 name, doctype, dtd); 526 } 527 528 void 529 print_text(struct html *h, const char *word) 530 { 531 532 if ( ! (HTML_NOSPACE & h->flags)) { 533 /* Manage keeps! */ 534 if ( ! (HTML_KEEP & h->flags)) { 535 if (HTML_PREKEEP & h->flags) 536 h->flags |= HTML_KEEP; 537 putchar(' '); 538 } else 539 printf(" "); 540 } 541 542 assert(NULL == h->metaf); 543 if (HTMLFONT_NONE != h->metac) 544 h->metaf = HTMLFONT_BOLD == h->metac ? 545 print_otag(h, TAG_B, 0, NULL) : 546 print_otag(h, TAG_I, 0, NULL); 547 548 assert(word); 549 if ( ! print_encode(h, word, 0)) { 550 if ( ! (h->flags & HTML_NONOSPACE)) 551 h->flags &= ~HTML_NOSPACE; 552 } else 553 h->flags |= HTML_NOSPACE; 554 555 if (h->metaf) { 556 print_tagq(h, h->metaf); 557 h->metaf = NULL; 558 } 559 560 h->flags &= ~HTML_IGNDELIM; 561 } 562 563 564 void 565 print_tagq(struct html *h, const struct tag *until) 566 { 567 struct tag *tag; 568 569 while ((tag = h->tags.head) != NULL) { 570 /* 571 * Remember to close out and nullify the current 572 * meta-font and table, if applicable. 573 */ 574 if (tag == h->metaf) 575 h->metaf = NULL; 576 if (tag == h->tblt) 577 h->tblt = NULL; 578 print_ctag(h, tag->tag); 579 h->tags.head = tag->next; 580 free(tag); 581 if (until && tag == until) 582 return; 583 } 584 } 585 586 587 void 588 print_stagq(struct html *h, const struct tag *suntil) 589 { 590 struct tag *tag; 591 592 while ((tag = h->tags.head) != NULL) { 593 if (suntil && tag == suntil) 594 return; 595 /* 596 * Remember to close out and nullify the current 597 * meta-font and table, if applicable. 598 */ 599 if (tag == h->metaf) 600 h->metaf = NULL; 601 if (tag == h->tblt) 602 h->tblt = NULL; 603 print_ctag(h, tag->tag); 604 h->tags.head = tag->next; 605 free(tag); 606 } 607 } 608 609 void 610 bufinit(struct html *h) 611 { 612 613 h->buf[0] = '\0'; 614 h->buflen = 0; 615 } 616 617 void 618 bufcat_style(struct html *h, const char *key, const char *val) 619 { 620 621 bufcat(h, key); 622 bufcat(h, ":"); 623 bufcat(h, val); 624 bufcat(h, ";"); 625 } 626 627 void 628 bufcat(struct html *h, const char *p) 629 { 630 631 h->buflen = strlcat(h->buf, p, BUFSIZ); 632 assert(h->buflen < BUFSIZ); 633 } 634 635 void 636 bufcat_fmt(struct html *h, const char *fmt, ...) 637 { 638 va_list ap; 639 640 va_start(ap, fmt); 641 (void)vsnprintf(h->buf + (int)h->buflen, 642 BUFSIZ - h->buflen - 1, fmt, ap); 643 va_end(ap); 644 h->buflen = strlen(h->buf); 645 } 646 647 static void 648 bufncat(struct html *h, const char *p, size_t sz) 649 { 650 651 assert(h->buflen + sz + 1 < BUFSIZ); 652 strncat(h->buf, p, sz); 653 h->buflen += sz; 654 } 655 656 void 657 buffmt_includes(struct html *h, const char *name) 658 { 659 const char *p, *pp; 660 661 pp = h->base_includes; 662 663 bufinit(h); 664 while (NULL != (p = strchr(pp, '%'))) { 665 bufncat(h, pp, (size_t)(p - pp)); 666 switch (*(p + 1)) { 667 case('I'): 668 bufcat(h, name); 669 break; 670 default: 671 bufncat(h, p, 2); 672 break; 673 } 674 pp = p + 2; 675 } 676 if (pp) 677 bufcat(h, pp); 678 } 679 680 void 681 buffmt_man(struct html *h, 682 const char *name, const char *sec) 683 { 684 const char *p, *pp; 685 686 pp = h->base_man; 687 688 bufinit(h); 689 while (NULL != (p = strchr(pp, '%'))) { 690 bufncat(h, pp, (size_t)(p - pp)); 691 switch (*(p + 1)) { 692 case('S'): 693 bufcat(h, sec ? sec : "1"); 694 break; 695 case('N'): 696 bufcat_fmt(h, name); 697 break; 698 default: 699 bufncat(h, p, 2); 700 break; 701 } 702 pp = p + 2; 703 } 704 if (pp) 705 bufcat(h, pp); 706 } 707 708 void 709 bufcat_su(struct html *h, const char *p, const struct roffsu *su) 710 { 711 double v; 712 713 v = su->scale; 714 if (SCALE_MM == su->unit && 0.0 == (v /= 100.0)) 715 v = 1.0; 716 717 bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]); 718 } 719 720 void 721 bufcat_id(struct html *h, const char *src) 722 { 723 724 /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */ 725 726 while ('\0' != *src) 727 bufcat_fmt(h, "%.2x", *src++); 728 } 729