1 /* $Id: read.c,v 1.3 2011/09/18 10:25:28 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2010, 2011 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/stat.h> 19 #include <sys/mman.h> 20 21 #include <assert.h> 22 #include <ctype.h> 23 #include <fcntl.h> 24 #include <stdarg.h> 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <unistd.h> 29 30 #include "mandoc.h" 31 #include "libmandoc.h" 32 #include "mdoc.h" 33 #include "man.h" 34 35 #define REPARSE_LIMIT 1000 36 37 struct buf { 38 char *buf; /* binary input buffer */ 39 size_t sz; /* size of binary buffer */ 40 }; 41 42 struct mparse { 43 enum mandoclevel file_status; /* status of current parse */ 44 enum mandoclevel wlevel; /* ignore messages below this */ 45 int line; /* line number in the file */ 46 enum mparset inttype; /* which parser to use */ 47 struct man *pman; /* persistent man parser */ 48 struct mdoc *pmdoc; /* persistent mdoc parser */ 49 struct man *man; /* man parser */ 50 struct mdoc *mdoc; /* mdoc parser */ 51 struct roff *roff; /* roff parser (!NULL) */ 52 int reparse_count; /* finite interp. stack */ 53 mandocmsg mmsg; /* warning/error message handler */ 54 void *arg; /* argument to mmsg */ 55 const char *file; 56 }; 57 58 static void resize_buf(struct buf *, size_t); 59 static void mparse_buf_r(struct mparse *, struct buf, int); 60 static void mparse_readfd_r(struct mparse *, int, const char *, int); 61 static void pset(const char *, int, struct mparse *); 62 static void pdesc(struct mparse *, const char *, int); 63 static int read_whole_file(const char *, int, struct buf *, int *); 64 static void mparse_end(struct mparse *); 65 66 static const enum mandocerr mandoclimits[MANDOCLEVEL_MAX] = { 67 MANDOCERR_OK, 68 MANDOCERR_WARNING, 69 MANDOCERR_WARNING, 70 MANDOCERR_ERROR, 71 MANDOCERR_FATAL, 72 MANDOCERR_MAX, 73 MANDOCERR_MAX 74 }; 75 76 static const char * const mandocerrs[MANDOCERR_MAX] = { 77 "ok", 78 79 "generic warning", 80 81 /* related to the prologue */ 82 "no title in document", 83 "document title should be all caps", 84 "unknown manual section", 85 "date missing, using today's date", 86 "cannot parse date, using it verbatim", 87 "prologue macros out of order", 88 "duplicate prologue macro", 89 "macro not allowed in prologue", 90 "macro not allowed in body", 91 92 /* related to document structure */ 93 ".so is fragile, better use ln(1)", 94 "NAME section must come first", 95 "bad NAME section contents", 96 "manual name not yet set", 97 "sections out of conventional order", 98 "duplicate section name", 99 "section not in conventional manual section", 100 101 /* related to macros and nesting */ 102 "skipping obsolete macro", 103 "skipping paragraph macro", 104 "skipping no-space macro", 105 "blocks badly nested", 106 "child violates parent syntax", 107 "nested displays are not portable", 108 "already in literal mode", 109 "line scope broken", 110 111 /* related to missing macro arguments */ 112 "skipping empty macro", 113 "argument count wrong", 114 "missing display type", 115 "list type must come first", 116 "tag lists require a width argument", 117 "missing font type", 118 "skipping end of block that is not open", 119 120 /* related to bad macro arguments */ 121 "skipping argument", 122 "duplicate argument", 123 "duplicate display type", 124 "duplicate list type", 125 "unknown AT&T UNIX version", 126 "bad Boolean value", 127 "unknown font", 128 "unknown standard specifier", 129 "bad width argument", 130 131 /* related to plain text */ 132 "blank line in non-literal context", 133 "tab in non-literal context", 134 "end of line whitespace", 135 "bad comment style", 136 "bad escape sequence", 137 "unterminated quoted string", 138 139 /* related to equations */ 140 "unexpected literal in equation", 141 142 "generic error", 143 144 /* related to equations */ 145 "unexpected equation scope closure", 146 "equation scope open on exit", 147 "overlapping equation scopes", 148 "unexpected end of equation", 149 "equation syntax error", 150 151 /* related to tables */ 152 "bad table syntax", 153 "bad table option", 154 "bad table layout", 155 "no table layout cells specified", 156 "no table data cells specified", 157 "ignore data in cell", 158 "data block still open", 159 "ignoring extra data cells", 160 161 "input stack limit exceeded, infinite loop?", 162 "skipping bad character", 163 "escaped character not allowed in a name", 164 "skipping text before the first section header", 165 "skipping unknown macro", 166 "NOT IMPLEMENTED, please use groff: skipping request", 167 "argument count wrong", 168 "skipping end of block that is not open", 169 "missing end of block", 170 "scope open on exit", 171 "uname(3) system call failed", 172 "macro requires line argument(s)", 173 "macro requires body argument(s)", 174 "macro requires argument(s)", 175 "missing list type", 176 "line argument(s) will be lost", 177 "body argument(s) will be lost", 178 179 "generic fatal error", 180 181 "not a manual", 182 "column syntax is inconsistent", 183 "NOT IMPLEMENTED: .Bd -file", 184 "line scope broken, syntax violated", 185 "argument count wrong, violates syntax", 186 "child violates parent syntax", 187 "argument count wrong, violates syntax", 188 "NOT IMPLEMENTED: .so with absolute path or \"..\"", 189 "no document body", 190 "no document prologue", 191 "static buffer exhausted", 192 }; 193 194 static const char * const mandoclevels[MANDOCLEVEL_MAX] = { 195 "SUCCESS", 196 "RESERVED", 197 "WARNING", 198 "ERROR", 199 "FATAL", 200 "BADARG", 201 "SYSERR" 202 }; 203 204 static void 205 resize_buf(struct buf *buf, size_t initial) 206 { 207 208 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 209 buf->buf = mandoc_realloc(buf->buf, buf->sz); 210 } 211 212 static void 213 pset(const char *buf, int pos, struct mparse *curp) 214 { 215 int i; 216 217 /* 218 * Try to intuit which kind of manual parser should be used. If 219 * passed in by command-line (-man, -mdoc), then use that 220 * explicitly. If passed as -mandoc, then try to guess from the 221 * line: either skip dot-lines, use -mdoc when finding `.Dt', or 222 * default to -man, which is more lenient. 223 * 224 * Separate out pmdoc/pman from mdoc/man: the first persists 225 * through all parsers, while the latter is used per-parse. 226 */ 227 228 if ('.' == buf[0] || '\'' == buf[0]) { 229 for (i = 1; buf[i]; i++) 230 if (' ' != buf[i] && '\t' != buf[i]) 231 break; 232 if ('\0' == buf[i]) 233 return; 234 } 235 236 switch (curp->inttype) { 237 case (MPARSE_MDOC): 238 if (NULL == curp->pmdoc) 239 curp->pmdoc = mdoc_alloc(curp->roff, curp); 240 assert(curp->pmdoc); 241 curp->mdoc = curp->pmdoc; 242 return; 243 case (MPARSE_MAN): 244 if (NULL == curp->pman) 245 curp->pman = man_alloc(curp->roff, curp); 246 assert(curp->pman); 247 curp->man = curp->pman; 248 return; 249 default: 250 break; 251 } 252 253 if (pos >= 3 && 0 == memcmp(buf, ".Dd", 3)) { 254 if (NULL == curp->pmdoc) 255 curp->pmdoc = mdoc_alloc(curp->roff, curp); 256 assert(curp->pmdoc); 257 curp->mdoc = curp->pmdoc; 258 return; 259 } 260 261 if (NULL == curp->pman) 262 curp->pman = man_alloc(curp->roff, curp); 263 assert(curp->pman); 264 curp->man = curp->pman; 265 } 266 267 /* 268 * Main parse routine for an opened file. This is called for each 269 * opened file and simply loops around the full input file, possibly 270 * nesting (i.e., with `so'). 271 */ 272 static void 273 mparse_buf_r(struct mparse *curp, struct buf blk, int start) 274 { 275 const struct tbl_span *span; 276 struct buf ln; 277 enum rofferr rr; 278 int i, of, rc; 279 int pos; /* byte number in the ln buffer */ 280 int lnn; /* line number in the real file */ 281 unsigned char c; 282 283 memset(&ln, 0, sizeof(struct buf)); 284 285 lnn = curp->line; 286 pos = 0; 287 288 for (i = 0; i < (int)blk.sz; ) { 289 if (0 == pos && '\0' == blk.buf[i]) 290 break; 291 292 if (start) { 293 curp->line = lnn; 294 curp->reparse_count = 0; 295 } 296 297 while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) { 298 299 /* 300 * When finding an unescaped newline character, 301 * leave the character loop to process the line. 302 * Skip a preceding carriage return, if any. 303 */ 304 305 if ('\r' == blk.buf[i] && i + 1 < (int)blk.sz && 306 '\n' == blk.buf[i + 1]) 307 ++i; 308 if ('\n' == blk.buf[i]) { 309 ++i; 310 ++lnn; 311 break; 312 } 313 314 /* 315 * Warn about bogus characters. If you're using 316 * non-ASCII encoding, you're screwing your 317 * readers. Since I'd rather this not happen, 318 * I'll be helpful and drop these characters so 319 * we don't display gibberish. Note to manual 320 * writers: use special characters. 321 */ 322 323 c = (unsigned char) blk.buf[i]; 324 325 if ( ! (isascii(c) && 326 (isgraph(c) || isblank(c)))) { 327 mandoc_msg(MANDOCERR_BADCHAR, curp, 328 curp->line, pos, "ignoring byte"); 329 i++; 330 continue; 331 } 332 333 /* Trailing backslash = a plain char. */ 334 335 if ('\\' != blk.buf[i] || i + 1 == (int)blk.sz) { 336 if (pos >= (int)ln.sz) 337 resize_buf(&ln, 256); 338 ln.buf[pos++] = blk.buf[i++]; 339 continue; 340 } 341 342 /* 343 * Found escape and at least one other character. 344 * When it's a newline character, skip it. 345 * When there is a carriage return in between, 346 * skip that one as well. 347 */ 348 349 if ('\r' == blk.buf[i + 1] && i + 2 < (int)blk.sz && 350 '\n' == blk.buf[i + 2]) 351 ++i; 352 if ('\n' == blk.buf[i + 1]) { 353 i += 2; 354 ++lnn; 355 continue; 356 } 357 358 if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) { 359 i += 2; 360 /* Comment, skip to end of line */ 361 for (; i < (int)blk.sz; ++i) { 362 if ('\n' == blk.buf[i]) { 363 ++i; 364 ++lnn; 365 break; 366 } 367 } 368 369 /* Backout trailing whitespaces */ 370 for (; pos > 0; --pos) { 371 if (ln.buf[pos - 1] != ' ') 372 break; 373 if (pos > 2 && ln.buf[pos - 2] == '\\') 374 break; 375 } 376 break; 377 } 378 379 /* Some other escape sequence, copy & cont. */ 380 381 if (pos + 1 >= (int)ln.sz) 382 resize_buf(&ln, 256); 383 384 ln.buf[pos++] = blk.buf[i++]; 385 ln.buf[pos++] = blk.buf[i++]; 386 } 387 388 if (pos >= (int)ln.sz) 389 resize_buf(&ln, 256); 390 391 ln.buf[pos] = '\0'; 392 393 /* 394 * A significant amount of complexity is contained by 395 * the roff preprocessor. It's line-oriented but can be 396 * expressed on one line, so we need at times to 397 * readjust our starting point and re-run it. The roff 398 * preprocessor can also readjust the buffers with new 399 * data, so we pass them in wholesale. 400 */ 401 402 of = 0; 403 404 rerun: 405 rr = roff_parseln 406 (curp->roff, curp->line, 407 &ln.buf, &ln.sz, of, &of); 408 409 switch (rr) { 410 case (ROFF_REPARSE): 411 if (REPARSE_LIMIT >= ++curp->reparse_count) 412 mparse_buf_r(curp, ln, 0); 413 else 414 mandoc_msg(MANDOCERR_ROFFLOOP, curp, 415 curp->line, pos, NULL); 416 pos = 0; 417 continue; 418 case (ROFF_APPEND): 419 pos = (int)strlen(ln.buf); 420 continue; 421 case (ROFF_RERUN): 422 goto rerun; 423 case (ROFF_IGN): 424 pos = 0; 425 continue; 426 case (ROFF_ERR): 427 assert(MANDOCLEVEL_FATAL <= curp->file_status); 428 break; 429 case (ROFF_SO): 430 mparse_readfd_r(curp, -1, ln.buf + of, 1); 431 if (MANDOCLEVEL_FATAL <= curp->file_status) 432 break; 433 pos = 0; 434 continue; 435 default: 436 break; 437 } 438 439 /* 440 * If we encounter errors in the recursive parse, make 441 * sure we don't continue parsing. 442 */ 443 444 if (MANDOCLEVEL_FATAL <= curp->file_status) 445 break; 446 447 /* 448 * If input parsers have not been allocated, do so now. 449 * We keep these instanced between parsers, but set them 450 * locally per parse routine since we can use different 451 * parsers with each one. 452 */ 453 454 if ( ! (curp->man || curp->mdoc)) 455 pset(ln.buf + of, pos - of, curp); 456 457 /* 458 * Lastly, push down into the parsers themselves. One 459 * of these will have already been set in the pset() 460 * routine. 461 * If libroff returns ROFF_TBL, then add it to the 462 * currently open parse. Since we only get here if 463 * there does exist data (see tbl_data.c), we're 464 * guaranteed that something's been allocated. 465 * Do the same for ROFF_EQN. 466 */ 467 468 rc = -1; 469 470 if (ROFF_TBL == rr) 471 while (NULL != (span = roff_span(curp->roff))) { 472 rc = curp->man ? 473 man_addspan(curp->man, span) : 474 mdoc_addspan(curp->mdoc, span); 475 if (0 == rc) 476 break; 477 } 478 else if (ROFF_EQN == rr) 479 rc = curp->mdoc ? 480 mdoc_addeqn(curp->mdoc, 481 roff_eqn(curp->roff)) : 482 man_addeqn(curp->man, 483 roff_eqn(curp->roff)); 484 else if (curp->man || curp->mdoc) 485 rc = curp->man ? 486 man_parseln(curp->man, 487 curp->line, ln.buf, of) : 488 mdoc_parseln(curp->mdoc, 489 curp->line, ln.buf, of); 490 491 if (0 == rc) { 492 assert(MANDOCLEVEL_FATAL <= curp->file_status); 493 break; 494 } 495 496 /* Temporary buffers typically are not full. */ 497 498 if (0 == start && '\0' == blk.buf[i]) 499 break; 500 501 /* Start the next input line. */ 502 503 pos = 0; 504 } 505 506 free(ln.buf); 507 } 508 509 static void 510 pdesc(struct mparse *curp, const char *file, int fd) 511 { 512 struct buf blk; 513 int with_mmap; 514 515 /* 516 * Run for each opened file; may be called more than once for 517 * each full parse sequence if the opened file is nested (i.e., 518 * from `so'). Simply sucks in the whole file and moves into 519 * the parse phase for the file. 520 */ 521 522 if ( ! read_whole_file(file, fd, &blk, &with_mmap)) { 523 curp->file_status = MANDOCLEVEL_SYSERR; 524 return; 525 } 526 527 /* Line number is per-file. */ 528 529 curp->line = 1; 530 531 mparse_buf_r(curp, blk, 1); 532 533 if (with_mmap) 534 munmap(blk.buf, blk.sz); 535 else 536 free(blk.buf); 537 } 538 539 static int 540 read_whole_file(const char *file, int fd, struct buf *fb, int *with_mmap) 541 { 542 struct stat st; 543 size_t off; 544 ssize_t ssz; 545 546 if (-1 == fstat(fd, &st)) { 547 perror(file); 548 return(0); 549 } 550 551 /* 552 * If we're a regular file, try just reading in the whole entry 553 * via mmap(). This is faster than reading it into blocks, and 554 * since each file is only a few bytes to begin with, I'm not 555 * concerned that this is going to tank any machines. 556 */ 557 558 if (S_ISREG(st.st_mode)) { 559 if (st.st_size >= (1U << 31)) { 560 fprintf(stderr, "%s: input too large\n", file); 561 return(0); 562 } 563 *with_mmap = 1; 564 fb->sz = (size_t)st.st_size; 565 fb->buf = mmap(NULL, fb->sz, PROT_READ, 566 MAP_FILE|MAP_SHARED, fd, 0); 567 if (fb->buf != MAP_FAILED) 568 return(1); 569 } 570 571 /* 572 * If this isn't a regular file (like, say, stdin), then we must 573 * go the old way and just read things in bit by bit. 574 */ 575 576 *with_mmap = 0; 577 off = 0; 578 fb->sz = 0; 579 fb->buf = NULL; 580 for (;;) { 581 if (off == fb->sz) { 582 if (fb->sz == (1U << 31)) { 583 fprintf(stderr, "%s: input too large\n", file); 584 break; 585 } 586 resize_buf(fb, 65536); 587 } 588 ssz = read(fd, fb->buf + (int)off, fb->sz - off); 589 if (ssz == 0) { 590 fb->sz = off; 591 return(1); 592 } 593 if (ssz == -1) { 594 perror(file); 595 break; 596 } 597 off += (size_t)ssz; 598 } 599 600 free(fb->buf); 601 fb->buf = NULL; 602 return(0); 603 } 604 605 static void 606 mparse_end(struct mparse *curp) 607 { 608 609 if (MANDOCLEVEL_FATAL <= curp->file_status) 610 return; 611 612 if (curp->mdoc && ! mdoc_endparse(curp->mdoc)) { 613 assert(MANDOCLEVEL_FATAL <= curp->file_status); 614 return; 615 } 616 617 if (curp->man && ! man_endparse(curp->man)) { 618 assert(MANDOCLEVEL_FATAL <= curp->file_status); 619 return; 620 } 621 622 if ( ! (curp->man || curp->mdoc)) { 623 mandoc_msg(MANDOCERR_NOTMANUAL, curp, 1, 0, NULL); 624 curp->file_status = MANDOCLEVEL_FATAL; 625 return; 626 } 627 628 roff_endparse(curp->roff); 629 } 630 631 static void 632 mparse_readfd_r(struct mparse *curp, int fd, const char *file, int re) 633 { 634 const char *svfile; 635 636 if (-1 == fd) 637 if (-1 == (fd = open(file, O_RDONLY, 0))) { 638 perror(file); 639 curp->file_status = MANDOCLEVEL_SYSERR; 640 return; 641 } 642 643 svfile = curp->file; 644 curp->file = file; 645 646 pdesc(curp, file, fd); 647 648 if (0 == re && MANDOCLEVEL_FATAL > curp->file_status) 649 mparse_end(curp); 650 651 if (STDIN_FILENO != fd && -1 == close(fd)) 652 perror(file); 653 654 curp->file = svfile; 655 } 656 657 enum mandoclevel 658 mparse_readfd(struct mparse *curp, int fd, const char *file) 659 { 660 661 mparse_readfd_r(curp, fd, file, 0); 662 return(curp->file_status); 663 } 664 665 struct mparse * 666 mparse_alloc(enum mparset inttype, enum mandoclevel wlevel, mandocmsg mmsg, void *arg) 667 { 668 struct mparse *curp; 669 670 assert(wlevel <= MANDOCLEVEL_FATAL); 671 672 curp = mandoc_calloc(1, sizeof(struct mparse)); 673 674 curp->wlevel = wlevel; 675 curp->mmsg = mmsg; 676 curp->arg = arg; 677 curp->inttype = inttype; 678 679 curp->roff = roff_alloc(curp); 680 return(curp); 681 } 682 683 void 684 mparse_reset(struct mparse *curp) 685 { 686 687 roff_reset(curp->roff); 688 689 if (curp->mdoc) 690 mdoc_reset(curp->mdoc); 691 if (curp->man) 692 man_reset(curp->man); 693 694 curp->file_status = MANDOCLEVEL_OK; 695 curp->mdoc = NULL; 696 curp->man = NULL; 697 } 698 699 void 700 mparse_free(struct mparse *curp) 701 { 702 703 if (curp->pmdoc) 704 mdoc_free(curp->pmdoc); 705 if (curp->pman) 706 man_free(curp->pman); 707 if (curp->roff) 708 roff_free(curp->roff); 709 710 free(curp); 711 } 712 713 void 714 mparse_result(struct mparse *curp, struct mdoc **mdoc, struct man **man) 715 { 716 717 if (mdoc) 718 *mdoc = curp->mdoc; 719 if (man) 720 *man = curp->man; 721 } 722 723 void 724 mandoc_vmsg(enum mandocerr t, struct mparse *m, 725 int ln, int pos, const char *fmt, ...) 726 { 727 char buf[256]; 728 va_list ap; 729 730 va_start(ap, fmt); 731 vsnprintf(buf, sizeof(buf) - 1, fmt, ap); 732 va_end(ap); 733 734 mandoc_msg(t, m, ln, pos, buf); 735 } 736 737 void 738 mandoc_msg(enum mandocerr er, struct mparse *m, 739 int ln, int col, const char *msg) 740 { 741 enum mandoclevel level; 742 743 level = MANDOCLEVEL_FATAL; 744 while (er < mandoclimits[level]) 745 level--; 746 747 if (level < m->wlevel) 748 return; 749 750 if (m->mmsg) 751 (*m->mmsg)(er, level, m->file, ln, col, msg); 752 753 if (m->file_status < level) 754 m->file_status = level; 755 } 756 757 const char * 758 mparse_strerror(enum mandocerr er) 759 { 760 761 return(mandocerrs[er]); 762 } 763 764 const char * 765 mparse_strlevel(enum mandoclevel lvl) 766 { 767 return(mandoclevels[lvl]); 768 } 769