1 /* $OpenBSD: read.c,v 1.190 2020/04/24 11:58:02 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2010-2020 Ingo Schwarze <schwarze@openbsd.org> 4 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 * 19 * Top-level functions of the mandoc(3) parser: 20 * Parser and input encoding selection, decompression, 21 * handling of input bytes, characters, lines, and files, 22 * handling of roff(7) loops and file inclusion, 23 * and steering of the various parsers. 24 */ 25 #include <sys/types.h> 26 #include <sys/mman.h> 27 #include <sys/stat.h> 28 29 #include <assert.h> 30 #include <ctype.h> 31 #include <errno.h> 32 #include <fcntl.h> 33 #include <stdarg.h> 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include <string.h> 37 #include <unistd.h> 38 #include <zlib.h> 39 40 #include "mandoc_aux.h" 41 #include "mandoc.h" 42 #include "roff.h" 43 #include "mdoc.h" 44 #include "man.h" 45 #include "mandoc_parse.h" 46 #include "libmandoc.h" 47 #include "roff_int.h" 48 #include "tag.h" 49 50 #define REPARSE_LIMIT 1000 51 52 struct mparse { 53 struct roff *roff; /* roff parser (!NULL) */ 54 struct roff_man *man; /* man parser */ 55 struct buf *primary; /* buffer currently being parsed */ 56 struct buf *secondary; /* copy of top level input */ 57 struct buf *loop; /* open .while request line */ 58 const char *os_s; /* default operating system */ 59 int options; /* parser options */ 60 int gzip; /* current input file is gzipped */ 61 int filenc; /* encoding of the current file */ 62 int reparse_count; /* finite interp. stack */ 63 int line; /* line number in the file */ 64 }; 65 66 static void choose_parser(struct mparse *); 67 static void free_buf_list(struct buf *); 68 static void resize_buf(struct buf *, size_t); 69 static int mparse_buf_r(struct mparse *, struct buf, size_t, int); 70 static int read_whole_file(struct mparse *, int, struct buf *, int *); 71 static void mparse_end(struct mparse *); 72 73 74 static void 75 resize_buf(struct buf *buf, size_t initial) 76 { 77 78 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 79 buf->buf = mandoc_realloc(buf->buf, buf->sz); 80 } 81 82 static void 83 free_buf_list(struct buf *buf) 84 { 85 struct buf *tmp; 86 87 while (buf != NULL) { 88 tmp = buf; 89 buf = tmp->next; 90 free(tmp->buf); 91 free(tmp); 92 } 93 } 94 95 static void 96 choose_parser(struct mparse *curp) 97 { 98 char *cp, *ep; 99 int format; 100 101 /* 102 * If neither command line arguments -mdoc or -man select 103 * a parser nor the roff parser found a .Dd or .TH macro 104 * yet, look ahead in the main input buffer. 105 */ 106 107 if ((format = roff_getformat(curp->roff)) == 0) { 108 cp = curp->primary->buf; 109 ep = cp + curp->primary->sz; 110 while (cp < ep) { 111 if (*cp == '.' || *cp == '\'') { 112 cp++; 113 if (cp[0] == 'D' && cp[1] == 'd') { 114 format = MPARSE_MDOC; 115 break; 116 } 117 if (cp[0] == 'T' && cp[1] == 'H') { 118 format = MPARSE_MAN; 119 break; 120 } 121 } 122 cp = memchr(cp, '\n', ep - cp); 123 if (cp == NULL) 124 break; 125 cp++; 126 } 127 } 128 129 if (format == MPARSE_MDOC) { 130 curp->man->meta.macroset = MACROSET_MDOC; 131 if (curp->man->mdocmac == NULL) 132 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 133 } else { 134 curp->man->meta.macroset = MACROSET_MAN; 135 if (curp->man->manmac == NULL) 136 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 137 } 138 curp->man->meta.first->tok = TOKEN_NONE; 139 } 140 141 /* 142 * Main parse routine for a buffer. 143 * It assumes encoding and line numbering are already set up. 144 * It can recurse directly (for invocations of user-defined 145 * macros, inline equations, and input line traps) 146 * and indirectly (for .so file inclusion). 147 */ 148 static int 149 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start) 150 { 151 struct buf ln; 152 struct buf *firstln, *lastln, *thisln, *loop; 153 char *cp; 154 size_t pos; /* byte number in the ln buffer */ 155 int line_result, result; 156 int of; 157 int lnn; /* line number in the real file */ 158 int fd; 159 int inloop; /* Saw .while on this level. */ 160 unsigned char c; 161 162 ln.sz = 256; 163 ln.buf = mandoc_malloc(ln.sz); 164 ln.next = NULL; 165 firstln = lastln = loop = NULL; 166 lnn = curp->line; 167 pos = 0; 168 inloop = 0; 169 result = ROFF_CONT; 170 171 while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) { 172 if (start) { 173 curp->line = lnn; 174 curp->reparse_count = 0; 175 176 if (lnn < 3 && 177 curp->filenc & MPARSE_UTF8 && 178 curp->filenc & MPARSE_LATIN1) 179 curp->filenc = preconv_cue(&blk, i); 180 } 181 182 while (i < blk.sz && (start || blk.buf[i] != '\0')) { 183 184 /* 185 * When finding an unescaped newline character, 186 * leave the character loop to process the line. 187 * Skip a preceding carriage return, if any. 188 */ 189 190 if ('\r' == blk.buf[i] && i + 1 < blk.sz && 191 '\n' == blk.buf[i + 1]) 192 ++i; 193 if ('\n' == blk.buf[i]) { 194 ++i; 195 ++lnn; 196 break; 197 } 198 199 /* 200 * Make sure we have space for the worst 201 * case of 12 bytes: "\\[u10ffff]\n\0" 202 */ 203 204 if (pos + 12 > ln.sz) 205 resize_buf(&ln, 256); 206 207 /* 208 * Encode 8-bit input. 209 */ 210 211 c = blk.buf[i]; 212 if (c & 0x80) { 213 if ( ! (curp->filenc && preconv_encode( 214 &blk, &i, &ln, &pos, &curp->filenc))) { 215 mandoc_msg(MANDOCERR_CHAR_BAD, 216 curp->line, pos, "0x%x", c); 217 ln.buf[pos++] = '?'; 218 i++; 219 } 220 continue; 221 } 222 223 /* 224 * Exclude control characters. 225 */ 226 227 if (c == 0x7f || (c < 0x20 && c != 0x09)) { 228 mandoc_msg(c == 0x00 || c == 0x04 || 229 c > 0x0a ? MANDOCERR_CHAR_BAD : 230 MANDOCERR_CHAR_UNSUPP, 231 curp->line, pos, "0x%x", c); 232 i++; 233 if (c != '\r') 234 ln.buf[pos++] = '?'; 235 continue; 236 } 237 238 ln.buf[pos++] = blk.buf[i++]; 239 } 240 ln.buf[pos] = '\0'; 241 242 /* 243 * Maintain a lookaside buffer of all lines. 244 * parsed from this input source. 245 */ 246 247 thisln = mandoc_malloc(sizeof(*thisln)); 248 thisln->buf = mandoc_strdup(ln.buf); 249 thisln->sz = strlen(ln.buf) + 1; 250 thisln->next = NULL; 251 if (firstln == NULL) { 252 firstln = lastln = thisln; 253 if (curp->secondary == NULL) 254 curp->secondary = firstln; 255 } else { 256 lastln->next = thisln; 257 lastln = thisln; 258 } 259 260 /* XXX Ugly hack to mark the end of the input. */ 261 262 if (i == blk.sz || blk.buf[i] == '\0') { 263 if (pos + 2 > ln.sz) 264 resize_buf(&ln, 256); 265 ln.buf[pos++] = '\n'; 266 ln.buf[pos] = '\0'; 267 } 268 269 /* 270 * A significant amount of complexity is contained by 271 * the roff preprocessor. It's line-oriented but can be 272 * expressed on one line, so we need at times to 273 * readjust our starting point and re-run it. The roff 274 * preprocessor can also readjust the buffers with new 275 * data, so we pass them in wholesale. 276 */ 277 278 of = 0; 279 rerun: 280 line_result = roff_parseln(curp->roff, curp->line, &ln, &of); 281 282 /* Process options. */ 283 284 if (line_result & ROFF_APPEND) 285 assert(line_result == (ROFF_IGN | ROFF_APPEND)); 286 287 if (line_result & ROFF_USERCALL) 288 assert((line_result & ROFF_MASK) == ROFF_REPARSE); 289 290 if (line_result & ROFF_USERRET) { 291 assert(line_result == (ROFF_IGN | ROFF_USERRET)); 292 if (start == 0) { 293 /* Return from the current macro. */ 294 result = ROFF_USERRET; 295 goto out; 296 } 297 } 298 299 switch (line_result & ROFF_LOOPMASK) { 300 case ROFF_IGN: 301 break; 302 case ROFF_WHILE: 303 if (curp->loop != NULL) { 304 if (loop == curp->loop) 305 break; 306 mandoc_msg(MANDOCERR_WHILE_NEST, 307 curp->line, pos, NULL); 308 } 309 curp->loop = thisln; 310 loop = NULL; 311 inloop = 1; 312 break; 313 case ROFF_LOOPCONT: 314 case ROFF_LOOPEXIT: 315 if (curp->loop == NULL) { 316 mandoc_msg(MANDOCERR_WHILE_FAIL, 317 curp->line, pos, NULL); 318 break; 319 } 320 if (inloop == 0) { 321 mandoc_msg(MANDOCERR_WHILE_INTO, 322 curp->line, pos, NULL); 323 curp->loop = loop = NULL; 324 break; 325 } 326 if (line_result & ROFF_LOOPCONT) 327 loop = curp->loop; 328 else { 329 curp->loop = loop = NULL; 330 inloop = 0; 331 } 332 break; 333 default: 334 abort(); 335 } 336 337 /* Process the main instruction from the roff parser. */ 338 339 switch (line_result & ROFF_MASK) { 340 case ROFF_IGN: 341 break; 342 case ROFF_CONT: 343 if (curp->man->meta.macroset == MACROSET_NONE) 344 choose_parser(curp); 345 if ((curp->man->meta.macroset == MACROSET_MDOC ? 346 mdoc_parseln(curp->man, curp->line, ln.buf, of) : 347 man_parseln(curp->man, curp->line, ln.buf, of) 348 ) == 2) 349 goto out; 350 break; 351 case ROFF_RERUN: 352 goto rerun; 353 case ROFF_REPARSE: 354 if (++curp->reparse_count > REPARSE_LIMIT) { 355 /* Abort and return to the top level. */ 356 result = ROFF_IGN; 357 mandoc_msg(MANDOCERR_ROFFLOOP, 358 curp->line, pos, NULL); 359 goto out; 360 } 361 result = mparse_buf_r(curp, ln, of, 0); 362 if (line_result & ROFF_USERCALL) { 363 roff_userret(curp->roff); 364 /* Continue normally. */ 365 if (result & ROFF_USERRET) 366 result = ROFF_CONT; 367 } 368 if (start == 0 && result != ROFF_CONT) 369 goto out; 370 break; 371 case ROFF_SO: 372 if ( ! (curp->options & MPARSE_SO) && 373 (i >= blk.sz || blk.buf[i] == '\0')) { 374 curp->man->meta.sodest = 375 mandoc_strdup(ln.buf + of); 376 goto out; 377 } 378 if ((fd = mparse_open(curp, ln.buf + of)) != -1) { 379 mparse_readfd(curp, fd, ln.buf + of); 380 close(fd); 381 } else { 382 mandoc_msg(MANDOCERR_SO_FAIL, 383 curp->line, of, ".so %s: %s", 384 ln.buf + of, strerror(errno)); 385 ln.sz = mandoc_asprintf(&cp, 386 ".sp\nSee the file %s.\n.sp", 387 ln.buf + of); 388 free(ln.buf); 389 ln.buf = cp; 390 of = 0; 391 mparse_buf_r(curp, ln, of, 0); 392 } 393 break; 394 default: 395 abort(); 396 } 397 398 /* Start the next input line. */ 399 400 if (loop != NULL && 401 (line_result & ROFF_LOOPMASK) == ROFF_IGN) 402 loop = loop->next; 403 404 if (loop != NULL) { 405 if ((line_result & ROFF_APPEND) == 0) 406 *ln.buf = '\0'; 407 if (ln.sz < loop->sz) 408 resize_buf(&ln, loop->sz); 409 (void)strlcat(ln.buf, loop->buf, ln.sz); 410 of = 0; 411 goto rerun; 412 } 413 414 pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0; 415 } 416 out: 417 if (inloop) { 418 if (result != ROFF_USERRET) 419 mandoc_msg(MANDOCERR_WHILE_OUTOF, 420 curp->line, pos, NULL); 421 curp->loop = NULL; 422 } 423 free(ln.buf); 424 if (firstln != curp->secondary) 425 free_buf_list(firstln); 426 return result; 427 } 428 429 static int 430 read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap) 431 { 432 struct stat st; 433 gzFile gz; 434 size_t off; 435 ssize_t ssz; 436 int gzerrnum, retval; 437 438 if (fstat(fd, &st) == -1) { 439 mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno)); 440 return -1; 441 } 442 443 /* 444 * If we're a regular file, try just reading in the whole entry 445 * via mmap(). This is faster than reading it into blocks, and 446 * since each file is only a few bytes to begin with, I'm not 447 * concerned that this is going to tank any machines. 448 */ 449 450 if (curp->gzip == 0 && S_ISREG(st.st_mode)) { 451 if (st.st_size > 0x7fffffff) { 452 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 453 return -1; 454 } 455 *with_mmap = 1; 456 fb->sz = (size_t)st.st_size; 457 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 458 if (fb->buf != MAP_FAILED) 459 return 0; 460 } 461 462 if (curp->gzip) { 463 /* 464 * Duplicating the file descriptor is required 465 * because we will have to call gzclose(3) 466 * to free memory used internally by zlib, 467 * but that will also close the file descriptor, 468 * which this function must not do. 469 */ 470 if ((fd = dup(fd)) == -1) { 471 mandoc_msg(MANDOCERR_DUP, 0, 0, 472 "%s", strerror(errno)); 473 return -1; 474 } 475 if ((gz = gzdopen(fd, "rb")) == NULL) { 476 mandoc_msg(MANDOCERR_GZDOPEN, 0, 0, 477 "%s", strerror(errno)); 478 close(fd); 479 return -1; 480 } 481 } else 482 gz = NULL; 483 484 /* 485 * If this isn't a regular file (like, say, stdin), then we must 486 * go the old way and just read things in bit by bit. 487 */ 488 489 *with_mmap = 0; 490 off = 0; 491 retval = -1; 492 fb->sz = 0; 493 fb->buf = NULL; 494 for (;;) { 495 if (off == fb->sz) { 496 if (fb->sz == (1U << 31)) { 497 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 498 break; 499 } 500 resize_buf(fb, 65536); 501 } 502 ssz = curp->gzip ? 503 gzread(gz, fb->buf + (int)off, fb->sz - off) : 504 read(fd, fb->buf + (int)off, fb->sz - off); 505 if (ssz == 0) { 506 fb->sz = off; 507 retval = 0; 508 break; 509 } 510 if (ssz == -1) { 511 if (curp->gzip) 512 (void)gzerror(gz, &gzerrnum); 513 mandoc_msg(MANDOCERR_READ, 0, 0, "%s", 514 curp->gzip && gzerrnum != Z_ERRNO ? 515 zError(gzerrnum) : strerror(errno)); 516 break; 517 } 518 off += (size_t)ssz; 519 } 520 521 if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK) 522 mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s", 523 gzerrnum == Z_ERRNO ? strerror(errno) : 524 zError(gzerrnum)); 525 if (retval == -1) { 526 free(fb->buf); 527 fb->buf = NULL; 528 } 529 return retval; 530 } 531 532 static void 533 mparse_end(struct mparse *curp) 534 { 535 if (curp->man->meta.macroset == MACROSET_NONE) 536 curp->man->meta.macroset = MACROSET_MAN; 537 if (curp->man->meta.macroset == MACROSET_MDOC) 538 mdoc_endparse(curp->man); 539 else 540 man_endparse(curp->man); 541 roff_endparse(curp->roff); 542 } 543 544 /* 545 * Read the whole file into memory and call the parsers. 546 * Called recursively when an .so request is encountered. 547 */ 548 void 549 mparse_readfd(struct mparse *curp, int fd, const char *filename) 550 { 551 static int recursion_depth; 552 553 struct buf blk; 554 struct buf *save_primary; 555 const char *save_filename, *cp; 556 size_t offset; 557 int save_filenc, save_lineno; 558 int with_mmap; 559 560 if (recursion_depth > 64) { 561 mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL); 562 return; 563 } else if (recursion_depth == 0 && 564 (cp = strrchr(filename, '.')) != NULL && 565 cp[1] >= '1' && cp[1] <= '9') 566 curp->man->filesec = cp[1]; 567 else 568 curp->man->filesec = '\0'; 569 570 if (read_whole_file(curp, fd, &blk, &with_mmap) == -1) 571 return; 572 573 /* 574 * Save some properties of the parent file. 575 */ 576 577 save_primary = curp->primary; 578 save_filenc = curp->filenc; 579 save_lineno = curp->line; 580 save_filename = mandoc_msg_getinfilename(); 581 582 curp->primary = &blk; 583 curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1); 584 curp->line = 1; 585 mandoc_msg_setinfilename(filename); 586 587 /* Skip an UTF-8 byte order mark. */ 588 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && 589 (unsigned char)blk.buf[0] == 0xef && 590 (unsigned char)blk.buf[1] == 0xbb && 591 (unsigned char)blk.buf[2] == 0xbf) { 592 offset = 3; 593 curp->filenc &= ~MPARSE_LATIN1; 594 } else 595 offset = 0; 596 597 recursion_depth++; 598 mparse_buf_r(curp, blk, offset, 1); 599 if (--recursion_depth == 0) 600 mparse_end(curp); 601 602 /* 603 * Clean up and restore saved parent properties. 604 */ 605 606 if (with_mmap) 607 munmap(blk.buf, blk.sz); 608 else 609 free(blk.buf); 610 611 curp->primary = save_primary; 612 curp->filenc = save_filenc; 613 curp->line = save_lineno; 614 if (save_filename != NULL) 615 mandoc_msg_setinfilename(save_filename); 616 } 617 618 int 619 mparse_open(struct mparse *curp, const char *file) 620 { 621 char *cp; 622 int fd, save_errno; 623 624 cp = strrchr(file, '.'); 625 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz")); 626 627 /* First try to use the filename as it is. */ 628 629 if ((fd = open(file, O_RDONLY)) != -1) 630 return fd; 631 632 /* 633 * If that doesn't work and the filename doesn't 634 * already end in .gz, try appending .gz. 635 */ 636 637 if ( ! curp->gzip) { 638 save_errno = errno; 639 mandoc_asprintf(&cp, "%s.gz", file); 640 fd = open(cp, O_RDONLY); 641 free(cp); 642 errno = save_errno; 643 if (fd != -1) { 644 curp->gzip = 1; 645 return fd; 646 } 647 } 648 649 /* Neither worked, give up. */ 650 651 return -1; 652 } 653 654 struct mparse * 655 mparse_alloc(int options, enum mandoc_os os_e, const char *os_s) 656 { 657 struct mparse *curp; 658 659 curp = mandoc_calloc(1, sizeof(struct mparse)); 660 661 curp->options = options; 662 curp->os_s = os_s; 663 664 curp->roff = roff_alloc(options); 665 curp->man = roff_man_alloc(curp->roff, curp->os_s, 666 curp->options & MPARSE_QUICK ? 1 : 0); 667 if (curp->options & MPARSE_MDOC) { 668 curp->man->meta.macroset = MACROSET_MDOC; 669 if (curp->man->mdocmac == NULL) 670 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 671 } else if (curp->options & MPARSE_MAN) { 672 curp->man->meta.macroset = MACROSET_MAN; 673 if (curp->man->manmac == NULL) 674 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 675 } 676 curp->man->meta.first->tok = TOKEN_NONE; 677 curp->man->meta.os_e = os_e; 678 tag_alloc(); 679 return curp; 680 } 681 682 void 683 mparse_reset(struct mparse *curp) 684 { 685 tag_free(); 686 roff_reset(curp->roff); 687 roff_man_reset(curp->man); 688 free_buf_list(curp->secondary); 689 curp->secondary = NULL; 690 curp->gzip = 0; 691 tag_alloc(); 692 } 693 694 void 695 mparse_free(struct mparse *curp) 696 { 697 tag_free(); 698 roffhash_free(curp->man->mdocmac); 699 roffhash_free(curp->man->manmac); 700 roff_man_free(curp->man); 701 roff_free(curp->roff); 702 free_buf_list(curp->secondary); 703 free(curp); 704 } 705 706 struct roff_meta * 707 mparse_result(struct mparse *curp) 708 { 709 roff_state_reset(curp->man); 710 if (curp->options & MPARSE_VALIDATE) { 711 if (curp->man->meta.macroset == MACROSET_MDOC) 712 mdoc_validate(curp->man); 713 else 714 man_validate(curp->man); 715 tag_postprocess(curp->man, curp->man->meta.first); 716 } 717 return &curp->man->meta; 718 } 719 720 void 721 mparse_copy(const struct mparse *p) 722 { 723 struct buf *buf; 724 725 for (buf = p->secondary; buf != NULL; buf = buf->next) 726 puts(buf->buf); 727 } 728