1 /* $OpenBSD: read.c,v 1.192 2022/05/19 14:47:47 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2010-2020 Ingo Schwarze <schwarze@openbsd.org> 4 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 * 19 * Top-level functions of the mandoc(3) parser: 20 * Parser and input encoding selection, decompression, 21 * handling of input bytes, characters, lines, and files, 22 * handling of roff(7) loops and file inclusion, 23 * and steering of the various parsers. 24 */ 25 #include <sys/types.h> 26 #include <sys/mman.h> 27 #include <sys/stat.h> 28 29 #include <assert.h> 30 #include <ctype.h> 31 #include <errno.h> 32 #include <fcntl.h> 33 #include <stdarg.h> 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include <string.h> 37 #include <unistd.h> 38 #include <zlib.h> 39 40 #include "mandoc_aux.h" 41 #include "mandoc.h" 42 #include "roff.h" 43 #include "mdoc.h" 44 #include "man.h" 45 #include "mandoc_parse.h" 46 #include "libmandoc.h" 47 #include "roff_int.h" 48 #include "tag.h" 49 50 #define REPARSE_LIMIT 1000 51 52 struct mparse { 53 struct roff *roff; /* roff parser (!NULL) */ 54 struct roff_man *man; /* man parser */ 55 struct buf *primary; /* buffer currently being parsed */ 56 struct buf *secondary; /* copy of top level input */ 57 struct buf *loop; /* open .while request line */ 58 const char *os_s; /* default operating system */ 59 int options; /* parser options */ 60 int gzip; /* current input file is gzipped */ 61 int filenc; /* encoding of the current file */ 62 int reparse_count; /* finite interp. stack */ 63 int line; /* line number in the file */ 64 }; 65 66 static void choose_parser(struct mparse *); 67 static void free_buf_list(struct buf *); 68 static void resize_buf(struct buf *, size_t); 69 static int mparse_buf_r(struct mparse *, struct buf, size_t, int); 70 static int read_whole_file(struct mparse *, int, struct buf *, int *); 71 static void mparse_end(struct mparse *); 72 73 74 static void 75 resize_buf(struct buf *buf, size_t initial) 76 { 77 78 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 79 buf->buf = mandoc_realloc(buf->buf, buf->sz); 80 } 81 82 static void 83 free_buf_list(struct buf *buf) 84 { 85 struct buf *tmp; 86 87 while (buf != NULL) { 88 tmp = buf; 89 buf = tmp->next; 90 free(tmp->buf); 91 free(tmp); 92 } 93 } 94 95 static void 96 choose_parser(struct mparse *curp) 97 { 98 char *cp, *ep; 99 int format; 100 101 /* 102 * If neither command line arguments -mdoc or -man select 103 * a parser nor the roff parser found a .Dd or .TH macro 104 * yet, look ahead in the main input buffer. 105 */ 106 107 if ((format = roff_getformat(curp->roff)) == 0) { 108 cp = curp->primary->buf; 109 ep = cp + curp->primary->sz; 110 while (cp < ep) { 111 if (*cp == '.' || *cp == '\'') { 112 cp++; 113 if (cp[0] == 'D' && cp[1] == 'd') { 114 format = MPARSE_MDOC; 115 break; 116 } 117 if (cp[0] == 'T' && cp[1] == 'H') { 118 format = MPARSE_MAN; 119 break; 120 } 121 } 122 cp = memchr(cp, '\n', ep - cp); 123 if (cp == NULL) 124 break; 125 cp++; 126 } 127 } 128 129 if (format == MPARSE_MDOC) { 130 curp->man->meta.macroset = MACROSET_MDOC; 131 if (curp->man->mdocmac == NULL) 132 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 133 } else { 134 curp->man->meta.macroset = MACROSET_MAN; 135 if (curp->man->manmac == NULL) 136 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 137 } 138 curp->man->meta.first->tok = TOKEN_NONE; 139 } 140 141 /* 142 * Main parse routine for a buffer. 143 * It assumes encoding and line numbering are already set up. 144 * It can recurse directly (for invocations of user-defined 145 * macros, inline equations, and input line traps) 146 * and indirectly (for .so file inclusion). 147 */ 148 static int 149 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start) 150 { 151 struct buf ln; 152 struct buf *firstln, *lastln, *thisln, *loop; 153 char *cp; 154 size_t pos; /* byte number in the ln buffer */ 155 size_t spos; /* at the start of the current line parse */ 156 int line_result, result; 157 int of; 158 int lnn; /* line number in the real file */ 159 int fd; 160 int inloop; /* Saw .while on this level. */ 161 unsigned char c; 162 163 ln.sz = 256; 164 ln.buf = mandoc_malloc(ln.sz); 165 ln.next = NULL; 166 firstln = lastln = loop = NULL; 167 lnn = curp->line; 168 pos = 0; 169 inloop = 0; 170 result = ROFF_CONT; 171 172 while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) { 173 if (start) { 174 curp->line = lnn; 175 curp->reparse_count = 0; 176 177 if (lnn < 3 && 178 curp->filenc & MPARSE_UTF8 && 179 curp->filenc & MPARSE_LATIN1) 180 curp->filenc = preconv_cue(&blk, i); 181 } 182 spos = pos; 183 184 while (i < blk.sz && (start || blk.buf[i] != '\0')) { 185 186 /* 187 * When finding an unescaped newline character, 188 * leave the character loop to process the line. 189 * Skip a preceding carriage return, if any. 190 */ 191 192 if ('\r' == blk.buf[i] && i + 1 < blk.sz && 193 '\n' == blk.buf[i + 1]) 194 ++i; 195 if ('\n' == blk.buf[i]) { 196 ++i; 197 ++lnn; 198 break; 199 } 200 201 /* 202 * Make sure we have space for the worst 203 * case of 12 bytes: "\\[u10ffff]\n\0" 204 */ 205 206 if (pos + 12 > ln.sz) 207 resize_buf(&ln, 256); 208 209 /* 210 * Encode 8-bit input. 211 */ 212 213 c = blk.buf[i]; 214 if (c & 0x80) { 215 if ( ! (curp->filenc && preconv_encode( 216 &blk, &i, &ln, &pos, &curp->filenc))) { 217 mandoc_msg(MANDOCERR_CHAR_BAD, 218 curp->line, pos, "0x%x", c); 219 ln.buf[pos++] = '?'; 220 i++; 221 } 222 continue; 223 } 224 225 /* 226 * Exclude control characters. 227 */ 228 229 if (c == 0x7f || (c < 0x20 && c != 0x09)) { 230 mandoc_msg(c == 0x00 || c == 0x04 || 231 c > 0x0a ? MANDOCERR_CHAR_BAD : 232 MANDOCERR_CHAR_UNSUPP, 233 curp->line, pos, "0x%x", c); 234 i++; 235 if (c != '\r') 236 ln.buf[pos++] = '?'; 237 continue; 238 } 239 240 ln.buf[pos++] = blk.buf[i++]; 241 } 242 ln.buf[pos] = '\0'; 243 244 /* 245 * Maintain a lookaside buffer of all lines. 246 * parsed from this input source. 247 */ 248 249 thisln = mandoc_malloc(sizeof(*thisln)); 250 thisln->buf = mandoc_strdup(ln.buf); 251 thisln->sz = strlen(ln.buf) + 1; 252 thisln->next = NULL; 253 if (firstln == NULL) { 254 firstln = lastln = thisln; 255 if (curp->secondary == NULL) 256 curp->secondary = firstln; 257 } else { 258 lastln->next = thisln; 259 lastln = thisln; 260 } 261 262 /* 263 * XXX Ugly hack to mark the end of the input, 264 * such that the function roff_parse_comment() 265 * doesn't attempt to append another line if the 266 * last input line ends with an escape character. 267 */ 268 269 if (i == blk.sz || blk.buf[i] == '\0') { 270 if (pos + 2 > ln.sz) 271 resize_buf(&ln, 256); 272 ln.buf[pos++] = '\n'; 273 ln.buf[pos] = '\0'; 274 } 275 276 /* 277 * A significant amount of complexity is contained by 278 * the roff preprocessor. It's line-oriented but can be 279 * expressed on one line, so we need at times to 280 * readjust our starting point and re-run it. The roff 281 * preprocessor can also readjust the buffers with new 282 * data, so we pass them in wholesale. 283 */ 284 285 of = 0; 286 rerun: 287 line_result = roff_parseln(curp->roff, curp->line, 288 &ln, &of, start && spos == 0 ? pos : 0); 289 290 /* Process options. */ 291 292 if (line_result & ROFF_APPEND) 293 assert(line_result == (ROFF_IGN | ROFF_APPEND)); 294 295 if (line_result & ROFF_USERCALL) 296 assert((line_result & ROFF_MASK) == ROFF_REPARSE); 297 298 if (line_result & ROFF_USERRET) { 299 assert(line_result == (ROFF_IGN | ROFF_USERRET)); 300 if (start == 0) { 301 /* Return from the current macro. */ 302 result = ROFF_USERRET; 303 goto out; 304 } 305 } 306 307 switch (line_result & ROFF_LOOPMASK) { 308 case ROFF_IGN: 309 break; 310 case ROFF_WHILE: 311 if (curp->loop != NULL) { 312 if (loop == curp->loop) 313 break; 314 mandoc_msg(MANDOCERR_WHILE_NEST, 315 curp->line, pos, NULL); 316 } 317 curp->loop = thisln; 318 loop = NULL; 319 inloop = 1; 320 break; 321 case ROFF_LOOPCONT: 322 case ROFF_LOOPEXIT: 323 if (curp->loop == NULL) { 324 mandoc_msg(MANDOCERR_WHILE_FAIL, 325 curp->line, pos, NULL); 326 break; 327 } 328 if (inloop == 0) { 329 mandoc_msg(MANDOCERR_WHILE_INTO, 330 curp->line, pos, NULL); 331 curp->loop = loop = NULL; 332 break; 333 } 334 if (line_result & ROFF_LOOPCONT) 335 loop = curp->loop; 336 else { 337 curp->loop = loop = NULL; 338 inloop = 0; 339 } 340 break; 341 default: 342 abort(); 343 } 344 345 /* Process the main instruction from the roff parser. */ 346 347 switch (line_result & ROFF_MASK) { 348 case ROFF_IGN: 349 break; 350 case ROFF_CONT: 351 if (curp->man->meta.macroset == MACROSET_NONE) 352 choose_parser(curp); 353 if ((curp->man->meta.macroset == MACROSET_MDOC ? 354 mdoc_parseln(curp->man, curp->line, ln.buf, of) : 355 man_parseln(curp->man, curp->line, ln.buf, of) 356 ) == 2) 357 goto out; 358 break; 359 case ROFF_RERUN: 360 goto rerun; 361 case ROFF_REPARSE: 362 if (++curp->reparse_count > REPARSE_LIMIT) { 363 /* Abort and return to the top level. */ 364 result = ROFF_IGN; 365 mandoc_msg(MANDOCERR_ROFFLOOP, 366 curp->line, pos, NULL); 367 goto out; 368 } 369 result = mparse_buf_r(curp, ln, of, 0); 370 if (line_result & ROFF_USERCALL) { 371 roff_userret(curp->roff); 372 /* Continue normally. */ 373 if (result & ROFF_USERRET) 374 result = ROFF_CONT; 375 } 376 if (start == 0 && result != ROFF_CONT) 377 goto out; 378 break; 379 case ROFF_SO: 380 if ( ! (curp->options & MPARSE_SO) && 381 (i >= blk.sz || blk.buf[i] == '\0')) { 382 curp->man->meta.sodest = 383 mandoc_strdup(ln.buf + of); 384 goto out; 385 } 386 if ((fd = mparse_open(curp, ln.buf + of)) != -1) { 387 mparse_readfd(curp, fd, ln.buf + of); 388 close(fd); 389 } else { 390 mandoc_msg(MANDOCERR_SO_FAIL, 391 curp->line, of, ".so %s: %s", 392 ln.buf + of, strerror(errno)); 393 ln.sz = mandoc_asprintf(&cp, 394 ".sp\nSee the file %s.\n.sp", 395 ln.buf + of); 396 free(ln.buf); 397 ln.buf = cp; 398 of = 0; 399 mparse_buf_r(curp, ln, of, 0); 400 } 401 break; 402 default: 403 abort(); 404 } 405 406 /* Start the next input line. */ 407 408 if (loop != NULL && 409 (line_result & ROFF_LOOPMASK) == ROFF_IGN) 410 loop = loop->next; 411 412 if (loop != NULL) { 413 if ((line_result & ROFF_APPEND) == 0) 414 *ln.buf = '\0'; 415 if (ln.sz < loop->sz) 416 resize_buf(&ln, loop->sz); 417 (void)strlcat(ln.buf, loop->buf, ln.sz); 418 of = 0; 419 goto rerun; 420 } 421 422 pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0; 423 } 424 out: 425 if (inloop) { 426 if (result != ROFF_USERRET) 427 mandoc_msg(MANDOCERR_WHILE_OUTOF, 428 curp->line, pos, NULL); 429 curp->loop = NULL; 430 } 431 free(ln.buf); 432 if (firstln != curp->secondary) 433 free_buf_list(firstln); 434 return result; 435 } 436 437 static int 438 read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap) 439 { 440 struct stat st; 441 gzFile gz; 442 size_t off; 443 ssize_t ssz; 444 int gzerrnum, retval; 445 446 if (fstat(fd, &st) == -1) { 447 mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno)); 448 return -1; 449 } 450 451 /* 452 * If we're a regular file, try just reading in the whole entry 453 * via mmap(). This is faster than reading it into blocks, and 454 * since each file is only a few bytes to begin with, I'm not 455 * concerned that this is going to tank any machines. 456 */ 457 458 if (curp->gzip == 0 && S_ISREG(st.st_mode)) { 459 if (st.st_size > 0x7fffffff) { 460 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 461 return -1; 462 } 463 *with_mmap = 1; 464 fb->sz = (size_t)st.st_size; 465 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 466 if (fb->buf != MAP_FAILED) 467 return 0; 468 } 469 470 if (curp->gzip) { 471 /* 472 * Duplicating the file descriptor is required 473 * because we will have to call gzclose(3) 474 * to free memory used internally by zlib, 475 * but that will also close the file descriptor, 476 * which this function must not do. 477 */ 478 if ((fd = dup(fd)) == -1) { 479 mandoc_msg(MANDOCERR_DUP, 0, 0, 480 "%s", strerror(errno)); 481 return -1; 482 } 483 if ((gz = gzdopen(fd, "rb")) == NULL) { 484 mandoc_msg(MANDOCERR_GZDOPEN, 0, 0, 485 "%s", strerror(errno)); 486 close(fd); 487 return -1; 488 } 489 } else 490 gz = NULL; 491 492 /* 493 * If this isn't a regular file (like, say, stdin), then we must 494 * go the old way and just read things in bit by bit. 495 */ 496 497 *with_mmap = 0; 498 off = 0; 499 retval = -1; 500 fb->sz = 0; 501 fb->buf = NULL; 502 for (;;) { 503 if (off == fb->sz) { 504 if (fb->sz == (1U << 31)) { 505 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 506 break; 507 } 508 resize_buf(fb, 65536); 509 } 510 ssz = curp->gzip ? 511 gzread(gz, fb->buf + (int)off, fb->sz - off) : 512 read(fd, fb->buf + (int)off, fb->sz - off); 513 if (ssz == 0) { 514 fb->sz = off; 515 retval = 0; 516 break; 517 } 518 if (ssz == -1) { 519 if (curp->gzip) 520 (void)gzerror(gz, &gzerrnum); 521 mandoc_msg(MANDOCERR_READ, 0, 0, "%s", 522 curp->gzip && gzerrnum != Z_ERRNO ? 523 zError(gzerrnum) : strerror(errno)); 524 break; 525 } 526 off += (size_t)ssz; 527 } 528 529 if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK) 530 mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s", 531 gzerrnum == Z_ERRNO ? strerror(errno) : 532 zError(gzerrnum)); 533 if (retval == -1) { 534 free(fb->buf); 535 fb->buf = NULL; 536 } 537 return retval; 538 } 539 540 static void 541 mparse_end(struct mparse *curp) 542 { 543 if (curp->man->meta.macroset == MACROSET_NONE) 544 curp->man->meta.macroset = MACROSET_MAN; 545 if (curp->man->meta.macroset == MACROSET_MDOC) 546 mdoc_endparse(curp->man); 547 else 548 man_endparse(curp->man); 549 roff_endparse(curp->roff); 550 } 551 552 /* 553 * Read the whole file into memory and call the parsers. 554 * Called recursively when an .so request is encountered. 555 */ 556 void 557 mparse_readfd(struct mparse *curp, int fd, const char *filename) 558 { 559 static int recursion_depth; 560 561 struct buf blk; 562 struct buf *save_primary; 563 const char *save_filename, *cp; 564 size_t offset; 565 int save_filenc, save_lineno; 566 int with_mmap; 567 568 if (recursion_depth > 64) { 569 mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL); 570 return; 571 } else if (recursion_depth == 0 && 572 (cp = strrchr(filename, '.')) != NULL && 573 cp[1] >= '1' && cp[1] <= '9') 574 curp->man->filesec = cp[1]; 575 else 576 curp->man->filesec = '\0'; 577 578 if (read_whole_file(curp, fd, &blk, &with_mmap) == -1) 579 return; 580 581 /* 582 * Save some properties of the parent file. 583 */ 584 585 save_primary = curp->primary; 586 save_filenc = curp->filenc; 587 save_lineno = curp->line; 588 save_filename = mandoc_msg_getinfilename(); 589 590 curp->primary = &blk; 591 curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1); 592 curp->line = 1; 593 mandoc_msg_setinfilename(filename); 594 595 /* Skip an UTF-8 byte order mark. */ 596 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && 597 (unsigned char)blk.buf[0] == 0xef && 598 (unsigned char)blk.buf[1] == 0xbb && 599 (unsigned char)blk.buf[2] == 0xbf) { 600 offset = 3; 601 curp->filenc &= ~MPARSE_LATIN1; 602 } else 603 offset = 0; 604 605 recursion_depth++; 606 mparse_buf_r(curp, blk, offset, 1); 607 if (--recursion_depth == 0) 608 mparse_end(curp); 609 610 /* 611 * Clean up and restore saved parent properties. 612 */ 613 614 if (with_mmap) 615 munmap(blk.buf, blk.sz); 616 else 617 free(blk.buf); 618 619 curp->primary = save_primary; 620 curp->filenc = save_filenc; 621 curp->line = save_lineno; 622 if (save_filename != NULL) 623 mandoc_msg_setinfilename(save_filename); 624 } 625 626 int 627 mparse_open(struct mparse *curp, const char *file) 628 { 629 char *cp; 630 int fd, save_errno; 631 632 cp = strrchr(file, '.'); 633 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz")); 634 635 /* First try to use the filename as it is. */ 636 637 if ((fd = open(file, O_RDONLY)) != -1) 638 return fd; 639 640 /* 641 * If that doesn't work and the filename doesn't 642 * already end in .gz, try appending .gz. 643 */ 644 645 if ( ! curp->gzip) { 646 save_errno = errno; 647 mandoc_asprintf(&cp, "%s.gz", file); 648 fd = open(cp, O_RDONLY); 649 free(cp); 650 errno = save_errno; 651 if (fd != -1) { 652 curp->gzip = 1; 653 return fd; 654 } 655 } 656 657 /* Neither worked, give up. */ 658 659 return -1; 660 } 661 662 struct mparse * 663 mparse_alloc(int options, enum mandoc_os os_e, const char *os_s) 664 { 665 struct mparse *curp; 666 667 curp = mandoc_calloc(1, sizeof(struct mparse)); 668 669 curp->options = options; 670 curp->os_s = os_s; 671 672 curp->roff = roff_alloc(options); 673 curp->man = roff_man_alloc(curp->roff, curp->os_s, 674 curp->options & MPARSE_QUICK ? 1 : 0); 675 if (curp->options & MPARSE_MDOC) { 676 curp->man->meta.macroset = MACROSET_MDOC; 677 if (curp->man->mdocmac == NULL) 678 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 679 } else if (curp->options & MPARSE_MAN) { 680 curp->man->meta.macroset = MACROSET_MAN; 681 if (curp->man->manmac == NULL) 682 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 683 } 684 curp->man->meta.first->tok = TOKEN_NONE; 685 curp->man->meta.os_e = os_e; 686 tag_alloc(); 687 return curp; 688 } 689 690 void 691 mparse_reset(struct mparse *curp) 692 { 693 tag_free(); 694 roff_reset(curp->roff); 695 roff_man_reset(curp->man); 696 free_buf_list(curp->secondary); 697 curp->secondary = NULL; 698 curp->gzip = 0; 699 tag_alloc(); 700 } 701 702 void 703 mparse_free(struct mparse *curp) 704 { 705 tag_free(); 706 roffhash_free(curp->man->mdocmac); 707 roffhash_free(curp->man->manmac); 708 roff_man_free(curp->man); 709 roff_free(curp->roff); 710 free_buf_list(curp->secondary); 711 free(curp); 712 } 713 714 struct roff_meta * 715 mparse_result(struct mparse *curp) 716 { 717 roff_state_reset(curp->man); 718 if (curp->options & MPARSE_VALIDATE) { 719 if (curp->man->meta.macroset == MACROSET_MDOC) 720 mdoc_validate(curp->man); 721 else 722 man_validate(curp->man); 723 tag_postprocess(curp->man, curp->man->meta.first); 724 } 725 return &curp->man->meta; 726 } 727 728 void 729 mparse_copy(const struct mparse *p) 730 { 731 struct buf *buf; 732 733 for (buf = p->secondary; buf != NULL; buf = buf->next) 734 puts(buf->buf); 735 } 736