1 /* $OpenBSD: read.c,v 1.182 2019/01/11 17:03:43 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2010-2019 Ingo Schwarze <schwarze@openbsd.org> 5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19 #include <sys/types.h> 20 #include <sys/mman.h> 21 #include <sys/stat.h> 22 23 #include <assert.h> 24 #include <ctype.h> 25 #include <errno.h> 26 #include <fcntl.h> 27 #include <stdarg.h> 28 #include <stdio.h> 29 #include <stdlib.h> 30 #include <string.h> 31 #include <unistd.h> 32 #include <zlib.h> 33 34 #include "mandoc_aux.h" 35 #include "mandoc.h" 36 #include "roff.h" 37 #include "mdoc.h" 38 #include "man.h" 39 #include "mandoc_parse.h" 40 #include "libmandoc.h" 41 #include "roff_int.h" 42 43 #define REPARSE_LIMIT 1000 44 45 struct mparse { 46 struct roff *roff; /* roff parser (!NULL) */ 47 struct roff_man *man; /* man parser */ 48 struct buf *primary; /* buffer currently being parsed */ 49 struct buf *secondary; /* copy of top level input */ 50 struct buf *loop; /* open .while request line */ 51 const char *os_s; /* default operating system */ 52 int options; /* parser options */ 53 int gzip; /* current input file is gzipped */ 54 int filenc; /* encoding of the current file */ 55 int reparse_count; /* finite interp. stack */ 56 int line; /* line number in the file */ 57 }; 58 59 static void choose_parser(struct mparse *); 60 static void free_buf_list(struct buf *); 61 static void resize_buf(struct buf *, size_t); 62 static int mparse_buf_r(struct mparse *, struct buf, size_t, int); 63 static int read_whole_file(struct mparse *, int, struct buf *, int *); 64 static void mparse_end(struct mparse *); 65 66 67 static void 68 resize_buf(struct buf *buf, size_t initial) 69 { 70 71 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 72 buf->buf = mandoc_realloc(buf->buf, buf->sz); 73 } 74 75 static void 76 free_buf_list(struct buf *buf) 77 { 78 struct buf *tmp; 79 80 while (buf != NULL) { 81 tmp = buf; 82 buf = tmp->next; 83 free(tmp->buf); 84 free(tmp); 85 } 86 } 87 88 static void 89 choose_parser(struct mparse *curp) 90 { 91 char *cp, *ep; 92 int format; 93 94 /* 95 * If neither command line arguments -mdoc or -man select 96 * a parser nor the roff parser found a .Dd or .TH macro 97 * yet, look ahead in the main input buffer. 98 */ 99 100 if ((format = roff_getformat(curp->roff)) == 0) { 101 cp = curp->primary->buf; 102 ep = cp + curp->primary->sz; 103 while (cp < ep) { 104 if (*cp == '.' || *cp == '\'') { 105 cp++; 106 if (cp[0] == 'D' && cp[1] == 'd') { 107 format = MPARSE_MDOC; 108 break; 109 } 110 if (cp[0] == 'T' && cp[1] == 'H') { 111 format = MPARSE_MAN; 112 break; 113 } 114 } 115 cp = memchr(cp, '\n', ep - cp); 116 if (cp == NULL) 117 break; 118 cp++; 119 } 120 } 121 122 if (format == MPARSE_MDOC) { 123 curp->man->meta.macroset = MACROSET_MDOC; 124 if (curp->man->mdocmac == NULL) 125 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 126 } else { 127 curp->man->meta.macroset = MACROSET_MAN; 128 if (curp->man->manmac == NULL) 129 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 130 } 131 curp->man->meta.first->tok = TOKEN_NONE; 132 } 133 134 /* 135 * Main parse routine for a buffer. 136 * It assumes encoding and line numbering are already set up. 137 * It can recurse directly (for invocations of user-defined 138 * macros, inline equations, and input line traps) 139 * and indirectly (for .so file inclusion). 140 */ 141 static int 142 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start) 143 { 144 struct buf ln; 145 struct buf *firstln, *lastln, *thisln, *loop; 146 char *cp; 147 size_t pos; /* byte number in the ln buffer */ 148 int line_result, result; 149 int of; 150 int lnn; /* line number in the real file */ 151 int fd; 152 int inloop; /* Saw .while on this level. */ 153 unsigned char c; 154 155 ln.sz = 256; 156 ln.buf = mandoc_malloc(ln.sz); 157 ln.next = NULL; 158 firstln = loop = NULL; 159 lnn = curp->line; 160 pos = 0; 161 inloop = 0; 162 result = ROFF_CONT; 163 164 while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) { 165 if (start) { 166 curp->line = lnn; 167 curp->reparse_count = 0; 168 169 if (lnn < 3 && 170 curp->filenc & MPARSE_UTF8 && 171 curp->filenc & MPARSE_LATIN1) 172 curp->filenc = preconv_cue(&blk, i); 173 } 174 175 while (i < blk.sz && (start || blk.buf[i] != '\0')) { 176 177 /* 178 * When finding an unescaped newline character, 179 * leave the character loop to process the line. 180 * Skip a preceding carriage return, if any. 181 */ 182 183 if ('\r' == blk.buf[i] && i + 1 < blk.sz && 184 '\n' == blk.buf[i + 1]) 185 ++i; 186 if ('\n' == blk.buf[i]) { 187 ++i; 188 ++lnn; 189 break; 190 } 191 192 /* 193 * Make sure we have space for the worst 194 * case of 12 bytes: "\\[u10ffff]\n\0" 195 */ 196 197 if (pos + 12 > ln.sz) 198 resize_buf(&ln, 256); 199 200 /* 201 * Encode 8-bit input. 202 */ 203 204 c = blk.buf[i]; 205 if (c & 0x80) { 206 if ( ! (curp->filenc && preconv_encode( 207 &blk, &i, &ln, &pos, &curp->filenc))) { 208 mandoc_msg(MANDOCERR_CHAR_BAD, 209 curp->line, pos, "0x%x", c); 210 ln.buf[pos++] = '?'; 211 i++; 212 } 213 continue; 214 } 215 216 /* 217 * Exclude control characters. 218 */ 219 220 if (c == 0x7f || (c < 0x20 && c != 0x09)) { 221 mandoc_msg(c == 0x00 || c == 0x04 || 222 c > 0x0a ? MANDOCERR_CHAR_BAD : 223 MANDOCERR_CHAR_UNSUPP, 224 curp->line, pos, "0x%x", c); 225 i++; 226 if (c != '\r') 227 ln.buf[pos++] = '?'; 228 continue; 229 } 230 231 ln.buf[pos++] = blk.buf[i++]; 232 } 233 ln.buf[pos] = '\0'; 234 235 /* 236 * Maintain a lookaside buffer of all lines. 237 * parsed from this input source. 238 */ 239 240 thisln = mandoc_malloc(sizeof(*thisln)); 241 thisln->buf = mandoc_strdup(ln.buf); 242 thisln->sz = strlen(ln.buf) + 1; 243 thisln->next = NULL; 244 if (firstln == NULL) { 245 firstln = lastln = thisln; 246 if (curp->secondary == NULL) 247 curp->secondary = firstln; 248 } else { 249 lastln->next = thisln; 250 lastln = thisln; 251 } 252 253 /* XXX Ugly hack to mark the end of the input. */ 254 255 if (i == blk.sz || blk.buf[i] == '\0') { 256 ln.buf[pos++] = '\n'; 257 ln.buf[pos] = '\0'; 258 } 259 260 /* 261 * A significant amount of complexity is contained by 262 * the roff preprocessor. It's line-oriented but can be 263 * expressed on one line, so we need at times to 264 * readjust our starting point and re-run it. The roff 265 * preprocessor can also readjust the buffers with new 266 * data, so we pass them in wholesale. 267 */ 268 269 of = 0; 270 rerun: 271 line_result = roff_parseln(curp->roff, curp->line, &ln, &of); 272 273 /* Process options. */ 274 275 if (line_result & ROFF_APPEND) 276 assert(line_result == (ROFF_IGN | ROFF_APPEND)); 277 278 if (line_result & ROFF_USERCALL) 279 assert((line_result & ROFF_MASK) == ROFF_REPARSE); 280 281 if (line_result & ROFF_USERRET) { 282 assert(line_result == (ROFF_IGN | ROFF_USERRET)); 283 if (start == 0) { 284 /* Return from the current macro. */ 285 result = ROFF_USERRET; 286 goto out; 287 } 288 } 289 290 switch (line_result & ROFF_LOOPMASK) { 291 case ROFF_IGN: 292 break; 293 case ROFF_WHILE: 294 if (curp->loop != NULL) { 295 if (loop == curp->loop) 296 break; 297 mandoc_msg(MANDOCERR_WHILE_NEST, 298 curp->line, pos, NULL); 299 } 300 curp->loop = thisln; 301 loop = NULL; 302 inloop = 1; 303 break; 304 case ROFF_LOOPCONT: 305 case ROFF_LOOPEXIT: 306 if (curp->loop == NULL) { 307 mandoc_msg(MANDOCERR_WHILE_FAIL, 308 curp->line, pos, NULL); 309 break; 310 } 311 if (inloop == 0) { 312 mandoc_msg(MANDOCERR_WHILE_INTO, 313 curp->line, pos, NULL); 314 curp->loop = loop = NULL; 315 break; 316 } 317 if (line_result & ROFF_LOOPCONT) 318 loop = curp->loop; 319 else { 320 curp->loop = loop = NULL; 321 inloop = 0; 322 } 323 break; 324 default: 325 abort(); 326 } 327 328 /* Process the main instruction from the roff parser. */ 329 330 switch (line_result & ROFF_MASK) { 331 case ROFF_IGN: 332 break; 333 case ROFF_CONT: 334 if (curp->man->meta.macroset == MACROSET_NONE) 335 choose_parser(curp); 336 if ((curp->man->meta.macroset == MACROSET_MDOC ? 337 mdoc_parseln(curp->man, curp->line, ln.buf, of) : 338 man_parseln(curp->man, curp->line, ln.buf, of) 339 ) == 2) 340 goto out; 341 break; 342 case ROFF_RERUN: 343 goto rerun; 344 case ROFF_REPARSE: 345 if (++curp->reparse_count > REPARSE_LIMIT) { 346 /* Abort and return to the top level. */ 347 result = ROFF_IGN; 348 mandoc_msg(MANDOCERR_ROFFLOOP, 349 curp->line, pos, NULL); 350 goto out; 351 } 352 result = mparse_buf_r(curp, ln, of, 0); 353 if (line_result & ROFF_USERCALL) { 354 roff_userret(curp->roff); 355 /* Continue normally. */ 356 if (result & ROFF_USERRET) 357 result = ROFF_CONT; 358 } 359 if (start == 0 && result != ROFF_CONT) 360 goto out; 361 break; 362 case ROFF_SO: 363 if ( ! (curp->options & MPARSE_SO) && 364 (i >= blk.sz || blk.buf[i] == '\0')) { 365 curp->man->meta.sodest = 366 mandoc_strdup(ln.buf + of); 367 goto out; 368 } 369 if ((fd = mparse_open(curp, ln.buf + of)) != -1) { 370 mparse_readfd(curp, fd, ln.buf + of); 371 close(fd); 372 } else { 373 mandoc_msg(MANDOCERR_SO_FAIL, 374 curp->line, of, ".so %s: %s", 375 ln.buf + of, strerror(errno)); 376 ln.sz = mandoc_asprintf(&cp, 377 ".sp\nSee the file %s.\n.sp", 378 ln.buf + of); 379 free(ln.buf); 380 ln.buf = cp; 381 of = 0; 382 mparse_buf_r(curp, ln, of, 0); 383 } 384 break; 385 default: 386 abort(); 387 } 388 389 /* Start the next input line. */ 390 391 if (loop != NULL && 392 (line_result & ROFF_LOOPMASK) == ROFF_IGN) 393 loop = loop->next; 394 395 if (loop != NULL) { 396 if ((line_result & ROFF_APPEND) == 0) 397 *ln.buf = '\0'; 398 if (ln.sz < loop->sz) 399 resize_buf(&ln, loop->sz); 400 (void)strlcat(ln.buf, loop->buf, ln.sz); 401 of = 0; 402 goto rerun; 403 } 404 405 pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0; 406 } 407 out: 408 if (inloop) { 409 if (result != ROFF_USERRET) 410 mandoc_msg(MANDOCERR_WHILE_OUTOF, 411 curp->line, pos, NULL); 412 curp->loop = NULL; 413 } 414 free(ln.buf); 415 if (firstln != curp->secondary) 416 free_buf_list(firstln); 417 return result; 418 } 419 420 static int 421 read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap) 422 { 423 struct stat st; 424 gzFile gz; 425 size_t off; 426 ssize_t ssz; 427 int gzerrnum, retval; 428 429 if (fstat(fd, &st) == -1) { 430 mandoc_msg(MANDOCERR_FILE, 0, 0, 431 "fstat: %s", strerror(errno)); 432 return 0; 433 } 434 435 /* 436 * If we're a regular file, try just reading in the whole entry 437 * via mmap(). This is faster than reading it into blocks, and 438 * since each file is only a few bytes to begin with, I'm not 439 * concerned that this is going to tank any machines. 440 */ 441 442 if (curp->gzip == 0 && S_ISREG(st.st_mode)) { 443 if (st.st_size > 0x7fffffff) { 444 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 445 return 0; 446 } 447 *with_mmap = 1; 448 fb->sz = (size_t)st.st_size; 449 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 450 if (fb->buf != MAP_FAILED) 451 return 1; 452 } 453 454 if (curp->gzip) { 455 /* 456 * Duplicating the file descriptor is required 457 * because we will have to call gzclose(3) 458 * to free memory used internally by zlib, 459 * but that will also close the file descriptor, 460 * which this function must not do. 461 */ 462 if ((fd = dup(fd)) == -1) { 463 mandoc_msg(MANDOCERR_FILE, 0, 0, 464 "dup: %s", strerror(errno)); 465 return 0; 466 } 467 if ((gz = gzdopen(fd, "rb")) == NULL) { 468 mandoc_msg(MANDOCERR_FILE, 0, 0, 469 "gzdopen: %s", strerror(errno)); 470 close(fd); 471 return 0; 472 } 473 } else 474 gz = NULL; 475 476 /* 477 * If this isn't a regular file (like, say, stdin), then we must 478 * go the old way and just read things in bit by bit. 479 */ 480 481 *with_mmap = 0; 482 off = 0; 483 retval = 0; 484 fb->sz = 0; 485 fb->buf = NULL; 486 for (;;) { 487 if (off == fb->sz) { 488 if (fb->sz == (1U << 31)) { 489 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 490 break; 491 } 492 resize_buf(fb, 65536); 493 } 494 ssz = curp->gzip ? 495 gzread(gz, fb->buf + (int)off, fb->sz - off) : 496 read(fd, fb->buf + (int)off, fb->sz - off); 497 if (ssz == 0) { 498 fb->sz = off; 499 retval = 1; 500 break; 501 } 502 if (ssz == -1) { 503 if (curp->gzip) 504 (void)gzerror(gz, &gzerrnum); 505 mandoc_msg(MANDOCERR_FILE, 0, 0, "read: %s", 506 curp->gzip && gzerrnum != Z_ERRNO ? 507 zError(gzerrnum) : strerror(errno)); 508 break; 509 } 510 off += (size_t)ssz; 511 } 512 513 if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK) 514 mandoc_msg(MANDOCERR_FILE, 0, 0, "gzclose: %s", 515 gzerrnum == Z_ERRNO ? strerror(errno) : 516 zError(gzerrnum)); 517 if (retval == 0) { 518 free(fb->buf); 519 fb->buf = NULL; 520 } 521 return retval; 522 } 523 524 static void 525 mparse_end(struct mparse *curp) 526 { 527 if (curp->man->meta.macroset == MACROSET_NONE) 528 curp->man->meta.macroset = MACROSET_MAN; 529 if (curp->man->meta.macroset == MACROSET_MDOC) 530 mdoc_endparse(curp->man); 531 else 532 man_endparse(curp->man); 533 roff_endparse(curp->roff); 534 } 535 536 /* 537 * Read the whole file into memory and call the parsers. 538 * Called recursively when an .so request is encountered. 539 */ 540 void 541 mparse_readfd(struct mparse *curp, int fd, const char *filename) 542 { 543 static int recursion_depth; 544 545 struct buf blk; 546 struct buf *save_primary; 547 const char *save_filename; 548 size_t offset; 549 int save_filenc, save_lineno; 550 int with_mmap; 551 552 if (recursion_depth > 64) { 553 mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL); 554 return; 555 } 556 if (read_whole_file(curp, fd, &blk, &with_mmap) == 0) 557 return; 558 559 /* 560 * Save some properties of the parent file. 561 */ 562 563 save_primary = curp->primary; 564 save_filenc = curp->filenc; 565 save_lineno = curp->line; 566 save_filename = mandoc_msg_getinfilename(); 567 568 curp->primary = &blk; 569 curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1); 570 curp->line = 1; 571 mandoc_msg_setinfilename(filename); 572 573 /* Skip an UTF-8 byte order mark. */ 574 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && 575 (unsigned char)blk.buf[0] == 0xef && 576 (unsigned char)blk.buf[1] == 0xbb && 577 (unsigned char)blk.buf[2] == 0xbf) { 578 offset = 3; 579 curp->filenc &= ~MPARSE_LATIN1; 580 } else 581 offset = 0; 582 583 recursion_depth++; 584 mparse_buf_r(curp, blk, offset, 1); 585 if (--recursion_depth == 0) 586 mparse_end(curp); 587 588 /* 589 * Clean up and restore saved parent properties. 590 */ 591 592 if (with_mmap) 593 munmap(blk.buf, blk.sz); 594 else 595 free(blk.buf); 596 597 curp->primary = save_primary; 598 curp->filenc = save_filenc; 599 curp->line = save_lineno; 600 if (save_filename != NULL) 601 mandoc_msg_setinfilename(save_filename); 602 } 603 604 int 605 mparse_open(struct mparse *curp, const char *file) 606 { 607 char *cp; 608 int fd, save_errno; 609 610 cp = strrchr(file, '.'); 611 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz")); 612 613 /* First try to use the filename as it is. */ 614 615 if ((fd = open(file, O_RDONLY)) != -1) 616 return fd; 617 618 /* 619 * If that doesn't work and the filename doesn't 620 * already end in .gz, try appending .gz. 621 */ 622 623 if ( ! curp->gzip) { 624 save_errno = errno; 625 mandoc_asprintf(&cp, "%s.gz", file); 626 fd = open(cp, O_RDONLY); 627 free(cp); 628 errno = save_errno; 629 if (fd != -1) { 630 curp->gzip = 1; 631 return fd; 632 } 633 } 634 635 /* Neither worked, give up. */ 636 637 return -1; 638 } 639 640 struct mparse * 641 mparse_alloc(int options, enum mandoc_os os_e, const char *os_s) 642 { 643 struct mparse *curp; 644 645 curp = mandoc_calloc(1, sizeof(struct mparse)); 646 647 curp->options = options; 648 curp->os_s = os_s; 649 650 curp->roff = roff_alloc(options); 651 curp->man = roff_man_alloc(curp->roff, curp->os_s, 652 curp->options & MPARSE_QUICK ? 1 : 0); 653 if (curp->options & MPARSE_MDOC) { 654 curp->man->meta.macroset = MACROSET_MDOC; 655 if (curp->man->mdocmac == NULL) 656 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 657 } else if (curp->options & MPARSE_MAN) { 658 curp->man->meta.macroset = MACROSET_MAN; 659 if (curp->man->manmac == NULL) 660 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 661 } 662 curp->man->meta.first->tok = TOKEN_NONE; 663 curp->man->meta.os_e = os_e; 664 return curp; 665 } 666 667 void 668 mparse_reset(struct mparse *curp) 669 { 670 roff_reset(curp->roff); 671 roff_man_reset(curp->man); 672 free_buf_list(curp->secondary); 673 curp->secondary = NULL; 674 curp->gzip = 0; 675 } 676 677 void 678 mparse_free(struct mparse *curp) 679 { 680 roffhash_free(curp->man->mdocmac); 681 roffhash_free(curp->man->manmac); 682 roff_man_free(curp->man); 683 roff_free(curp->roff); 684 free_buf_list(curp->secondary); 685 free(curp); 686 } 687 688 struct roff_meta * 689 mparse_result(struct mparse *curp) 690 { 691 roff_state_reset(curp->man); 692 if (curp->options & MPARSE_VALIDATE) { 693 if (curp->man->meta.macroset == MACROSET_MDOC) 694 mdoc_validate(curp->man); 695 else 696 man_validate(curp->man); 697 } 698 return &curp->man->meta; 699 } 700 701 void 702 mparse_copy(const struct mparse *p) 703 { 704 struct buf *buf; 705 706 for (buf = p->secondary; buf != NULL; buf = buf->next) 707 puts(buf->buf); 708 } 709