1 /* $OpenBSD: read.c,v 1.185 2019/07/10 19:38:56 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2010-2019 Ingo Schwarze <schwarze@openbsd.org> 5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19 #include <sys/types.h> 20 #include <sys/mman.h> 21 #include <sys/stat.h> 22 23 #include <assert.h> 24 #include <ctype.h> 25 #include <errno.h> 26 #include <fcntl.h> 27 #include <stdarg.h> 28 #include <stdio.h> 29 #include <stdlib.h> 30 #include <string.h> 31 #include <unistd.h> 32 #include <zlib.h> 33 34 #include "mandoc_aux.h" 35 #include "mandoc.h" 36 #include "roff.h" 37 #include "mdoc.h" 38 #include "man.h" 39 #include "mandoc_parse.h" 40 #include "libmandoc.h" 41 #include "roff_int.h" 42 43 #define REPARSE_LIMIT 1000 44 45 struct mparse { 46 struct roff *roff; /* roff parser (!NULL) */ 47 struct roff_man *man; /* man parser */ 48 struct buf *primary; /* buffer currently being parsed */ 49 struct buf *secondary; /* copy of top level input */ 50 struct buf *loop; /* open .while request line */ 51 const char *os_s; /* default operating system */ 52 int options; /* parser options */ 53 int gzip; /* current input file is gzipped */ 54 int filenc; /* encoding of the current file */ 55 int reparse_count; /* finite interp. stack */ 56 int line; /* line number in the file */ 57 }; 58 59 static void choose_parser(struct mparse *); 60 static void free_buf_list(struct buf *); 61 static void resize_buf(struct buf *, size_t); 62 static int mparse_buf_r(struct mparse *, struct buf, size_t, int); 63 static int read_whole_file(struct mparse *, int, struct buf *, int *); 64 static void mparse_end(struct mparse *); 65 66 67 static void 68 resize_buf(struct buf *buf, size_t initial) 69 { 70 71 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial; 72 buf->buf = mandoc_realloc(buf->buf, buf->sz); 73 } 74 75 static void 76 free_buf_list(struct buf *buf) 77 { 78 struct buf *tmp; 79 80 while (buf != NULL) { 81 tmp = buf; 82 buf = tmp->next; 83 free(tmp->buf); 84 free(tmp); 85 } 86 } 87 88 static void 89 choose_parser(struct mparse *curp) 90 { 91 char *cp, *ep; 92 int format; 93 94 /* 95 * If neither command line arguments -mdoc or -man select 96 * a parser nor the roff parser found a .Dd or .TH macro 97 * yet, look ahead in the main input buffer. 98 */ 99 100 if ((format = roff_getformat(curp->roff)) == 0) { 101 cp = curp->primary->buf; 102 ep = cp + curp->primary->sz; 103 while (cp < ep) { 104 if (*cp == '.' || *cp == '\'') { 105 cp++; 106 if (cp[0] == 'D' && cp[1] == 'd') { 107 format = MPARSE_MDOC; 108 break; 109 } 110 if (cp[0] == 'T' && cp[1] == 'H') { 111 format = MPARSE_MAN; 112 break; 113 } 114 } 115 cp = memchr(cp, '\n', ep - cp); 116 if (cp == NULL) 117 break; 118 cp++; 119 } 120 } 121 122 if (format == MPARSE_MDOC) { 123 curp->man->meta.macroset = MACROSET_MDOC; 124 if (curp->man->mdocmac == NULL) 125 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 126 } else { 127 curp->man->meta.macroset = MACROSET_MAN; 128 if (curp->man->manmac == NULL) 129 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 130 } 131 curp->man->meta.first->tok = TOKEN_NONE; 132 } 133 134 /* 135 * Main parse routine for a buffer. 136 * It assumes encoding and line numbering are already set up. 137 * It can recurse directly (for invocations of user-defined 138 * macros, inline equations, and input line traps) 139 * and indirectly (for .so file inclusion). 140 */ 141 static int 142 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start) 143 { 144 struct buf ln; 145 struct buf *firstln, *lastln, *thisln, *loop; 146 char *cp; 147 size_t pos; /* byte number in the ln buffer */ 148 int line_result, result; 149 int of; 150 int lnn; /* line number in the real file */ 151 int fd; 152 int inloop; /* Saw .while on this level. */ 153 unsigned char c; 154 155 ln.sz = 256; 156 ln.buf = mandoc_malloc(ln.sz); 157 ln.next = NULL; 158 firstln = lastln = loop = NULL; 159 lnn = curp->line; 160 pos = 0; 161 inloop = 0; 162 result = ROFF_CONT; 163 164 while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) { 165 if (start) { 166 curp->line = lnn; 167 curp->reparse_count = 0; 168 169 if (lnn < 3 && 170 curp->filenc & MPARSE_UTF8 && 171 curp->filenc & MPARSE_LATIN1) 172 curp->filenc = preconv_cue(&blk, i); 173 } 174 175 while (i < blk.sz && (start || blk.buf[i] != '\0')) { 176 177 /* 178 * When finding an unescaped newline character, 179 * leave the character loop to process the line. 180 * Skip a preceding carriage return, if any. 181 */ 182 183 if ('\r' == blk.buf[i] && i + 1 < blk.sz && 184 '\n' == blk.buf[i + 1]) 185 ++i; 186 if ('\n' == blk.buf[i]) { 187 ++i; 188 ++lnn; 189 break; 190 } 191 192 /* 193 * Make sure we have space for the worst 194 * case of 12 bytes: "\\[u10ffff]\n\0" 195 */ 196 197 if (pos + 12 > ln.sz) 198 resize_buf(&ln, 256); 199 200 /* 201 * Encode 8-bit input. 202 */ 203 204 c = blk.buf[i]; 205 if (c & 0x80) { 206 if ( ! (curp->filenc && preconv_encode( 207 &blk, &i, &ln, &pos, &curp->filenc))) { 208 mandoc_msg(MANDOCERR_CHAR_BAD, 209 curp->line, pos, "0x%x", c); 210 ln.buf[pos++] = '?'; 211 i++; 212 } 213 continue; 214 } 215 216 /* 217 * Exclude control characters. 218 */ 219 220 if (c == 0x7f || (c < 0x20 && c != 0x09)) { 221 mandoc_msg(c == 0x00 || c == 0x04 || 222 c > 0x0a ? MANDOCERR_CHAR_BAD : 223 MANDOCERR_CHAR_UNSUPP, 224 curp->line, pos, "0x%x", c); 225 i++; 226 if (c != '\r') 227 ln.buf[pos++] = '?'; 228 continue; 229 } 230 231 ln.buf[pos++] = blk.buf[i++]; 232 } 233 ln.buf[pos] = '\0'; 234 235 /* 236 * Maintain a lookaside buffer of all lines. 237 * parsed from this input source. 238 */ 239 240 thisln = mandoc_malloc(sizeof(*thisln)); 241 thisln->buf = mandoc_strdup(ln.buf); 242 thisln->sz = strlen(ln.buf) + 1; 243 thisln->next = NULL; 244 if (firstln == NULL) { 245 firstln = lastln = thisln; 246 if (curp->secondary == NULL) 247 curp->secondary = firstln; 248 } else { 249 lastln->next = thisln; 250 lastln = thisln; 251 } 252 253 /* XXX Ugly hack to mark the end of the input. */ 254 255 if (i == blk.sz || blk.buf[i] == '\0') { 256 if (pos + 2 > ln.sz) 257 resize_buf(&ln, 256); 258 ln.buf[pos++] = '\n'; 259 ln.buf[pos] = '\0'; 260 } 261 262 /* 263 * A significant amount of complexity is contained by 264 * the roff preprocessor. It's line-oriented but can be 265 * expressed on one line, so we need at times to 266 * readjust our starting point and re-run it. The roff 267 * preprocessor can also readjust the buffers with new 268 * data, so we pass them in wholesale. 269 */ 270 271 of = 0; 272 rerun: 273 line_result = roff_parseln(curp->roff, curp->line, &ln, &of); 274 275 /* Process options. */ 276 277 if (line_result & ROFF_APPEND) 278 assert(line_result == (ROFF_IGN | ROFF_APPEND)); 279 280 if (line_result & ROFF_USERCALL) 281 assert((line_result & ROFF_MASK) == ROFF_REPARSE); 282 283 if (line_result & ROFF_USERRET) { 284 assert(line_result == (ROFF_IGN | ROFF_USERRET)); 285 if (start == 0) { 286 /* Return from the current macro. */ 287 result = ROFF_USERRET; 288 goto out; 289 } 290 } 291 292 switch (line_result & ROFF_LOOPMASK) { 293 case ROFF_IGN: 294 break; 295 case ROFF_WHILE: 296 if (curp->loop != NULL) { 297 if (loop == curp->loop) 298 break; 299 mandoc_msg(MANDOCERR_WHILE_NEST, 300 curp->line, pos, NULL); 301 } 302 curp->loop = thisln; 303 loop = NULL; 304 inloop = 1; 305 break; 306 case ROFF_LOOPCONT: 307 case ROFF_LOOPEXIT: 308 if (curp->loop == NULL) { 309 mandoc_msg(MANDOCERR_WHILE_FAIL, 310 curp->line, pos, NULL); 311 break; 312 } 313 if (inloop == 0) { 314 mandoc_msg(MANDOCERR_WHILE_INTO, 315 curp->line, pos, NULL); 316 curp->loop = loop = NULL; 317 break; 318 } 319 if (line_result & ROFF_LOOPCONT) 320 loop = curp->loop; 321 else { 322 curp->loop = loop = NULL; 323 inloop = 0; 324 } 325 break; 326 default: 327 abort(); 328 } 329 330 /* Process the main instruction from the roff parser. */ 331 332 switch (line_result & ROFF_MASK) { 333 case ROFF_IGN: 334 break; 335 case ROFF_CONT: 336 if (curp->man->meta.macroset == MACROSET_NONE) 337 choose_parser(curp); 338 if ((curp->man->meta.macroset == MACROSET_MDOC ? 339 mdoc_parseln(curp->man, curp->line, ln.buf, of) : 340 man_parseln(curp->man, curp->line, ln.buf, of) 341 ) == 2) 342 goto out; 343 break; 344 case ROFF_RERUN: 345 goto rerun; 346 case ROFF_REPARSE: 347 if (++curp->reparse_count > REPARSE_LIMIT) { 348 /* Abort and return to the top level. */ 349 result = ROFF_IGN; 350 mandoc_msg(MANDOCERR_ROFFLOOP, 351 curp->line, pos, NULL); 352 goto out; 353 } 354 result = mparse_buf_r(curp, ln, of, 0); 355 if (line_result & ROFF_USERCALL) { 356 roff_userret(curp->roff); 357 /* Continue normally. */ 358 if (result & ROFF_USERRET) 359 result = ROFF_CONT; 360 } 361 if (start == 0 && result != ROFF_CONT) 362 goto out; 363 break; 364 case ROFF_SO: 365 if ( ! (curp->options & MPARSE_SO) && 366 (i >= blk.sz || blk.buf[i] == '\0')) { 367 curp->man->meta.sodest = 368 mandoc_strdup(ln.buf + of); 369 goto out; 370 } 371 if ((fd = mparse_open(curp, ln.buf + of)) != -1) { 372 mparse_readfd(curp, fd, ln.buf + of); 373 close(fd); 374 } else { 375 mandoc_msg(MANDOCERR_SO_FAIL, 376 curp->line, of, ".so %s: %s", 377 ln.buf + of, strerror(errno)); 378 ln.sz = mandoc_asprintf(&cp, 379 ".sp\nSee the file %s.\n.sp", 380 ln.buf + of); 381 free(ln.buf); 382 ln.buf = cp; 383 of = 0; 384 mparse_buf_r(curp, ln, of, 0); 385 } 386 break; 387 default: 388 abort(); 389 } 390 391 /* Start the next input line. */ 392 393 if (loop != NULL && 394 (line_result & ROFF_LOOPMASK) == ROFF_IGN) 395 loop = loop->next; 396 397 if (loop != NULL) { 398 if ((line_result & ROFF_APPEND) == 0) 399 *ln.buf = '\0'; 400 if (ln.sz < loop->sz) 401 resize_buf(&ln, loop->sz); 402 (void)strlcat(ln.buf, loop->buf, ln.sz); 403 of = 0; 404 goto rerun; 405 } 406 407 pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0; 408 } 409 out: 410 if (inloop) { 411 if (result != ROFF_USERRET) 412 mandoc_msg(MANDOCERR_WHILE_OUTOF, 413 curp->line, pos, NULL); 414 curp->loop = NULL; 415 } 416 free(ln.buf); 417 if (firstln != curp->secondary) 418 free_buf_list(firstln); 419 return result; 420 } 421 422 static int 423 read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap) 424 { 425 struct stat st; 426 gzFile gz; 427 size_t off; 428 ssize_t ssz; 429 int gzerrnum, retval; 430 431 if (fstat(fd, &st) == -1) { 432 mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno)); 433 return -1; 434 } 435 436 /* 437 * If we're a regular file, try just reading in the whole entry 438 * via mmap(). This is faster than reading it into blocks, and 439 * since each file is only a few bytes to begin with, I'm not 440 * concerned that this is going to tank any machines. 441 */ 442 443 if (curp->gzip == 0 && S_ISREG(st.st_mode)) { 444 if (st.st_size > 0x7fffffff) { 445 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 446 return -1; 447 } 448 *with_mmap = 1; 449 fb->sz = (size_t)st.st_size; 450 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); 451 if (fb->buf != MAP_FAILED) 452 return 0; 453 } 454 455 if (curp->gzip) { 456 /* 457 * Duplicating the file descriptor is required 458 * because we will have to call gzclose(3) 459 * to free memory used internally by zlib, 460 * but that will also close the file descriptor, 461 * which this function must not do. 462 */ 463 if ((fd = dup(fd)) == -1) { 464 mandoc_msg(MANDOCERR_DUP, 0, 0, 465 "%s", strerror(errno)); 466 return -1; 467 } 468 if ((gz = gzdopen(fd, "rb")) == NULL) { 469 mandoc_msg(MANDOCERR_GZDOPEN, 0, 0, 470 "%s", strerror(errno)); 471 close(fd); 472 return -1; 473 } 474 } else 475 gz = NULL; 476 477 /* 478 * If this isn't a regular file (like, say, stdin), then we must 479 * go the old way and just read things in bit by bit. 480 */ 481 482 *with_mmap = 0; 483 off = 0; 484 retval = -1; 485 fb->sz = 0; 486 fb->buf = NULL; 487 for (;;) { 488 if (off == fb->sz) { 489 if (fb->sz == (1U << 31)) { 490 mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL); 491 break; 492 } 493 resize_buf(fb, 65536); 494 } 495 ssz = curp->gzip ? 496 gzread(gz, fb->buf + (int)off, fb->sz - off) : 497 read(fd, fb->buf + (int)off, fb->sz - off); 498 if (ssz == 0) { 499 fb->sz = off; 500 retval = 0; 501 break; 502 } 503 if (ssz == -1) { 504 if (curp->gzip) 505 (void)gzerror(gz, &gzerrnum); 506 mandoc_msg(MANDOCERR_READ, 0, 0, "%s", 507 curp->gzip && gzerrnum != Z_ERRNO ? 508 zError(gzerrnum) : strerror(errno)); 509 break; 510 } 511 off += (size_t)ssz; 512 } 513 514 if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK) 515 mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s", 516 gzerrnum == Z_ERRNO ? strerror(errno) : 517 zError(gzerrnum)); 518 if (retval == -1) { 519 free(fb->buf); 520 fb->buf = NULL; 521 } 522 return retval; 523 } 524 525 static void 526 mparse_end(struct mparse *curp) 527 { 528 if (curp->man->meta.macroset == MACROSET_NONE) 529 curp->man->meta.macroset = MACROSET_MAN; 530 if (curp->man->meta.macroset == MACROSET_MDOC) 531 mdoc_endparse(curp->man); 532 else 533 man_endparse(curp->man); 534 roff_endparse(curp->roff); 535 } 536 537 /* 538 * Read the whole file into memory and call the parsers. 539 * Called recursively when an .so request is encountered. 540 */ 541 void 542 mparse_readfd(struct mparse *curp, int fd, const char *filename) 543 { 544 static int recursion_depth; 545 546 struct buf blk; 547 struct buf *save_primary; 548 const char *save_filename; 549 size_t offset; 550 int save_filenc, save_lineno; 551 int with_mmap; 552 553 if (recursion_depth > 64) { 554 mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL); 555 return; 556 } 557 if (read_whole_file(curp, fd, &blk, &with_mmap) == -1) 558 return; 559 560 /* 561 * Save some properties of the parent file. 562 */ 563 564 save_primary = curp->primary; 565 save_filenc = curp->filenc; 566 save_lineno = curp->line; 567 save_filename = mandoc_msg_getinfilename(); 568 569 curp->primary = &blk; 570 curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1); 571 curp->line = 1; 572 mandoc_msg_setinfilename(filename); 573 574 /* Skip an UTF-8 byte order mark. */ 575 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && 576 (unsigned char)blk.buf[0] == 0xef && 577 (unsigned char)blk.buf[1] == 0xbb && 578 (unsigned char)blk.buf[2] == 0xbf) { 579 offset = 3; 580 curp->filenc &= ~MPARSE_LATIN1; 581 } else 582 offset = 0; 583 584 recursion_depth++; 585 mparse_buf_r(curp, blk, offset, 1); 586 if (--recursion_depth == 0) 587 mparse_end(curp); 588 589 /* 590 * Clean up and restore saved parent properties. 591 */ 592 593 if (with_mmap) 594 munmap(blk.buf, blk.sz); 595 else 596 free(blk.buf); 597 598 curp->primary = save_primary; 599 curp->filenc = save_filenc; 600 curp->line = save_lineno; 601 if (save_filename != NULL) 602 mandoc_msg_setinfilename(save_filename); 603 } 604 605 int 606 mparse_open(struct mparse *curp, const char *file) 607 { 608 char *cp; 609 int fd, save_errno; 610 611 cp = strrchr(file, '.'); 612 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz")); 613 614 /* First try to use the filename as it is. */ 615 616 if ((fd = open(file, O_RDONLY)) != -1) 617 return fd; 618 619 /* 620 * If that doesn't work and the filename doesn't 621 * already end in .gz, try appending .gz. 622 */ 623 624 if ( ! curp->gzip) { 625 save_errno = errno; 626 mandoc_asprintf(&cp, "%s.gz", file); 627 fd = open(cp, O_RDONLY); 628 free(cp); 629 errno = save_errno; 630 if (fd != -1) { 631 curp->gzip = 1; 632 return fd; 633 } 634 } 635 636 /* Neither worked, give up. */ 637 638 return -1; 639 } 640 641 struct mparse * 642 mparse_alloc(int options, enum mandoc_os os_e, const char *os_s) 643 { 644 struct mparse *curp; 645 646 curp = mandoc_calloc(1, sizeof(struct mparse)); 647 648 curp->options = options; 649 curp->os_s = os_s; 650 651 curp->roff = roff_alloc(options); 652 curp->man = roff_man_alloc(curp->roff, curp->os_s, 653 curp->options & MPARSE_QUICK ? 1 : 0); 654 if (curp->options & MPARSE_MDOC) { 655 curp->man->meta.macroset = MACROSET_MDOC; 656 if (curp->man->mdocmac == NULL) 657 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX); 658 } else if (curp->options & MPARSE_MAN) { 659 curp->man->meta.macroset = MACROSET_MAN; 660 if (curp->man->manmac == NULL) 661 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX); 662 } 663 curp->man->meta.first->tok = TOKEN_NONE; 664 curp->man->meta.os_e = os_e; 665 return curp; 666 } 667 668 void 669 mparse_reset(struct mparse *curp) 670 { 671 roff_reset(curp->roff); 672 roff_man_reset(curp->man); 673 free_buf_list(curp->secondary); 674 curp->secondary = NULL; 675 curp->gzip = 0; 676 } 677 678 void 679 mparse_free(struct mparse *curp) 680 { 681 roffhash_free(curp->man->mdocmac); 682 roffhash_free(curp->man->manmac); 683 roff_man_free(curp->man); 684 roff_free(curp->roff); 685 free_buf_list(curp->secondary); 686 free(curp); 687 } 688 689 struct roff_meta * 690 mparse_result(struct mparse *curp) 691 { 692 roff_state_reset(curp->man); 693 if (curp->options & MPARSE_VALIDATE) { 694 if (curp->man->meta.macroset == MACROSET_MDOC) 695 mdoc_validate(curp->man); 696 else 697 man_validate(curp->man); 698 } 699 return &curp->man->meta; 700 } 701 702 void 703 mparse_copy(const struct mparse *p) 704 { 705 struct buf *buf; 706 707 for (buf = p->secondary; buf != NULL; buf = buf->next) 708 puts(buf->buf); 709 } 710