xref: /openbsd-src/usr.bin/mandoc/read.c (revision 6a13ef69787db04ae501a22e92fa10865b44fd7c)
1 /*	$OpenBSD: read.c,v 1.130 2017/01/09 01:36:22 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2010-2017 Ingo Schwarze <schwarze@openbsd.org>
5  * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 #include <sys/types.h>
20 #include <sys/mman.h>
21 #include <sys/stat.h>
22 
23 #include <assert.h>
24 #include <ctype.h>
25 #include <err.h>
26 #include <errno.h>
27 #include <fcntl.h>
28 #include <stdarg.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <unistd.h>
33 #include <zlib.h>
34 
35 #include "mandoc_aux.h"
36 #include "mandoc.h"
37 #include "roff.h"
38 #include "mdoc.h"
39 #include "man.h"
40 #include "libmandoc.h"
41 #include "roff_int.h"
42 
/*
 * Upper bound for mparse.reparse_count: mparse_buf_r() refuses to
 * recurse for ROFF_REPARSE once this many rounds were counted and
 * reports MANDOCERR_ROFFLOOP instead.
 */
43 #define	REPARSE_LIMIT	1000
44 
/*
 * State of one parser instance.
 * Created by mparse_alloc(), cleared for reuse by mparse_reset(),
 * and destroyed by mparse_free().
 */
45 struct	mparse {
46 	struct roff_man	 *man; /* man parser, from roff_man_alloc() */
47 	struct roff	 *roff; /* roff parser (!NULL) */
48 	char		 *sodest; /* filename pointed to by .so; freed on reset/free */
49 	const char	 *file; /* filename of current input file */
50 	struct buf	 *primary; /* buffer currently being parsed; scanned by choose_parser() */
51 	struct buf	 *secondary; /* preprocessed copy of input; allocated by mparse_keep() */
52 	const char	 *defos; /* default operating system */
53 	mandocmsg	  mmsg; /* warning/error message handler; may be NULL */
54 	enum mandoclevel  file_status; /* status of current parse: highest level reported */
55 	enum mandoclevel  wlevel; /* ignore messages below this (except MANDOCERR_FILE) */
56 	int		  options; /* parser options (MPARSE_* bits) */
57 	int		  gzip; /* current input file is gzipped; set by mparse_open() */
58 	int		  filenc; /* encoding of the current file (MPARSE_UTF8, MPARSE_LATIN1 bits) */
59 	int		  reparse_count; /* finite interp. stack; capped by REPARSE_LIMIT */
60 	int		  line; /* line number in the file */
61 };
62 
/* Prototypes of the file-local helpers defined further down. */
63 static	void	  choose_parser(struct mparse *);
64 static	void	  resize_buf(struct buf *, size_t);
65 static	void	  mparse_buf_r(struct mparse *, struct buf, size_t, int);
66 static	int	  read_whole_file(struct mparse *, const char *, int,
67 				struct buf *, int *);
68 static	void	  mparse_end(struct mparse *);
69 static	void	  mparse_parse_buffer(struct mparse *, struct buf,
70 			const char *);
71 
/*
 * Error code thresholds, indexed by enum mandoclevel
 * (same order as mandoclevels[]).
 * mandoc_msg() starts at MANDOCLEVEL_UNSUPP and walks downward
 * until the error code is no longer smaller than the entry,
 * which yields the level the message is reported at.
 */
72 static	const enum mandocerr	mandoclimits[MANDOCLEVEL_MAX] = {
73 	MANDOCERR_OK,
74 	MANDOCERR_WARNING,
75 	MANDOCERR_WARNING,
76 	MANDOCERR_ERROR,
77 	MANDOCERR_UNSUPP,
78 	MANDOCERR_MAX,
79 	MANDOCERR_MAX
80 };
81 
/*
 * Message text for each error code, indexed by enum mandocerr and
 * handed out by mparse_strerror().  The order of the entries has to
 * match the order of the enum constants.
 * One slot is NULL and has no fixed text.
 * NOTE(review): that slot presumably belongs to MANDOCERR_FILE, for
 * which mparse_open() passes strerror(3) output as the message;
 * confirm against the enum in mandoc.h.
 */
82 static	const char * const	mandocerrs[MANDOCERR_MAX] = {
83 	"ok",
84 
85 	"generic warning",
86 
87 	/* related to the prologue */
88 	"missing manual title, using UNTITLED",
89 	"missing manual title, using \"\"",
90 	"lower case character in document title",
91 	"missing manual section, using \"\"",
92 	"unknown manual section",
93 	"missing date, using today's date",
94 	"cannot parse date, using it verbatim",
95 	"missing Os macro, using \"\"",
96 	"duplicate prologue macro",
97 	"late prologue macro",
98 	"skipping late title macro",
99 	"prologue macros out of order",
100 
101 	/* related to document structure */
102 	".so is fragile, better use ln(1)",
103 	"no document body",
104 	"content before first section header",
105 	"first section is not \"NAME\"",
106 	"NAME section without Nm before Nd",
107 	"NAME section without description",
108 	"description not at the end of NAME",
109 	"bad NAME section content",
110 	"missing comma before name",
111 	"missing description line, using \"\"",
112 	"sections out of conventional order",
113 	"duplicate section title",
114 	"unexpected section",
115 	"unusual Xr order",
116 	"unusual Xr punctuation",
117 	"AUTHORS section without An macro",
118 
119 	/* related to macros and nesting */
120 	"obsolete macro",
121 	"macro neither callable nor escaped",
122 	"skipping paragraph macro",
123 	"moving paragraph macro out of list",
124 	"skipping no-space macro",
125 	"blocks badly nested",
126 	"nested displays are not portable",
127 	"moving content out of list",
128 	"fill mode already enabled, skipping",
129 	"fill mode already disabled, skipping",
130 	"line scope broken",
131 
132 	/* related to missing macro arguments */
133 	"skipping empty request",
134 	"conditional request controls empty scope",
135 	"skipping empty macro",
136 	"empty block",
137 	"empty argument, using 0n",
138 	"missing display type, using -ragged",
139 	"list type is not the first argument",
140 	"missing -width in -tag list, using 6n",
141 	"missing utility name, using \"\"",
142 	"missing function name, using \"\"",
143 	"empty head in list item",
144 	"empty list item",
145 	"missing font type, using \\fR",
146 	"unknown font type, using \\fR",
147 	"nothing follows prefix",
148 	"empty reference block",
149 	"missing section argument",
150 	"missing -std argument, adding it",
151 	"missing option string, using \"\"",
152 	"missing resource identifier, using \"\"",
153 	"missing eqn box, using \"\"",
154 
155 	/* related to bad macro arguments */
156 	"unterminated quoted argument",
157 	"duplicate argument",
158 	"skipping duplicate argument",
159 	"skipping duplicate display type",
160 	"skipping duplicate list type",
161 	"skipping -width argument",
162 	"wrong number of cells",
163 	"unknown AT&T UNIX version",
164 	"comma in function argument",
165 	"parenthesis in function name",
166 	"invalid content in Rs block",
167 	"invalid Boolean argument",
168 	"unknown font, skipping request",
169 	"odd number of characters in request",
170 
171 	/* related to plain text */
172 	"blank line in fill mode, using .sp",
173 	"tab in filled text",
174 	"whitespace at end of input line",
175 	"bad comment style",
176 	"invalid escape sequence",
177 	"undefined string, using \"\"",
178 
179 	/* related to tables */
180 	"tbl line starts with span",
181 	"tbl column starts with span",
182 	"skipping vertical bar in tbl layout",
183 
184 	"generic error",
185 
186 	/* related to tables */
187 	"non-alphabetic character in tbl options",
188 	"skipping unknown tbl option",
189 	"missing tbl option argument",
190 	"wrong tbl option argument size",
191 	"empty tbl layout",
192 	"invalid character in tbl layout",
193 	"unmatched parenthesis in tbl layout",
194 	"tbl without any data cells",
195 	"ignoring data in spanned tbl cell",
196 	"ignoring extra tbl data cells",
197 	"data block open at end of tbl",
198 
199 	/* related to document structure and macros */
200 	NULL,
201 	"input stack limit exceeded, infinite loop?",
202 	"skipping bad character",
203 	"skipping unknown macro",
204 	"skipping insecure request",
205 	"skipping item outside list",
206 	"skipping column outside column list",
207 	"skipping end of block that is not open",
208 	"fewer RS blocks open, skipping",
209 	"inserting missing end of block",
210 	"appending missing end of block",
211 
212 	/* related to request and macro arguments */
213 	"escaped character not allowed in a name",
214 	"NOT IMPLEMENTED: Bd -file",
215 	"skipping display without arguments",
216 	"missing list type, using -item",
217 	"missing manual name, using \"\"",
218 	"uname(3) system call failed, using UNKNOWN",
219 	"unknown standard specifier",
220 	"skipping request without numeric argument",
221 	"NOT IMPLEMENTED: .so with absolute path or \"..\"",
222 	".so request failed",
223 	"skipping all arguments",
224 	"skipping excess arguments",
225 	"divide by zero",
226 
227 	"unsupported feature",
228 	"input too large",
229 	"unsupported control character",
230 	"unsupported roff request",
231 	"eqn delim option in tbl",
232 	"unsupported tbl layout modifier",
233 	"ignoring macro in table",
234 };
235 
/*
 * Name of each message level, indexed by enum mandoclevel
 * and handed out by mparse_strlevel().
 */
236 static	const char * const	mandoclevels[MANDOCLEVEL_MAX] = {
237 	"SUCCESS",
238 	"RESERVED",
239 	"WARNING",
240 	"ERROR",
241 	"UNSUPP",
242 	"BADARG",
243 	"SYSERR"
244 };
245 
246 
247 static void
248 resize_buf(struct buf *buf, size_t initial)
249 {
250 
251 	buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
252 	buf->buf = mandoc_realloc(buf->buf, buf->sz);
253 }
254 
255 static void
256 choose_parser(struct mparse *curp)
257 {
258 	char		*cp, *ep;
259 	int		 format;
260 
261 	/*
262 	 * If neither command line arguments -mdoc or -man select
263 	 * a parser nor the roff parser found a .Dd or .TH macro
264 	 * yet, look ahead in the main input buffer.
265 	 */
266 
267 	if ((format = roff_getformat(curp->roff)) == 0) {
268 		cp = curp->primary->buf;
269 		ep = cp + curp->primary->sz;
270 		while (cp < ep) {
271 			if (*cp == '.' || *cp == '\'') {
272 				cp++;
273 				if (cp[0] == 'D' && cp[1] == 'd') {
274 					format = MPARSE_MDOC;
275 					break;
276 				}
277 				if (cp[0] == 'T' && cp[1] == 'H') {
278 					format = MPARSE_MAN;
279 					break;
280 				}
281 			}
282 			cp = memchr(cp, '\n', ep - cp);
283 			if (cp == NULL)
284 				break;
285 			cp++;
286 		}
287 	}
288 
289 	if (format == MPARSE_MDOC) {
290 		mdoc_hash_init();
291 		curp->man->macroset = MACROSET_MDOC;
292 		curp->man->first->tok = TOKEN_NONE;
293 	} else {
294 		man_hash_init();
295 		curp->man->macroset = MACROSET_MAN;
296 		curp->man->first->tok = TOKEN_NONE;
297 	}
298 }
299 
300 /*
301  * Main parse routine for a buffer.
302  * It assumes encoding and line numbering are already set up.
303  * It can recurse directly (for invocations of user-defined
304  * macros, inline equations, and input line traps)
305  * and indirectly (for .so file inclusion).
306  */
/*
 * Arguments:
 *   curp  - parser state; curp->line provides the initial line number.
 *   blk   - the input buffer, passed by value.
 *   i     - byte offset in blk where parsing begins.
 *   start - non-zero when called for a file buffer by
 *           mparse_parse_buffer(); zero for the recursive calls made
 *           from this function.  Only with start set are curp->line
 *           and curp->reparse_count updated per input line, and only
 *           with start clear does a NUL byte in the middle of a line
 *           end the input.
 *
 * Each pass of the outer loop assembles one logical input line in the
 * local buffer ln, hands it to roff_parseln(), and then acts on the
 * result.  ln is allocated here and freed before returning.
 */
307 static void
308 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
309 {
310 	const struct tbl_span	*span;
311 	struct buf	 ln;
312 	const char	*save_file;
313 	char		*cp;
314 	size_t		 pos; /* byte number in the ln buffer */
315 	size_t		 j;  /* auxiliary byte number in the blk buffer */
316 	enum rofferr	 rr;
317 	int		 of;
318 	int		 lnn; /* line number in the real file */
319 	int		 fd;
320 	unsigned char	 c;
321 
322 	memset(&ln, 0, sizeof(ln));
323 
324 	lnn = curp->line;
325 	pos = 0;
326 
327 	while (i < blk.sz) {
		/* A NUL byte at the beginning of a line ends the input. */
328 		if (0 == pos && '\0' == blk.buf[i])
329 			break;
330 
331 		if (start) {
332 			curp->line = lnn;
333 			curp->reparse_count = 0;
334 
			/*
			 * On the first two lines, while both encodings
			 * are still enabled, let preconv_cue() set filenc.
			 * NOTE(review): presumably it looks for a coding
			 * cue in the line; confirm in preconv.c.
			 */
335 			if (lnn < 3 &&
336 			    curp->filenc & MPARSE_UTF8 &&
337 			    curp->filenc & MPARSE_LATIN1)
338 				curp->filenc = preconv_cue(&blk, i);
339 		}
340 
		/* Copy one input line from blk into ln, byte by byte. */
341 		while (i < blk.sz && (start || blk.buf[i] != '\0')) {
342 
343 			/*
344 			 * When finding an unescaped newline character,
345 			 * leave the character loop to process the line.
346 			 * Skip a preceding carriage return, if any.
347 			 */
348 
349 			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
350 			    '\n' == blk.buf[i + 1])
351 				++i;
352 			if ('\n' == blk.buf[i]) {
353 				++i;
354 				++lnn;
355 				break;
356 			}
357 
358 			/*
359 			 * Make sure we have space for the worst
360 			 * case of 11 bytes: "\\[u10ffff]\0"
361 			 */
362 
363 			if (pos + 11 > ln.sz)
364 				resize_buf(&ln, 256);
365 
366 			/*
367 			 * Encode 8-bit input.
368 			 */
369 
370 			c = blk.buf[i];
371 			if (c & 0x80) {
				/*
				 * Without an input encoding, or when
				 * preconv_encode() fails, report the byte
				 * and substitute a question mark.
				 */
372 				if ( ! (curp->filenc && preconv_encode(
373 				    &blk, &i, &ln, &pos, &curp->filenc))) {
374 					mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
375 					    curp->line, pos, "0x%x", c);
376 					ln.buf[pos++] = '?';
377 					i++;
378 				}
379 				continue;
380 			}
381 
382 			/*
383 			 * Exclude control characters.
384 			 */
385 
			/*
			 * Tabs are allowed.  Every reported byte is replaced
			 * by a question mark, except that a carriage return
			 * is dropped from the line.
			 */
386 			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
387 				mandoc_vmsg(c == 0x00 || c == 0x04 ||
388 				    c > 0x0a ? MANDOCERR_CHAR_BAD :
389 				    MANDOCERR_CHAR_UNSUPP,
390 				    curp, curp->line, pos, "0x%x", c);
391 				i++;
392 				if (c != '\r')
393 					ln.buf[pos++] = '?';
394 				continue;
395 			}
396 
397 			/* Trailing backslash = a plain char. */
398 
399 			if (blk.buf[i] != '\\' || i + 1 == blk.sz) {
400 				ln.buf[pos++] = blk.buf[i++];
401 				continue;
402 			}
403 
404 			/*
405 			 * Found escape and at least one other character.
406 			 * When it's a newline character, skip it.
407 			 * When there is a carriage return in between,
408 			 * skip that one as well.
409 			 */
410 
411 			if ('\r' == blk.buf[i + 1] && i + 2 < blk.sz &&
412 			    '\n' == blk.buf[i + 2])
413 				++i;
414 			if ('\n' == blk.buf[i + 1]) {
415 				i += 2;
416 				++lnn;
417 				continue;
418 			}
419 
			/*
			 * The escapes \" and \# start a comment.  Remember
			 * where it began in j, which is only used to compute
			 * the column for the trailing whitespace warning.
			 */
420 			if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
421 				j = i;
422 				i += 2;
423 				/* Comment, skip to end of line */
424 				for (; i < blk.sz; ++i) {
425 					if (blk.buf[i] != '\n')
426 						continue;
427 					if (blk.buf[i - 1] == ' ' ||
428 					    blk.buf[i - 1] == '\t')
429 						mandoc_msg(
430 						    MANDOCERR_SPACE_EOL,
431 						    curp, curp->line,
432 						    pos + i-1 - j, NULL);
433 					++i;
434 					++lnn;
435 					break;
436 				}
437 
438 				/* Backout trailing whitespaces */
				/* A blank preceded by a backslash is kept. */
439 				for (; pos > 0; --pos) {
440 					if (ln.buf[pos - 1] != ' ')
441 						break;
442 					if (pos > 2 && ln.buf[pos - 2] == '\\')
443 						break;
444 				}
445 				break;
446 			}
447 
448 			/* Catch escaped bogus characters. */
449 
450 			c = (unsigned char) blk.buf[i+1];
451 
452 			if ( ! (isascii(c) &&
453 			    (isgraph(c) || isblank(c)))) {
454 				mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
455 				    curp->line, pos, "0x%x", c);
456 				i += 2;
457 				ln.buf[pos++] = '?';
458 				continue;
459 			}
460 
461 			/* Some other escape sequence, copy & cont. */
462 
463 			ln.buf[pos++] = blk.buf[i++];
464 			ln.buf[pos++] = blk.buf[i++];
465 		}
466 
		/*
		 * Guarantee room for the terminating NUL; this also
		 * allocates ln when the line loop copied nothing.
		 */
467 		if (pos >= ln.sz)
468 			resize_buf(&ln, 256);
469 
470 		ln.buf[pos] = '\0';
471 
472 		/*
473 		 * A significant amount of complexity is contained by
474 		 * the roff preprocessor.  It's line-oriented but can be
475 		 * expressed on one line, so we need at times to
476 		 * readjust our starting point and re-run it.  The roff
477 		 * preprocessor can also readjust the buffers with new
478 		 * data, so we pass them in wholesale.
479 		 */
480 
481 		of = 0;
482 
483 		/*
484 		 * Maintain a lookaside buffer of all parsed lines.  We
485 		 * only do this if mparse_keep() has been invoked (the
486 		 * buffer may be accessed with mparse_getkeep()).
487 		 */
488 
		/* The line is appended followed by a newline and a NUL. */
489 		if (curp->secondary) {
490 			curp->secondary->buf = mandoc_realloc(
491 			    curp->secondary->buf,
492 			    curp->secondary->sz + pos + 2);
493 			memcpy(curp->secondary->buf +
494 			    curp->secondary->sz,
495 			    ln.buf, pos);
496 			curp->secondary->sz += pos;
497 			curp->secondary->buf
498 				[curp->secondary->sz] = '\n';
499 			curp->secondary->sz++;
500 			curp->secondary->buf
501 				[curp->secondary->sz] = '\0';
502 		}
503 rerun:
504 		rr = roff_parseln(curp->roff, curp->line, &ln, &of);
505 
506 		switch (rr) {
507 		case ROFF_REPARSE:
			/*
			 * Parse the rewritten line as a buffer of its own,
			 * unless REPARSE_LIMIT rounds were already used.
			 */
508 			if (REPARSE_LIMIT >= ++curp->reparse_count)
509 				mparse_buf_r(curp, ln, of, 0);
510 			else
511 				mandoc_msg(MANDOCERR_ROFFLOOP, curp,
512 				    curp->line, pos, NULL);
513 			pos = 0;
514 			continue;
515 		case ROFF_APPEND:
			/* Keep ln and append the next input line to it. */
516 			pos = strlen(ln.buf);
517 			continue;
518 		case ROFF_RERUN:
519 			goto rerun;
520 		case ROFF_IGN:
521 			pos = 0;
522 			continue;
523 		case ROFF_SO:
			/*
			 * Unless MPARSE_SO is set, a .so request that is
			 * the last thing in the buffer is not followed:
			 * its target is recorded in sodest and parsing
			 * of this buffer ends.
			 */
524 			if ( ! (curp->options & MPARSE_SO) &&
525 			    (i >= blk.sz || blk.buf[i] == '\0')) {
526 				curp->sodest = mandoc_strdup(ln.buf + of);
527 				free(ln.buf);
528 				return;
529 			}
530 			/*
531 			 * We remove `so' clauses from our lookaside
532 			 * buffer because we're going to descend into
533 			 * the file recursively.
534 			 */
535 			if (curp->secondary)
536 				curp->secondary->sz -= pos + 1;
			/* mparse_open() overwrites curp->file; restore it. */
537 			save_file = curp->file;
538 			if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
539 				mparse_readfd(curp, fd, ln.buf + of);
540 				close(fd);
541 				curp->file = save_file;
542 			} else {
				/*
				 * The file cannot be opened: report it and
				 * parse a short replacement text instead.
				 */
543 				curp->file = save_file;
544 				mandoc_vmsg(MANDOCERR_SO_FAIL,
545 				    curp, curp->line, pos,
546 				    ".so %s", ln.buf + of);
547 				ln.sz = mandoc_asprintf(&cp,
548 				    ".sp\nSee the file %s.\n.sp",
549 				    ln.buf + of);
550 				free(ln.buf);
551 				ln.buf = cp;
552 				of = 0;
553 				mparse_buf_r(curp, ln, of, 0);
554 			}
555 			pos = 0;
556 			continue;
557 		default:
558 			break;
559 		}
560 
		/* The first line reaching the parsers decides the macro set. */
561 		if (curp->man->macroset == MACROSET_NONE)
562 			choose_parser(curp);
563 
564 		/*
565 		 * Lastly, push down into the parsers themselves.
566 		 * If libroff returns ROFF_TBL, then add it to the
567 		 * currently open parse.  Since we only get here if
568 		 * there does exist data (see tbl_data.c), we're
569 		 * guaranteed that something's been allocated.
570 		 * Do the same for ROFF_EQN.
571 		 */
572 
		/*
		 * A return value of 2 from the line parser ends the loop.
		 * NOTE(review): presumably this signals that the parser
		 * wants no more input; confirm in mdoc.c and man.c.
		 */
573 		if (rr == ROFF_TBL)
574 			while ((span = roff_span(curp->roff)) != NULL)
575 				roff_addtbl(curp->man, span);
576 		else if (rr == ROFF_EQN)
577 			roff_addeqn(curp->man, roff_eqn(curp->roff));
578 		else if ((curp->man->macroset == MACROSET_MDOC ?
579 		    mdoc_parseln(curp->man, curp->line, ln.buf, of) :
580 		    man_parseln(curp->man, curp->line, ln.buf, of)) == 2)
581 				break;
582 
583 		/* Temporary buffers typically are not full. */
584 
585 		if (0 == start && '\0' == blk.buf[i])
586 			break;
587 
588 		/* Start the next input line. */
589 
590 		pos = 0;
591 	}
592 
593 	free(ln.buf);
594 }
595 
596 static int
597 read_whole_file(struct mparse *curp, const char *file, int fd,
598 		struct buf *fb, int *with_mmap)
599 {
600 	struct stat	 st;
601 	gzFile		 gz;
602 	size_t		 off;
603 	ssize_t		 ssz;
604 
605 	if (fstat(fd, &st) == -1)
606 		err((int)MANDOCLEVEL_SYSERR, "%s", file);
607 
608 	/*
609 	 * If we're a regular file, try just reading in the whole entry
610 	 * via mmap().  This is faster than reading it into blocks, and
611 	 * since each file is only a few bytes to begin with, I'm not
612 	 * concerned that this is going to tank any machines.
613 	 */
614 
615 	if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
616 		if (st.st_size > 0x7fffffff) {
617 			mandoc_msg(MANDOCERR_TOOLARGE, curp, 0, 0, NULL);
618 			return 0;
619 		}
620 		*with_mmap = 1;
621 		fb->sz = (size_t)st.st_size;
622 		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
623 		if (fb->buf != MAP_FAILED)
624 			return 1;
625 	}
626 
627 	if (curp->gzip) {
628 		if ((gz = gzdopen(fd, "rb")) == NULL)
629 			err((int)MANDOCLEVEL_SYSERR, "%s", file);
630 	} else
631 		gz = NULL;
632 
633 	/*
634 	 * If this isn't a regular file (like, say, stdin), then we must
635 	 * go the old way and just read things in bit by bit.
636 	 */
637 
638 	*with_mmap = 0;
639 	off = 0;
640 	fb->sz = 0;
641 	fb->buf = NULL;
642 	for (;;) {
643 		if (off == fb->sz) {
644 			if (fb->sz == (1U << 31)) {
645 				mandoc_msg(MANDOCERR_TOOLARGE, curp,
646 				    0, 0, NULL);
647 				break;
648 			}
649 			resize_buf(fb, 65536);
650 		}
651 		ssz = curp->gzip ?
652 		    gzread(gz, fb->buf + (int)off, fb->sz - off) :
653 		    read(fd, fb->buf + (int)off, fb->sz - off);
654 		if (ssz == 0) {
655 			fb->sz = off;
656 			return 1;
657 		}
658 		if (ssz == -1)
659 			err((int)MANDOCLEVEL_SYSERR, "%s", file);
660 		off += (size_t)ssz;
661 	}
662 
663 	free(fb->buf);
664 	fb->buf = NULL;
665 	return 0;
666 }
667 
668 static void
669 mparse_end(struct mparse *curp)
670 {
671 	if (curp->man->macroset == MACROSET_NONE)
672 		curp->man->macroset = MACROSET_MAN;
673 	if (curp->man->macroset == MACROSET_MDOC)
674 		mdoc_endparse(curp->man);
675 	else
676 		man_endparse(curp->man);
677 	roff_endparse(curp->roff);
678 }
679 
680 static void
681 mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
682 {
683 	struct buf	*svprimary;
684 	const char	*svfile;
685 	size_t		 offset;
686 	static int	 recursion_depth;
687 
688 	if (64 < recursion_depth) {
689 		mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, 0, NULL);
690 		return;
691 	}
692 
693 	/* Line number is per-file. */
694 	svfile = curp->file;
695 	curp->file = file;
696 	svprimary = curp->primary;
697 	curp->primary = &blk;
698 	curp->line = 1;
699 	recursion_depth++;
700 
701 	/* Skip an UTF-8 byte order mark. */
702 	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
703 	    (unsigned char)blk.buf[0] == 0xef &&
704 	    (unsigned char)blk.buf[1] == 0xbb &&
705 	    (unsigned char)blk.buf[2] == 0xbf) {
706 		offset = 3;
707 		curp->filenc &= ~MPARSE_LATIN1;
708 	} else
709 		offset = 0;
710 
711 	mparse_buf_r(curp, blk, offset, 1);
712 
713 	if (--recursion_depth == 0)
714 		mparse_end(curp);
715 
716 	curp->primary = svprimary;
717 	curp->file = svfile;
718 }
719 
720 /*
721  * Read the whole file into memory and call the parsers.
722  * Called recursively when an .so request is encountered.
723  */
724 enum mandoclevel
725 mparse_readfd(struct mparse *curp, int fd, const char *file)
726 {
727 	struct buf	 blk;
728 	int		 with_mmap;
729 	int		 save_filenc;
730 
731 	if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
732 		save_filenc = curp->filenc;
733 		curp->filenc = curp->options &
734 		    (MPARSE_UTF8 | MPARSE_LATIN1);
735 		mparse_parse_buffer(curp, blk, file);
736 		curp->filenc = save_filenc;
737 		if (with_mmap)
738 			munmap(blk.buf, blk.sz);
739 		else
740 			free(blk.buf);
741 	}
742 	return curp->file_status;
743 }
744 
745 int
746 mparse_open(struct mparse *curp, const char *file)
747 {
748 	char		 *cp;
749 	int		  fd;
750 
751 	curp->file = file;
752 	cp = strrchr(file, '.');
753 	curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
754 
755 	/* First try to use the filename as it is. */
756 
757 	if ((fd = open(file, O_RDONLY)) != -1)
758 		return fd;
759 
760 	/*
761 	 * If that doesn't work and the filename doesn't
762 	 * already  end in .gz, try appending .gz.
763 	 */
764 
765 	if ( ! curp->gzip) {
766 		mandoc_asprintf(&cp, "%s.gz", file);
767 		fd = open(cp, O_RDONLY);
768 		free(cp);
769 		if (fd != -1) {
770 			curp->gzip = 1;
771 			return fd;
772 		}
773 	}
774 
775 	/* Neither worked, give up. */
776 
777 	mandoc_msg(MANDOCERR_FILE, curp, 0, 0, strerror(errno));
778 	return -1;
779 }
780 
781 struct mparse *
782 mparse_alloc(int options, enum mandoclevel wlevel, mandocmsg mmsg,
783     const char *defos)
784 {
785 	struct mparse	*curp;
786 
787 	curp = mandoc_calloc(1, sizeof(struct mparse));
788 
789 	curp->options = options;
790 	curp->wlevel = wlevel;
791 	curp->mmsg = mmsg;
792 	curp->defos = defos;
793 
794 	curp->roff = roff_alloc(curp, options);
795 	curp->man = roff_man_alloc( curp->roff, curp, curp->defos,
796 		curp->options & MPARSE_QUICK ? 1 : 0);
797 	if (curp->options & MPARSE_MDOC) {
798 		mdoc_hash_init();
799 		curp->man->macroset = MACROSET_MDOC;
800 	} else if (curp->options & MPARSE_MAN) {
801 		man_hash_init();
802 		curp->man->macroset = MACROSET_MAN;
803 	}
804 	curp->man->first->tok = TOKEN_NONE;
805 	return curp;
806 }
807 
808 void
809 mparse_reset(struct mparse *curp)
810 {
811 	roff_reset(curp->roff);
812 	roff_man_reset(curp->man);
813 	if (curp->secondary)
814 		curp->secondary->sz = 0;
815 
816 	curp->file_status = MANDOCLEVEL_OK;
817 
818 	free(curp->sodest);
819 	curp->sodest = NULL;
820 }
821 
822 void
823 mparse_free(struct mparse *curp)
824 {
825 
826 	roff_man_free(curp->man);
827 	if (curp->roff)
828 		roff_free(curp->roff);
829 	if (curp->secondary)
830 		free(curp->secondary->buf);
831 
832 	free(curp->secondary);
833 	free(curp->sodest);
834 	free(curp);
835 }
836 
837 void
838 mparse_result(struct mparse *curp, struct roff_man **man,
839 	char **sodest)
840 {
841 
842 	if (sodest && NULL != (*sodest = curp->sodest)) {
843 		*man = NULL;
844 		return;
845 	}
846 	if (man)
847 		*man = curp->man;
848 }
849 
850 void
851 mparse_updaterc(struct mparse *curp, enum mandoclevel *rc)
852 {
853 	if (curp->file_status > *rc)
854 		*rc = curp->file_status;
855 }
856 
857 void
858 mandoc_vmsg(enum mandocerr t, struct mparse *m,
859 		int ln, int pos, const char *fmt, ...)
860 {
861 	char		 buf[256];
862 	va_list		 ap;
863 
864 	va_start(ap, fmt);
865 	(void)vsnprintf(buf, sizeof(buf), fmt, ap);
866 	va_end(ap);
867 
868 	mandoc_msg(t, m, ln, pos, buf);
869 }
870 
871 void
872 mandoc_msg(enum mandocerr er, struct mparse *m,
873 		int ln, int col, const char *msg)
874 {
875 	enum mandoclevel level;
876 
877 	level = MANDOCLEVEL_UNSUPP;
878 	while (er < mandoclimits[level])
879 		level--;
880 
881 	if (level < m->wlevel && er != MANDOCERR_FILE)
882 		return;
883 
884 	if (m->mmsg)
885 		(*m->mmsg)(er, level, m->file, ln, col, msg);
886 
887 	if (m->file_status < level)
888 		m->file_status = level;
889 }
890 
891 const char *
892 mparse_strerror(enum mandocerr er)
893 {
894 
895 	return mandocerrs[er];
896 }
897 
898 const char *
899 mparse_strlevel(enum mandoclevel lvl)
900 {
901 	return mandoclevels[lvl];
902 }
903 
904 void
905 mparse_keep(struct mparse *p)
906 {
907 
908 	assert(NULL == p->secondary);
909 	p->secondary = mandoc_calloc(1, sizeof(struct buf));
910 }
911 
912 const char *
913 mparse_getkeep(const struct mparse *p)
914 {
915 
916 	assert(p->secondary);
917 	return p->secondary->sz ? p->secondary->buf : NULL;
918 }
919