xref: /openbsd-src/usr.bin/mandoc/read.c (revision 07aa05098856a503e2d4cff5708794aef9318348)
/* $OpenBSD: read.c,v 1.192 2022/05/19 14:47:47 schwarze Exp $ */
/*
 * Copyright (c) 2010-2020 Ingo Schwarze <schwarze@openbsd.org>
 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * Top-level functions of the mandoc(3) parser:
 * Parser and input encoding selection, decompression,
 * handling of input bytes, characters, lines, and files,
 * handling of roff(7) loops and file inclusion,
 * and steering of the various parsers.
 */
25d395d87cSschwarze #include <sys/types.h>
26a35fc07aSschwarze #include <sys/mman.h>
27d395d87cSschwarze #include <sys/stat.h>
28a35fc07aSschwarze 
29a35fc07aSschwarze #include <assert.h>
30a35fc07aSschwarze #include <ctype.h>
3184d1f063Sschwarze #include <errno.h>
32a35fc07aSschwarze #include <fcntl.h>
33a35fc07aSschwarze #include <stdarg.h>
34a35fc07aSschwarze #include <stdio.h>
35a35fc07aSschwarze #include <stdlib.h>
36a35fc07aSschwarze #include <string.h>
37a35fc07aSschwarze #include <unistd.h>
38d74fe132Sschwarze #include <zlib.h>
39a35fc07aSschwarze 
404f4f7972Sschwarze #include "mandoc_aux.h"
41d1982c71Sschwarze #include "mandoc.h"
42d1982c71Sschwarze #include "roff.h"
43a35fc07aSschwarze #include "mdoc.h"
44a35fc07aSschwarze #include "man.h"
4599acaf1eSschwarze #include "mandoc_parse.h"
46d1982c71Sschwarze #include "libmandoc.h"
476c530f1cSschwarze #include "roff_int.h"
486e2a0df9Sschwarze #include "tag.h"
49a35fc07aSschwarze 
/*
 * Upper bound on the number of ROFF_REPARSE rounds that
 * mparse_buf_r() accepts for one top-level input line before it
 * reports MANDOCERR_ROFFLOOP and gives up on that line.
 */
#define	REPARSE_LIMIT	1000
51a35fc07aSschwarze 
/*
 * State of one top-level parser instance, shared by all functions
 * in this file.
 */
struct	mparse {
	struct roff	 *roff; /* roff parser (!NULL) */
	struct roff_man	 *man; /* man parser */
	struct buf	 *primary; /* buffer currently being parsed */
	struct buf	 *secondary; /* copy of top level input */
	struct buf	 *loop; /* open .while request line */
	const char	 *os_s; /* default operating system */
	int		  options; /* parser options */
	int		  gzip; /* current input file is gzipped */
	int		  filenc; /* encoding of the current file */
	int		  reparse_count; /* finite interp. stack */
	int		  line; /* line number in the file */
};
65a35fc07aSschwarze 
/* Forward declarations of the file-local helpers defined below. */
static	void	  choose_parser(struct mparse *);
static	void	  free_buf_list(struct buf *);
static	void	  resize_buf(struct buf *, size_t);
static	int	  mparse_buf_r(struct mparse *, struct buf, size_t, int);
static	int	  read_whole_file(struct mparse *, int, struct buf *, int *);
static	void	  mparse_end(struct mparse *);
72a35fc07aSschwarze 
7349aff9f8Sschwarze 
74a35fc07aSschwarze static void
resize_buf(struct buf * buf,size_t initial)75a35fc07aSschwarze resize_buf(struct buf *buf, size_t initial)
76a35fc07aSschwarze {
77a35fc07aSschwarze 
78a35fc07aSschwarze 	buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
79a35fc07aSschwarze 	buf->buf = mandoc_realloc(buf->buf, buf->sz);
80a35fc07aSschwarze }
81a35fc07aSschwarze 
82a35fc07aSschwarze static void
free_buf_list(struct buf * buf)8341b72316Sschwarze free_buf_list(struct buf *buf)
8441b72316Sschwarze {
8541b72316Sschwarze 	struct buf *tmp;
8641b72316Sschwarze 
8741b72316Sschwarze 	while (buf != NULL) {
8841b72316Sschwarze 		tmp = buf;
8941b72316Sschwarze 		buf = tmp->next;
9041b72316Sschwarze 		free(tmp->buf);
9141b72316Sschwarze 		free(tmp);
9241b72316Sschwarze 	}
9341b72316Sschwarze }
9441b72316Sschwarze 
9541b72316Sschwarze static void
choose_parser(struct mparse * curp)969d7b4fe8Sschwarze choose_parser(struct mparse *curp)
97a35fc07aSschwarze {
98f0d7487dSschwarze 	char		*cp, *ep;
99f0d7487dSschwarze 	int		 format;
100a35fc07aSschwarze 
101f0d7487dSschwarze 	/*
102f0d7487dSschwarze 	 * If neither command line arguments -mdoc or -man select
103f0d7487dSschwarze 	 * a parser nor the roff parser found a .Dd or .TH macro
104f0d7487dSschwarze 	 * yet, look ahead in the main input buffer.
105f0d7487dSschwarze 	 */
106f0d7487dSschwarze 
107f0d7487dSschwarze 	if ((format = roff_getformat(curp->roff)) == 0) {
108f0d7487dSschwarze 		cp = curp->primary->buf;
109f0d7487dSschwarze 		ep = cp + curp->primary->sz;
110f0d7487dSschwarze 		while (cp < ep) {
111357cc7d0Sschwarze 			if (*cp == '.' || *cp == '\'') {
112f0d7487dSschwarze 				cp++;
113f0d7487dSschwarze 				if (cp[0] == 'D' && cp[1] == 'd') {
114f0d7487dSschwarze 					format = MPARSE_MDOC;
115f0d7487dSschwarze 					break;
116f0d7487dSschwarze 				}
117f0d7487dSschwarze 				if (cp[0] == 'T' && cp[1] == 'H') {
118f0d7487dSschwarze 					format = MPARSE_MAN;
119f0d7487dSschwarze 					break;
120f0d7487dSschwarze 				}
121f0d7487dSschwarze 			}
122f0d7487dSschwarze 			cp = memchr(cp, '\n', ep - cp);
123f0d7487dSschwarze 			if (cp == NULL)
124f0d7487dSschwarze 				break;
125f0d7487dSschwarze 			cp++;
126f0d7487dSschwarze 		}
127a35fc07aSschwarze 	}
128a35fc07aSschwarze 
129405987fcSschwarze 	if (format == MPARSE_MDOC) {
1306b86842eSschwarze 		curp->man->meta.macroset = MACROSET_MDOC;
1316050a3daSschwarze 		if (curp->man->mdocmac == NULL)
1326050a3daSschwarze 			curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
133405987fcSschwarze 	} else {
1346b86842eSschwarze 		curp->man->meta.macroset = MACROSET_MAN;
1356050a3daSschwarze 		if (curp->man->manmac == NULL)
1366050a3daSschwarze 			curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
137405987fcSschwarze 	}
1386b86842eSschwarze 	curp->man->meta.first->tok = TOKEN_NONE;
139a35fc07aSschwarze }
140a35fc07aSschwarze 
141a35fc07aSschwarze /*
142cdea9283Sschwarze  * Main parse routine for a buffer.
143cdea9283Sschwarze  * It assumes encoding and line numbering are already set up.
144cdea9283Sschwarze  * It can recurse directly (for invocations of user-defined
145cdea9283Sschwarze  * macros, inline equations, and input line traps)
146cdea9283Sschwarze  * and indirectly (for .so file inclusion).
147a35fc07aSschwarze  */
148b7f92c5fSschwarze static int
mparse_buf_r(struct mparse * curp,struct buf blk,size_t i,int start)149cdea9283Sschwarze mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
150a35fc07aSschwarze {
151a35fc07aSschwarze 	struct buf	 ln;
152b7f92c5fSschwarze 	struct buf	*firstln, *lastln, *thisln, *loop;
153f7d9ce8dSschwarze 	char		*cp;
154cdea9283Sschwarze 	size_t		 pos; /* byte number in the ln buffer */
155dd9cc97dSschwarze 	size_t		 spos; /* at the start of the current line parse */
156b7f92c5fSschwarze 	int		 line_result, result;
157c55fe189Sschwarze 	int		 of;
158a35fc07aSschwarze 	int		 lnn; /* line number in the real file */
159f7d9ce8dSschwarze 	int		 fd;
160b7f92c5fSschwarze 	int		 inloop; /* Saw .while on this level. */
161a35fc07aSschwarze 	unsigned char	 c;
162a35fc07aSschwarze 
16341b72316Sschwarze 	ln.sz = 256;
16441b72316Sschwarze 	ln.buf = mandoc_malloc(ln.sz);
16541b72316Sschwarze 	ln.next = NULL;
1660bb0865dSschwarze 	firstln = lastln = loop = NULL;
167a35fc07aSschwarze 	lnn = curp->line;
168a35fc07aSschwarze 	pos = 0;
169b7f92c5fSschwarze 	inloop = 0;
17041b72316Sschwarze 	result = ROFF_CONT;
171a35fc07aSschwarze 
172b7f92c5fSschwarze 	while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) {
173a35fc07aSschwarze 		if (start) {
174a35fc07aSschwarze 			curp->line = lnn;
175a35fc07aSschwarze 			curp->reparse_count = 0;
1767232fc26Sschwarze 
1777232fc26Sschwarze 			if (lnn < 3 &&
1787232fc26Sschwarze 			    curp->filenc & MPARSE_UTF8 &&
179cdea9283Sschwarze 			    curp->filenc & MPARSE_LATIN1)
180cdea9283Sschwarze 				curp->filenc = preconv_cue(&blk, i);
181a35fc07aSschwarze 		}
182dd9cc97dSschwarze 		spos = pos;
183a35fc07aSschwarze 
184cdea9283Sschwarze 		while (i < blk.sz && (start || blk.buf[i] != '\0')) {
185a35fc07aSschwarze 
186a35fc07aSschwarze 			/*
187a35fc07aSschwarze 			 * When finding an unescaped newline character,
188a35fc07aSschwarze 			 * leave the character loop to process the line.
189a35fc07aSschwarze 			 * Skip a preceding carriage return, if any.
190a35fc07aSschwarze 			 */
191a35fc07aSschwarze 
192cdea9283Sschwarze 			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
193a35fc07aSschwarze 			    '\n' == blk.buf[i + 1])
194a35fc07aSschwarze 				++i;
195a35fc07aSschwarze 			if ('\n' == blk.buf[i]) {
196a35fc07aSschwarze 				++i;
197a35fc07aSschwarze 				++lnn;
198a35fc07aSschwarze 				break;
199a35fc07aSschwarze 			}
200a35fc07aSschwarze 
201a35fc07aSschwarze 			/*
2027232fc26Sschwarze 			 * Make sure we have space for the worst
20341b72316Sschwarze 			 * case of 12 bytes: "\\[u10ffff]\n\0"
20416f845e4Sschwarze 			 */
20516f845e4Sschwarze 
20641b72316Sschwarze 			if (pos + 12 > ln.sz)
20716f845e4Sschwarze 				resize_buf(&ln, 256);
20816f845e4Sschwarze 
20916f845e4Sschwarze 			/*
2107232fc26Sschwarze 			 * Encode 8-bit input.
211a35fc07aSschwarze 			 */
212a35fc07aSschwarze 
2137232fc26Sschwarze 			c = blk.buf[i];
2147232fc26Sschwarze 			if (c & 0x80) {
215cdea9283Sschwarze 				if ( ! (curp->filenc && preconv_encode(
216cdea9283Sschwarze 				    &blk, &i, &ln, &pos, &curp->filenc))) {
217a5a5f808Sschwarze 					mandoc_msg(MANDOCERR_CHAR_BAD,
21861ee90daSschwarze 					    curp->line, pos, "0x%x", c);
2197232fc26Sschwarze 					ln.buf[pos++] = '?';
2207232fc26Sschwarze 					i++;
2217232fc26Sschwarze 				}
2227232fc26Sschwarze 				continue;
2237232fc26Sschwarze 			}
224a35fc07aSschwarze 
2257232fc26Sschwarze 			/*
2267232fc26Sschwarze 			 * Exclude control characters.
2277232fc26Sschwarze 			 */
2287232fc26Sschwarze 
2297232fc26Sschwarze 			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
230a5a5f808Sschwarze 				mandoc_msg(c == 0x00 || c == 0x04 ||
23161ee90daSschwarze 				    c > 0x0a ? MANDOCERR_CHAR_BAD :
23261ee90daSschwarze 				    MANDOCERR_CHAR_UNSUPP,
233a5a5f808Sschwarze 				    curp->line, pos, "0x%x", c);
234a35fc07aSschwarze 				i++;
23546ec38c0Sschwarze 				if (c != '\r')
23647232c98Sschwarze 					ln.buf[pos++] = '?';
237a35fc07aSschwarze 				continue;
238a35fc07aSschwarze 			}
239a35fc07aSschwarze 
240a35fc07aSschwarze 			ln.buf[pos++] = blk.buf[i++];
241a35fc07aSschwarze 		}
24241b72316Sschwarze 		ln.buf[pos] = '\0';
243a35fc07aSschwarze 
24441b72316Sschwarze 		/*
24541b72316Sschwarze 		 * Maintain a lookaside buffer of all lines.
24641b72316Sschwarze 		 * parsed from this input source.
24741b72316Sschwarze 		 */
248a35fc07aSschwarze 
24941b72316Sschwarze 		thisln = mandoc_malloc(sizeof(*thisln));
25041b72316Sschwarze 		thisln->buf = mandoc_strdup(ln.buf);
25141b72316Sschwarze 		thisln->sz = strlen(ln.buf) + 1;
25241b72316Sschwarze 		thisln->next = NULL;
25341b72316Sschwarze 		if (firstln == NULL) {
25441b72316Sschwarze 			firstln = lastln = thisln;
25541b72316Sschwarze 			if (curp->secondary == NULL)
25641b72316Sschwarze 				curp->secondary = firstln;
25741b72316Sschwarze 		} else {
25841b72316Sschwarze 			lastln->next = thisln;
25941b72316Sschwarze 			lastln = thisln;
26041b72316Sschwarze 		}
26141b72316Sschwarze 
262*07aa0509Sschwarze 		/*
263*07aa0509Sschwarze 		 * XXX Ugly hack to mark the end of the input,
264*07aa0509Sschwarze 		 * such that the function roff_parse_comment()
265*07aa0509Sschwarze 		 * doesn't attempt to append another line if the
266*07aa0509Sschwarze 		 * last input line ends with an escape character.
267*07aa0509Sschwarze 		 */
26841b72316Sschwarze 
26941b72316Sschwarze 		if (i == blk.sz || blk.buf[i] == '\0') {
27018b62721Sschwarze 			if (pos + 2 > ln.sz)
27118b62721Sschwarze 				resize_buf(&ln, 256);
272be477484Sschwarze 			ln.buf[pos++] = '\n';
273a35fc07aSschwarze 			ln.buf[pos] = '\0';
27441b72316Sschwarze 		}
275a35fc07aSschwarze 
276a35fc07aSschwarze 		/*
277a35fc07aSschwarze 		 * A significant amount of complexity is contained by
278a35fc07aSschwarze 		 * the roff preprocessor.  It's line-oriented but can be
279a35fc07aSschwarze 		 * expressed on one line, so we need at times to
280a35fc07aSschwarze 		 * readjust our starting point and re-run it.  The roff
281a35fc07aSschwarze 		 * preprocessor can also readjust the buffers with new
282a35fc07aSschwarze 		 * data, so we pass them in wholesale.
283a35fc07aSschwarze 		 */
284a35fc07aSschwarze 
285a35fc07aSschwarze 		of = 0;
286a35fc07aSschwarze rerun:
287dd9cc97dSschwarze 		line_result = roff_parseln(curp->roff, curp->line,
288dd9cc97dSschwarze 		    &ln, &of, start && spos == 0 ? pos : 0);
289a35fc07aSschwarze 
290b7f92c5fSschwarze 		/* Process options. */
291b7f92c5fSschwarze 
292b7f92c5fSschwarze 		if (line_result & ROFF_APPEND)
293b7f92c5fSschwarze 			assert(line_result == (ROFF_IGN | ROFF_APPEND));
294b7f92c5fSschwarze 
295b7f92c5fSschwarze 		if (line_result & ROFF_USERCALL)
296b7f92c5fSschwarze 			assert((line_result & ROFF_MASK) == ROFF_REPARSE);
297b7f92c5fSschwarze 
298b7f92c5fSschwarze 		if (line_result & ROFF_USERRET) {
299b7f92c5fSschwarze 			assert(line_result == (ROFF_IGN | ROFF_USERRET));
300b7f92c5fSschwarze 			if (start == 0) {
301b7f92c5fSschwarze 				/* Return from the current macro. */
302b7f92c5fSschwarze 				result = ROFF_USERRET;
303b7f92c5fSschwarze 				goto out;
304b7f92c5fSschwarze 			}
305b7f92c5fSschwarze 		}
306b7f92c5fSschwarze 
307b7f92c5fSschwarze 		switch (line_result & ROFF_LOOPMASK) {
308b7f92c5fSschwarze 		case ROFF_IGN:
309b7f92c5fSschwarze 			break;
310b7f92c5fSschwarze 		case ROFF_WHILE:
311b7f92c5fSschwarze 			if (curp->loop != NULL) {
312b7f92c5fSschwarze 				if (loop == curp->loop)
313b7f92c5fSschwarze 					break;
314b7f92c5fSschwarze 				mandoc_msg(MANDOCERR_WHILE_NEST,
315a5a5f808Sschwarze 				    curp->line, pos, NULL);
316b7f92c5fSschwarze 			}
317b7f92c5fSschwarze 			curp->loop = thisln;
318b7f92c5fSschwarze 			loop = NULL;
319b7f92c5fSschwarze 			inloop = 1;
320b7f92c5fSschwarze 			break;
321b7f92c5fSschwarze 		case ROFF_LOOPCONT:
322b7f92c5fSschwarze 		case ROFF_LOOPEXIT:
323b7f92c5fSschwarze 			if (curp->loop == NULL) {
324b7f92c5fSschwarze 				mandoc_msg(MANDOCERR_WHILE_FAIL,
325a5a5f808Sschwarze 				    curp->line, pos, NULL);
326b7f92c5fSschwarze 				break;
327b7f92c5fSschwarze 			}
328b7f92c5fSschwarze 			if (inloop == 0) {
329b7f92c5fSschwarze 				mandoc_msg(MANDOCERR_WHILE_INTO,
330a5a5f808Sschwarze 				    curp->line, pos, NULL);
331b7f92c5fSschwarze 				curp->loop = loop = NULL;
332b7f92c5fSschwarze 				break;
333b7f92c5fSschwarze 			}
334b7f92c5fSschwarze 			if (line_result & ROFF_LOOPCONT)
335b7f92c5fSschwarze 				loop = curp->loop;
336b7f92c5fSschwarze 			else {
337b7f92c5fSschwarze 				curp->loop = loop = NULL;
338b7f92c5fSschwarze 				inloop = 0;
339b7f92c5fSschwarze 			}
340b7f92c5fSschwarze 			break;
341b7f92c5fSschwarze 		default:
342b7f92c5fSschwarze 			abort();
343b7f92c5fSschwarze 		}
344b7f92c5fSschwarze 
345b7f92c5fSschwarze 		/* Process the main instruction from the roff parser. */
346b7f92c5fSschwarze 
347b7f92c5fSschwarze 		switch (line_result & ROFF_MASK) {
348b7f92c5fSschwarze 		case ROFF_IGN:
349b7f92c5fSschwarze 			break;
350b7f92c5fSschwarze 		case ROFF_CONT:
3516b86842eSschwarze 			if (curp->man->meta.macroset == MACROSET_NONE)
352b7f92c5fSschwarze 				choose_parser(curp);
3536b86842eSschwarze 			if ((curp->man->meta.macroset == MACROSET_MDOC ?
354b7f92c5fSschwarze 			     mdoc_parseln(curp->man, curp->line, ln.buf, of) :
355b7f92c5fSschwarze 			     man_parseln(curp->man, curp->line, ln.buf, of)
356b7f92c5fSschwarze 			    ) == 2)
357b7f92c5fSschwarze 				goto out;
358b7f92c5fSschwarze 			break;
359b7f92c5fSschwarze 		case ROFF_RERUN:
360b7f92c5fSschwarze 			goto rerun;
36149aff9f8Sschwarze 		case ROFF_REPARSE:
3623dc5225dSschwarze 			if (++curp->reparse_count > REPARSE_LIMIT) {
363b7f92c5fSschwarze 				/* Abort and return to the top level. */
36441b72316Sschwarze 				result = ROFF_IGN;
365a5a5f808Sschwarze 				mandoc_msg(MANDOCERR_ROFFLOOP,
366a35fc07aSschwarze 				    curp->line, pos, NULL);
367b7f92c5fSschwarze 				goto out;
368b7f92c5fSschwarze 			}
36941b72316Sschwarze 			result = mparse_buf_r(curp, ln, of, 0);
370b7f92c5fSschwarze 			if (line_result & ROFF_USERCALL) {
3713dc5225dSschwarze 				roff_userret(curp->roff);
372b7f92c5fSschwarze 				/* Continue normally. */
373b7f92c5fSschwarze 				if (result & ROFF_USERRET)
374b7f92c5fSschwarze 					result = ROFF_CONT;
3753dc5225dSschwarze 			}
376b7f92c5fSschwarze 			if (start == 0 && result != ROFF_CONT)
37741b72316Sschwarze 				goto out;
378b7f92c5fSschwarze 			break;
37949aff9f8Sschwarze 		case ROFF_SO:
380cdea9283Sschwarze 			if ( ! (curp->options & MPARSE_SO) &&
381cdea9283Sschwarze 			    (i >= blk.sz || blk.buf[i] == '\0')) {
3826b86842eSschwarze 				curp->man->meta.sodest =
3836b86842eSschwarze 				    mandoc_strdup(ln.buf + of);
38441b72316Sschwarze 				goto out;
385310147f5Sschwarze 			}
386723ae0efSschwarze 			if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
387f7d9ce8dSschwarze 				mparse_readfd(curp, fd, ln.buf + of);
3887a6e7816Sschwarze 				close(fd);
3891be61736Sschwarze 			} else {
39018f7d3b8Sschwarze 				mandoc_msg(MANDOCERR_SO_FAIL,
39118f7d3b8Sschwarze 				    curp->line, of, ".so %s: %s",
39218f7d3b8Sschwarze 				    ln.buf + of, strerror(errno));
393f7d9ce8dSschwarze 				ln.sz = mandoc_asprintf(&cp,
394f7d9ce8dSschwarze 				    ".sp\nSee the file %s.\n.sp",
395f7d9ce8dSschwarze 				    ln.buf + of);
396f7d9ce8dSschwarze 				free(ln.buf);
397f7d9ce8dSschwarze 				ln.buf = cp;
398f7d9ce8dSschwarze 				of = 0;
399f7d9ce8dSschwarze 				mparse_buf_r(curp, ln, of, 0);
40085527626Sschwarze 			}
401b7f92c5fSschwarze 			break;
402a35fc07aSschwarze 		default:
403b7f92c5fSschwarze 			abort();
404a35fc07aSschwarze 		}
405a35fc07aSschwarze 
406a35fc07aSschwarze 		/* Start the next input line. */
407a35fc07aSschwarze 
408b7f92c5fSschwarze 		if (loop != NULL &&
409b7f92c5fSschwarze 		    (line_result & ROFF_LOOPMASK) == ROFF_IGN)
410b7f92c5fSschwarze 			loop = loop->next;
411b7f92c5fSschwarze 
412b7f92c5fSschwarze 		if (loop != NULL) {
413b7f92c5fSschwarze 			if ((line_result & ROFF_APPEND) == 0)
414b7f92c5fSschwarze 				*ln.buf = '\0';
415b7f92c5fSschwarze 			if (ln.sz < loop->sz)
416b7f92c5fSschwarze 				resize_buf(&ln, loop->sz);
417b7f92c5fSschwarze 			(void)strlcat(ln.buf, loop->buf, ln.sz);
418b7f92c5fSschwarze 			of = 0;
419b7f92c5fSschwarze 			goto rerun;
420b7f92c5fSschwarze 		}
421b7f92c5fSschwarze 
422b7f92c5fSschwarze 		pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0;
423a35fc07aSschwarze 	}
42441b72316Sschwarze out:
425b7f92c5fSschwarze 	if (inloop) {
426b7f92c5fSschwarze 		if (result != ROFF_USERRET)
427a5a5f808Sschwarze 			mandoc_msg(MANDOCERR_WHILE_OUTOF,
428b7f92c5fSschwarze 			    curp->line, pos, NULL);
429b7f92c5fSschwarze 		curp->loop = NULL;
430b7f92c5fSschwarze 	}
431a35fc07aSschwarze 	free(ln.buf);
43241b72316Sschwarze 	if (firstln != curp->secondary)
43341b72316Sschwarze 		free_buf_list(firstln);
43441b72316Sschwarze 	return result;
435a35fc07aSschwarze }
436a35fc07aSschwarze 
437a35fc07aSschwarze static int
read_whole_file(struct mparse * curp,int fd,struct buf * fb,int * with_mmap)438e501e731Sschwarze read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap)
439a35fc07aSschwarze {
440a35fc07aSschwarze 	struct stat	 st;
441d74fe132Sschwarze 	gzFile		 gz;
442a35fc07aSschwarze 	size_t		 off;
443a35fc07aSschwarze 	ssize_t		 ssz;
4441485d9f7Sschwarze 	int		 gzerrnum, retval;
445a35fc07aSschwarze 
446e8cab092Sschwarze 	if (fstat(fd, &st) == -1) {
447ecd1ed85Sschwarze 		mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno));
448ecd1ed85Sschwarze 		return -1;
449e8cab092Sschwarze 	}
450a35fc07aSschwarze 
451a35fc07aSschwarze 	/*
452a35fc07aSschwarze 	 * If we're a regular file, try just reading in the whole entry
453a35fc07aSschwarze 	 * via mmap().  This is faster than reading it into blocks, and
454a35fc07aSschwarze 	 * since each file is only a few bytes to begin with, I'm not
455a35fc07aSschwarze 	 * concerned that this is going to tank any machines.
456a35fc07aSschwarze 	 */
457a35fc07aSschwarze 
458d74fe132Sschwarze 	if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
4593fcf3a03Sschwarze 		if (st.st_size > 0x7fffffff) {
460a5a5f808Sschwarze 			mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
461ecd1ed85Sschwarze 			return -1;
462a35fc07aSschwarze 		}
463a35fc07aSschwarze 		*with_mmap = 1;
464a35fc07aSschwarze 		fb->sz = (size_t)st.st_size;
465b6ac0686Sschwarze 		fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
466a35fc07aSschwarze 		if (fb->buf != MAP_FAILED)
467ecd1ed85Sschwarze 			return 0;
468a35fc07aSschwarze 	}
469a35fc07aSschwarze 
470d74fe132Sschwarze 	if (curp->gzip) {
4711485d9f7Sschwarze 		/*
4721485d9f7Sschwarze 		 * Duplicating the file descriptor is required
4731485d9f7Sschwarze 		 * because we will have to call gzclose(3)
4741485d9f7Sschwarze 		 * to free memory used internally by zlib,
4751485d9f7Sschwarze 		 * but that will also close the file descriptor,
4761485d9f7Sschwarze 		 * which this function must not do.
4771485d9f7Sschwarze 		 */
4781485d9f7Sschwarze 		if ((fd = dup(fd)) == -1) {
479ecd1ed85Sschwarze 			mandoc_msg(MANDOCERR_DUP, 0, 0,
480ecd1ed85Sschwarze 			    "%s", strerror(errno));
481ecd1ed85Sschwarze 			return -1;
4821485d9f7Sschwarze 		}
483e8cab092Sschwarze 		if ((gz = gzdopen(fd, "rb")) == NULL) {
484ecd1ed85Sschwarze 			mandoc_msg(MANDOCERR_GZDOPEN, 0, 0,
485ecd1ed85Sschwarze 			    "%s", strerror(errno));
4861485d9f7Sschwarze 			close(fd);
487ecd1ed85Sschwarze 			return -1;
488e8cab092Sschwarze 		}
489d74fe132Sschwarze 	} else
490d74fe132Sschwarze 		gz = NULL;
491d74fe132Sschwarze 
492a35fc07aSschwarze 	/*
493a35fc07aSschwarze 	 * If this isn't a regular file (like, say, stdin), then we must
494a35fc07aSschwarze 	 * go the old way and just read things in bit by bit.
495a35fc07aSschwarze 	 */
496a35fc07aSschwarze 
497a35fc07aSschwarze 	*with_mmap = 0;
498a35fc07aSschwarze 	off = 0;
499ecd1ed85Sschwarze 	retval = -1;
500a35fc07aSschwarze 	fb->sz = 0;
501a35fc07aSschwarze 	fb->buf = NULL;
502a35fc07aSschwarze 	for (;;) {
503a35fc07aSschwarze 		if (off == fb->sz) {
504a35fc07aSschwarze 			if (fb->sz == (1U << 31)) {
505a5a5f808Sschwarze 				mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
506a35fc07aSschwarze 				break;
507a35fc07aSschwarze 			}
508a35fc07aSschwarze 			resize_buf(fb, 65536);
509a35fc07aSschwarze 		}
510d74fe132Sschwarze 		ssz = curp->gzip ?
511d74fe132Sschwarze 		    gzread(gz, fb->buf + (int)off, fb->sz - off) :
512d74fe132Sschwarze 		    read(fd, fb->buf + (int)off, fb->sz - off);
513a35fc07aSschwarze 		if (ssz == 0) {
514a35fc07aSschwarze 			fb->sz = off;
515ecd1ed85Sschwarze 			retval = 0;
5161485d9f7Sschwarze 			break;
517a35fc07aSschwarze 		}
518e8cab092Sschwarze 		if (ssz == -1) {
5191485d9f7Sschwarze 			if (curp->gzip)
5201485d9f7Sschwarze 				(void)gzerror(gz, &gzerrnum);
521ecd1ed85Sschwarze 			mandoc_msg(MANDOCERR_READ, 0, 0, "%s",
5221485d9f7Sschwarze 			    curp->gzip && gzerrnum != Z_ERRNO ?
5231485d9f7Sschwarze 			    zError(gzerrnum) : strerror(errno));
524e8cab092Sschwarze 			break;
525e8cab092Sschwarze 		}
526a35fc07aSschwarze 		off += (size_t)ssz;
527a35fc07aSschwarze 	}
528a35fc07aSschwarze 
5291485d9f7Sschwarze 	if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK)
530ecd1ed85Sschwarze 		mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s",
5311485d9f7Sschwarze 		    gzerrnum == Z_ERRNO ? strerror(errno) :
5321485d9f7Sschwarze 		    zError(gzerrnum));
533ecd1ed85Sschwarze 	if (retval == -1) {
534a35fc07aSschwarze 		free(fb->buf);
535a35fc07aSschwarze 		fb->buf = NULL;
5361485d9f7Sschwarze 	}
5371485d9f7Sschwarze 	return retval;
538a35fc07aSschwarze }
539a35fc07aSschwarze 
540a35fc07aSschwarze static void
mparse_end(struct mparse * curp)541a35fc07aSschwarze mparse_end(struct mparse *curp)
542a35fc07aSschwarze {
5436b86842eSschwarze 	if (curp->man->meta.macroset == MACROSET_NONE)
5446b86842eSschwarze 		curp->man->meta.macroset = MACROSET_MAN;
5456b86842eSschwarze 	if (curp->man->meta.macroset == MACROSET_MDOC)
546f2d5c709Sschwarze 		mdoc_endparse(curp->man);
547f2d5c709Sschwarze 	else
548df927bb6Sschwarze 		man_endparse(curp->man);
549a35fc07aSschwarze 	roff_endparse(curp->roff);
550a35fc07aSschwarze }
551a35fc07aSschwarze 
552b8bc906aSschwarze /*
553b8bc906aSschwarze  * Read the whole file into memory and call the parsers.
554b8bc906aSschwarze  * Called recursively when an .so request is encountered.
555b8bc906aSschwarze  */
556b8bc906aSschwarze void
mparse_readfd(struct mparse * curp,int fd,const char * filename)557b8bc906aSschwarze mparse_readfd(struct mparse *curp, int fd, const char *filename)
558a35fc07aSschwarze {
5592943e6e6Sschwarze 	static int	 recursion_depth;
5602943e6e6Sschwarze 
561b8bc906aSschwarze 	struct buf	 blk;
562b8bc906aSschwarze 	struct buf	*save_primary;
563ee646987Sschwarze 	const char	*save_filename, *cp;
564b8bc906aSschwarze 	size_t		 offset;
565b8bc906aSschwarze 	int		 save_filenc, save_lineno;
566b8bc906aSschwarze 	int		 with_mmap;
567b8bc906aSschwarze 
568b8bc906aSschwarze 	if (recursion_depth > 64) {
569a5a5f808Sschwarze 		mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL);
570b6ac0686Sschwarze 		return;
571ee646987Sschwarze 	} else if (recursion_depth == 0 &&
572ee646987Sschwarze 	    (cp = strrchr(filename, '.')) != NULL &&
573ee646987Sschwarze             cp[1] >= '1' && cp[1] <= '9')
574ee646987Sschwarze                 curp->man->filesec = cp[1];
575ee646987Sschwarze         else
576ee646987Sschwarze                 curp->man->filesec = '\0';
577ee646987Sschwarze 
578ecd1ed85Sschwarze 	if (read_whole_file(curp, fd, &blk, &with_mmap) == -1)
579b8bc906aSschwarze 		return;
580a35fc07aSschwarze 
581b8bc906aSschwarze 	/*
582b8bc906aSschwarze 	 * Save some properties of the parent file.
583b8bc906aSschwarze 	 */
584b8bc906aSschwarze 
585b8bc906aSschwarze 	save_primary = curp->primary;
586b8bc906aSschwarze 	save_filenc = curp->filenc;
587b8bc906aSschwarze 	save_lineno = curp->line;
588b8bc906aSschwarze 	save_filename = mandoc_msg_getinfilename();
589b8bc906aSschwarze 
590f0d7487dSschwarze 	curp->primary = &blk;
591b8bc906aSschwarze 	curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1);
592b6ac0686Sschwarze 	curp->line = 1;
593b8bc906aSschwarze 	mandoc_msg_setinfilename(filename);
594b6ac0686Sschwarze 
5957232fc26Sschwarze 	/* Skip an UTF-8 byte order mark. */
5967232fc26Sschwarze 	if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
5977232fc26Sschwarze 	    (unsigned char)blk.buf[0] == 0xef &&
5987232fc26Sschwarze 	    (unsigned char)blk.buf[1] == 0xbb &&
5997232fc26Sschwarze 	    (unsigned char)blk.buf[2] == 0xbf) {
600cdea9283Sschwarze 		offset = 3;
6017232fc26Sschwarze 		curp->filenc &= ~MPARSE_LATIN1;
602cdea9283Sschwarze 	} else
603cdea9283Sschwarze 		offset = 0;
6047232fc26Sschwarze 
605b8bc906aSschwarze 	recursion_depth++;
606cdea9283Sschwarze 	mparse_buf_r(curp, blk, offset, 1);
607df927bb6Sschwarze 	if (--recursion_depth == 0)
608b6ac0686Sschwarze 		mparse_end(curp);
609b6ac0686Sschwarze 
61065be529eSschwarze 	/*
611b8bc906aSschwarze 	 * Clean up and restore saved parent properties.
61265be529eSschwarze 	 */
613a35fc07aSschwarze 
614b6ac0686Sschwarze 	if (with_mmap)
615b6ac0686Sschwarze 		munmap(blk.buf, blk.sz);
616b6ac0686Sschwarze 	else
617b6ac0686Sschwarze 		free(blk.buf);
618b8bc906aSschwarze 
619b8bc906aSschwarze 	curp->primary = save_primary;
620b8bc906aSschwarze 	curp->filenc = save_filenc;
621b8bc906aSschwarze 	curp->line = save_lineno;
622b8bc906aSschwarze 	if (save_filename != NULL)
623b8bc906aSschwarze 		mandoc_msg_setinfilename(save_filename);
624a35fc07aSschwarze }
625a35fc07aSschwarze 
626723ae0efSschwarze int
mparse_open(struct mparse * curp,const char * file)627723ae0efSschwarze mparse_open(struct mparse *curp, const char *file)
628d395d87cSschwarze {
629d395d87cSschwarze 	char		 *cp;
6305c29163cSschwarze 	int		  fd, save_errno;
631d395d87cSschwarze 
632d74fe132Sschwarze 	cp = strrchr(file, '.');
633d74fe132Sschwarze 	curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
63465be529eSschwarze 
635d74fe132Sschwarze 	/* First try to use the filename as it is. */
63665be529eSschwarze 
637723ae0efSschwarze 	if ((fd = open(file, O_RDONLY)) != -1)
638723ae0efSschwarze 		return fd;
63965be529eSschwarze 
640d74fe132Sschwarze 	/*
641d74fe132Sschwarze 	 * If that doesn't work and the filename doesn't
642d74fe132Sschwarze 	 * already  end in .gz, try appending .gz.
643d74fe132Sschwarze 	 */
64465be529eSschwarze 
645d74fe132Sschwarze 	if ( ! curp->gzip) {
6465c29163cSschwarze 		save_errno = errno;
64765be529eSschwarze 		mandoc_asprintf(&cp, "%s.gz", file);
6484552f795Sschwarze 		fd = open(cp, O_RDONLY);
649fc1cb77bSschwarze 		free(cp);
6505c29163cSschwarze 		errno = save_errno;
651723ae0efSschwarze 		if (fd != -1) {
652d74fe132Sschwarze 			curp->gzip = 1;
653723ae0efSschwarze 			return fd;
654d74fe132Sschwarze 		}
655d74fe132Sschwarze 	}
656d74fe132Sschwarze 
657d74fe132Sschwarze 	/* Neither worked, give up. */
658d74fe132Sschwarze 
659723ae0efSschwarze 	return -1;
660d395d87cSschwarze }
66165be529eSschwarze 
662a35fc07aSschwarze struct mparse *
mparse_alloc(int options,enum mandoc_os os_e,const char * os_s)663e501e731Sschwarze mparse_alloc(int options, enum mandoc_os os_e, const char *os_s)
664a35fc07aSschwarze {
665a35fc07aSschwarze 	struct mparse	*curp;
666a35fc07aSschwarze 
667a35fc07aSschwarze 	curp = mandoc_calloc(1, sizeof(struct mparse));
668a35fc07aSschwarze 
669fee846f0Sschwarze 	curp->options = options;
670f3476b07Sschwarze 	curp->os_s = os_s;
671a35fc07aSschwarze 
67291305757Sschwarze 	curp->roff = roff_alloc(options);
67391305757Sschwarze 	curp->man = roff_man_alloc(curp->roff, curp->os_s,
674405987fcSschwarze 		curp->options & MPARSE_QUICK ? 1 : 0);
675d0370668Sschwarze 	if (curp->options & MPARSE_MDOC) {
6766b86842eSschwarze 		curp->man->meta.macroset = MACROSET_MDOC;
6776050a3daSschwarze 		if (curp->man->mdocmac == NULL)
6786050a3daSschwarze 			curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
679405987fcSschwarze 	} else if (curp->options & MPARSE_MAN) {
6806b86842eSschwarze 		curp->man->meta.macroset = MACROSET_MAN;
6816050a3daSschwarze 		if (curp->man->manmac == NULL)
6826050a3daSschwarze 			curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
683d0370668Sschwarze 	}
6846b86842eSschwarze 	curp->man->meta.first->tok = TOKEN_NONE;
685f3476b07Sschwarze 	curp->man->meta.os_e = os_e;
6860ac7e6ecSschwarze 	tag_alloc();
687526e306bSschwarze 	return curp;
688a35fc07aSschwarze }
689a35fc07aSschwarze 
690a35fc07aSschwarze void
mparse_reset(struct mparse * curp)691a35fc07aSschwarze mparse_reset(struct mparse *curp)
692a35fc07aSschwarze {
6934c8dba62Sschwarze 	tag_free();
694a35fc07aSschwarze 	roff_reset(curp->roff);
695405987fcSschwarze 	roff_man_reset(curp->man);
69641b72316Sschwarze 	free_buf_list(curp->secondary);
69741b72316Sschwarze 	curp->secondary = NULL;
698610cce7fSschwarze 	curp->gzip = 0;
6994c8dba62Sschwarze 	tag_alloc();
700a35fc07aSschwarze }
701a35fc07aSschwarze 
702a35fc07aSschwarze void
mparse_free(struct mparse * curp)703a35fc07aSschwarze mparse_free(struct mparse *curp)
704a35fc07aSschwarze {
7050ac7e6ecSschwarze 	tag_free();
7066050a3daSschwarze 	roffhash_free(curp->man->mdocmac);
7076050a3daSschwarze 	roffhash_free(curp->man->manmac);
708405987fcSschwarze 	roff_man_free(curp->man);
709a35fc07aSschwarze 	roff_free(curp->roff);
71041b72316Sschwarze 	free_buf_list(curp->secondary);
711a35fc07aSschwarze 	free(curp);
712a35fc07aSschwarze }
713a35fc07aSschwarze 
7146b86842eSschwarze struct roff_meta *
mparse_result(struct mparse * curp)7156b86842eSschwarze mparse_result(struct mparse *curp)
716a35fc07aSschwarze {
71783d65a5aSschwarze 	roff_state_reset(curp->man);
7186b86842eSschwarze 	if (curp->options & MPARSE_VALIDATE) {
7196b86842eSschwarze 		if (curp->man->meta.macroset == MACROSET_MDOC)
7206b86842eSschwarze 			mdoc_validate(curp->man);
7216b86842eSschwarze 		else
7226b86842eSschwarze 			man_validate(curp->man);
7236e2a0df9Sschwarze 		tag_postprocess(curp->man, curp->man->meta.first);
724310147f5Sschwarze 	}
7256b86842eSschwarze 	return &curp->man->meta;
726a35fc07aSschwarze }
727a35fc07aSschwarze 
728a35fc07aSschwarze void
mparse_copy(const struct mparse * p)72941b72316Sschwarze mparse_copy(const struct mparse *p)
730ca0ce676Sschwarze {
73141b72316Sschwarze 	struct buf	*buf;
732ca0ce676Sschwarze 
73341b72316Sschwarze 	for (buf = p->secondary; buf != NULL; buf = buf->next)
73441b72316Sschwarze 		puts(buf->buf);
735ca0ce676Sschwarze }
736