/* $OpenBSD: read.c,v 1.192 2022/05/19 14:47:47 schwarze Exp $ */
/*
 * Copyright (c) 2010-2020 Ingo Schwarze <schwarze@openbsd.org>
 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * Top-level functions of the mandoc(3) parser:
 * Parser and input encoding selection, decompression,
 * handling of input bytes, characters, lines, and files,
 * handling of roff(7) loops and file inclusion,
 * and steering of the various parsers.
 */
25d395d87cSschwarze #include <sys/types.h>
26a35fc07aSschwarze #include <sys/mman.h>
27d395d87cSschwarze #include <sys/stat.h>
28a35fc07aSschwarze
29a35fc07aSschwarze #include <assert.h>
30a35fc07aSschwarze #include <ctype.h>
3184d1f063Sschwarze #include <errno.h>
32a35fc07aSschwarze #include <fcntl.h>
33a35fc07aSschwarze #include <stdarg.h>
34a35fc07aSschwarze #include <stdio.h>
35a35fc07aSschwarze #include <stdlib.h>
36a35fc07aSschwarze #include <string.h>
37a35fc07aSschwarze #include <unistd.h>
38d74fe132Sschwarze #include <zlib.h>
39a35fc07aSschwarze
404f4f7972Sschwarze #include "mandoc_aux.h"
41d1982c71Sschwarze #include "mandoc.h"
42d1982c71Sschwarze #include "roff.h"
43a35fc07aSschwarze #include "mdoc.h"
44a35fc07aSschwarze #include "man.h"
4599acaf1eSschwarze #include "mandoc_parse.h"
46d1982c71Sschwarze #include "libmandoc.h"
476c530f1cSschwarze #include "roff_int.h"
486e2a0df9Sschwarze #include "tag.h"
49a35fc07aSschwarze
/*
 * Upper bound for the per-line reparse counter kept in struct mparse;
 * exceeding it makes mparse_buf_r() report MANDOCERR_ROFFLOOP and stop.
 */
#define	REPARSE_LIMIT	1000

/*
 * State of one mandoc(3) parser instance as used by this file.
 */
struct mparse {
	struct roff	 *roff;		/* roff parser (never NULL) */
	struct roff_man	 *man;		/* man parser */
	struct buf	 *primary;	/* buffer currently being parsed */
	struct buf	 *secondary;	/* copy of top level input */
	struct buf	 *loop;		/* open .while request line */
	const char	 *os_s;		/* default operating system */
	int		  options;	/* parser options */
	int		  gzip;		/* current input file is gzipped */
	int		  filenc;	/* encoding of the current file */
	int		  reparse_count; /* finite interp. stack */
	int		  line;		/* line number in the file */
};
65a35fc07aSschwarze
/* Forward declarations of the file-local helpers defined below. */
static	void	  choose_parser(struct mparse *curp);
static	void	  free_buf_list(struct buf *buf);
static	void	  resize_buf(struct buf *buf, size_t initial);
static	int	  mparse_buf_r(struct mparse *curp, struct buf blk,
			size_t i, int start);
static	int	  read_whole_file(struct mparse *curp, int fd,
			struct buf *fb, int *with_mmap);
static	void	  mparse_end(struct mparse *curp);
72a35fc07aSschwarze
7349aff9f8Sschwarze
74a35fc07aSschwarze static void
resize_buf(struct buf * buf,size_t initial)75a35fc07aSschwarze resize_buf(struct buf *buf, size_t initial)
76a35fc07aSschwarze {
77a35fc07aSschwarze
78a35fc07aSschwarze buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
79a35fc07aSschwarze buf->buf = mandoc_realloc(buf->buf, buf->sz);
80a35fc07aSschwarze }
81a35fc07aSschwarze
82a35fc07aSschwarze static void
free_buf_list(struct buf * buf)8341b72316Sschwarze free_buf_list(struct buf *buf)
8441b72316Sschwarze {
8541b72316Sschwarze struct buf *tmp;
8641b72316Sschwarze
8741b72316Sschwarze while (buf != NULL) {
8841b72316Sschwarze tmp = buf;
8941b72316Sschwarze buf = tmp->next;
9041b72316Sschwarze free(tmp->buf);
9141b72316Sschwarze free(tmp);
9241b72316Sschwarze }
9341b72316Sschwarze }
9441b72316Sschwarze
9541b72316Sschwarze static void
choose_parser(struct mparse * curp)969d7b4fe8Sschwarze choose_parser(struct mparse *curp)
97a35fc07aSschwarze {
98f0d7487dSschwarze char *cp, *ep;
99f0d7487dSschwarze int format;
100a35fc07aSschwarze
101f0d7487dSschwarze /*
102f0d7487dSschwarze * If neither command line arguments -mdoc or -man select
103f0d7487dSschwarze * a parser nor the roff parser found a .Dd or .TH macro
104f0d7487dSschwarze * yet, look ahead in the main input buffer.
105f0d7487dSschwarze */
106f0d7487dSschwarze
107f0d7487dSschwarze if ((format = roff_getformat(curp->roff)) == 0) {
108f0d7487dSschwarze cp = curp->primary->buf;
109f0d7487dSschwarze ep = cp + curp->primary->sz;
110f0d7487dSschwarze while (cp < ep) {
111357cc7d0Sschwarze if (*cp == '.' || *cp == '\'') {
112f0d7487dSschwarze cp++;
113f0d7487dSschwarze if (cp[0] == 'D' && cp[1] == 'd') {
114f0d7487dSschwarze format = MPARSE_MDOC;
115f0d7487dSschwarze break;
116f0d7487dSschwarze }
117f0d7487dSschwarze if (cp[0] == 'T' && cp[1] == 'H') {
118f0d7487dSschwarze format = MPARSE_MAN;
119f0d7487dSschwarze break;
120f0d7487dSschwarze }
121f0d7487dSschwarze }
122f0d7487dSschwarze cp = memchr(cp, '\n', ep - cp);
123f0d7487dSschwarze if (cp == NULL)
124f0d7487dSschwarze break;
125f0d7487dSschwarze cp++;
126f0d7487dSschwarze }
127a35fc07aSschwarze }
128a35fc07aSschwarze
129405987fcSschwarze if (format == MPARSE_MDOC) {
1306b86842eSschwarze curp->man->meta.macroset = MACROSET_MDOC;
1316050a3daSschwarze if (curp->man->mdocmac == NULL)
1326050a3daSschwarze curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
133405987fcSschwarze } else {
1346b86842eSschwarze curp->man->meta.macroset = MACROSET_MAN;
1356050a3daSschwarze if (curp->man->manmac == NULL)
1366050a3daSschwarze curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
137405987fcSschwarze }
1386b86842eSschwarze curp->man->meta.first->tok = TOKEN_NONE;
139a35fc07aSschwarze }
140a35fc07aSschwarze
141a35fc07aSschwarze /*
142cdea9283Sschwarze * Main parse routine for a buffer.
143cdea9283Sschwarze * It assumes encoding and line numbering are already set up.
144cdea9283Sschwarze * It can recurse directly (for invocations of user-defined
145cdea9283Sschwarze * macros, inline equations, and input line traps)
146cdea9283Sschwarze * and indirectly (for .so file inclusion).
147a35fc07aSschwarze */
148b7f92c5fSschwarze static int
mparse_buf_r(struct mparse * curp,struct buf blk,size_t i,int start)149cdea9283Sschwarze mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
150a35fc07aSschwarze {
151a35fc07aSschwarze struct buf ln;
152b7f92c5fSschwarze struct buf *firstln, *lastln, *thisln, *loop;
153f7d9ce8dSschwarze char *cp;
154cdea9283Sschwarze size_t pos; /* byte number in the ln buffer */
155dd9cc97dSschwarze size_t spos; /* at the start of the current line parse */
156b7f92c5fSschwarze int line_result, result;
157c55fe189Sschwarze int of;
158a35fc07aSschwarze int lnn; /* line number in the real file */
159f7d9ce8dSschwarze int fd;
160b7f92c5fSschwarze int inloop; /* Saw .while on this level. */
161a35fc07aSschwarze unsigned char c;
162a35fc07aSschwarze
16341b72316Sschwarze ln.sz = 256;
16441b72316Sschwarze ln.buf = mandoc_malloc(ln.sz);
16541b72316Sschwarze ln.next = NULL;
1660bb0865dSschwarze firstln = lastln = loop = NULL;
167a35fc07aSschwarze lnn = curp->line;
168a35fc07aSschwarze pos = 0;
169b7f92c5fSschwarze inloop = 0;
17041b72316Sschwarze result = ROFF_CONT;
171a35fc07aSschwarze
172b7f92c5fSschwarze while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) {
173a35fc07aSschwarze if (start) {
174a35fc07aSschwarze curp->line = lnn;
175a35fc07aSschwarze curp->reparse_count = 0;
1767232fc26Sschwarze
1777232fc26Sschwarze if (lnn < 3 &&
1787232fc26Sschwarze curp->filenc & MPARSE_UTF8 &&
179cdea9283Sschwarze curp->filenc & MPARSE_LATIN1)
180cdea9283Sschwarze curp->filenc = preconv_cue(&blk, i);
181a35fc07aSschwarze }
182dd9cc97dSschwarze spos = pos;
183a35fc07aSschwarze
184cdea9283Sschwarze while (i < blk.sz && (start || blk.buf[i] != '\0')) {
185a35fc07aSschwarze
186a35fc07aSschwarze /*
187a35fc07aSschwarze * When finding an unescaped newline character,
188a35fc07aSschwarze * leave the character loop to process the line.
189a35fc07aSschwarze * Skip a preceding carriage return, if any.
190a35fc07aSschwarze */
191a35fc07aSschwarze
192cdea9283Sschwarze if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
193a35fc07aSschwarze '\n' == blk.buf[i + 1])
194a35fc07aSschwarze ++i;
195a35fc07aSschwarze if ('\n' == blk.buf[i]) {
196a35fc07aSschwarze ++i;
197a35fc07aSschwarze ++lnn;
198a35fc07aSschwarze break;
199a35fc07aSschwarze }
200a35fc07aSschwarze
201a35fc07aSschwarze /*
2027232fc26Sschwarze * Make sure we have space for the worst
20341b72316Sschwarze * case of 12 bytes: "\\[u10ffff]\n\0"
20416f845e4Sschwarze */
20516f845e4Sschwarze
20641b72316Sschwarze if (pos + 12 > ln.sz)
20716f845e4Sschwarze resize_buf(&ln, 256);
20816f845e4Sschwarze
20916f845e4Sschwarze /*
2107232fc26Sschwarze * Encode 8-bit input.
211a35fc07aSschwarze */
212a35fc07aSschwarze
2137232fc26Sschwarze c = blk.buf[i];
2147232fc26Sschwarze if (c & 0x80) {
215cdea9283Sschwarze if ( ! (curp->filenc && preconv_encode(
216cdea9283Sschwarze &blk, &i, &ln, &pos, &curp->filenc))) {
217a5a5f808Sschwarze mandoc_msg(MANDOCERR_CHAR_BAD,
21861ee90daSschwarze curp->line, pos, "0x%x", c);
2197232fc26Sschwarze ln.buf[pos++] = '?';
2207232fc26Sschwarze i++;
2217232fc26Sschwarze }
2227232fc26Sschwarze continue;
2237232fc26Sschwarze }
224a35fc07aSschwarze
2257232fc26Sschwarze /*
2267232fc26Sschwarze * Exclude control characters.
2277232fc26Sschwarze */
2287232fc26Sschwarze
2297232fc26Sschwarze if (c == 0x7f || (c < 0x20 && c != 0x09)) {
230a5a5f808Sschwarze mandoc_msg(c == 0x00 || c == 0x04 ||
23161ee90daSschwarze c > 0x0a ? MANDOCERR_CHAR_BAD :
23261ee90daSschwarze MANDOCERR_CHAR_UNSUPP,
233a5a5f808Sschwarze curp->line, pos, "0x%x", c);
234a35fc07aSschwarze i++;
23546ec38c0Sschwarze if (c != '\r')
23647232c98Sschwarze ln.buf[pos++] = '?';
237a35fc07aSschwarze continue;
238a35fc07aSschwarze }
239a35fc07aSschwarze
240a35fc07aSschwarze ln.buf[pos++] = blk.buf[i++];
241a35fc07aSschwarze }
24241b72316Sschwarze ln.buf[pos] = '\0';
243a35fc07aSschwarze
24441b72316Sschwarze /*
24541b72316Sschwarze * Maintain a lookaside buffer of all lines.
24641b72316Sschwarze * parsed from this input source.
24741b72316Sschwarze */
248a35fc07aSschwarze
24941b72316Sschwarze thisln = mandoc_malloc(sizeof(*thisln));
25041b72316Sschwarze thisln->buf = mandoc_strdup(ln.buf);
25141b72316Sschwarze thisln->sz = strlen(ln.buf) + 1;
25241b72316Sschwarze thisln->next = NULL;
25341b72316Sschwarze if (firstln == NULL) {
25441b72316Sschwarze firstln = lastln = thisln;
25541b72316Sschwarze if (curp->secondary == NULL)
25641b72316Sschwarze curp->secondary = firstln;
25741b72316Sschwarze } else {
25841b72316Sschwarze lastln->next = thisln;
25941b72316Sschwarze lastln = thisln;
26041b72316Sschwarze }
26141b72316Sschwarze
262*07aa0509Sschwarze /*
263*07aa0509Sschwarze * XXX Ugly hack to mark the end of the input,
264*07aa0509Sschwarze * such that the function roff_parse_comment()
265*07aa0509Sschwarze * doesn't attempt to append another line if the
266*07aa0509Sschwarze * last input line ends with an escape character.
267*07aa0509Sschwarze */
26841b72316Sschwarze
26941b72316Sschwarze if (i == blk.sz || blk.buf[i] == '\0') {
27018b62721Sschwarze if (pos + 2 > ln.sz)
27118b62721Sschwarze resize_buf(&ln, 256);
272be477484Sschwarze ln.buf[pos++] = '\n';
273a35fc07aSschwarze ln.buf[pos] = '\0';
27441b72316Sschwarze }
275a35fc07aSschwarze
276a35fc07aSschwarze /*
277a35fc07aSschwarze * A significant amount of complexity is contained by
278a35fc07aSschwarze * the roff preprocessor. It's line-oriented but can be
279a35fc07aSschwarze * expressed on one line, so we need at times to
280a35fc07aSschwarze * readjust our starting point and re-run it. The roff
281a35fc07aSschwarze * preprocessor can also readjust the buffers with new
282a35fc07aSschwarze * data, so we pass them in wholesale.
283a35fc07aSschwarze */
284a35fc07aSschwarze
285a35fc07aSschwarze of = 0;
286a35fc07aSschwarze rerun:
287dd9cc97dSschwarze line_result = roff_parseln(curp->roff, curp->line,
288dd9cc97dSschwarze &ln, &of, start && spos == 0 ? pos : 0);
289a35fc07aSschwarze
290b7f92c5fSschwarze /* Process options. */
291b7f92c5fSschwarze
292b7f92c5fSschwarze if (line_result & ROFF_APPEND)
293b7f92c5fSschwarze assert(line_result == (ROFF_IGN | ROFF_APPEND));
294b7f92c5fSschwarze
295b7f92c5fSschwarze if (line_result & ROFF_USERCALL)
296b7f92c5fSschwarze assert((line_result & ROFF_MASK) == ROFF_REPARSE);
297b7f92c5fSschwarze
298b7f92c5fSschwarze if (line_result & ROFF_USERRET) {
299b7f92c5fSschwarze assert(line_result == (ROFF_IGN | ROFF_USERRET));
300b7f92c5fSschwarze if (start == 0) {
301b7f92c5fSschwarze /* Return from the current macro. */
302b7f92c5fSschwarze result = ROFF_USERRET;
303b7f92c5fSschwarze goto out;
304b7f92c5fSschwarze }
305b7f92c5fSschwarze }
306b7f92c5fSschwarze
307b7f92c5fSschwarze switch (line_result & ROFF_LOOPMASK) {
308b7f92c5fSschwarze case ROFF_IGN:
309b7f92c5fSschwarze break;
310b7f92c5fSschwarze case ROFF_WHILE:
311b7f92c5fSschwarze if (curp->loop != NULL) {
312b7f92c5fSschwarze if (loop == curp->loop)
313b7f92c5fSschwarze break;
314b7f92c5fSschwarze mandoc_msg(MANDOCERR_WHILE_NEST,
315a5a5f808Sschwarze curp->line, pos, NULL);
316b7f92c5fSschwarze }
317b7f92c5fSschwarze curp->loop = thisln;
318b7f92c5fSschwarze loop = NULL;
319b7f92c5fSschwarze inloop = 1;
320b7f92c5fSschwarze break;
321b7f92c5fSschwarze case ROFF_LOOPCONT:
322b7f92c5fSschwarze case ROFF_LOOPEXIT:
323b7f92c5fSschwarze if (curp->loop == NULL) {
324b7f92c5fSschwarze mandoc_msg(MANDOCERR_WHILE_FAIL,
325a5a5f808Sschwarze curp->line, pos, NULL);
326b7f92c5fSschwarze break;
327b7f92c5fSschwarze }
328b7f92c5fSschwarze if (inloop == 0) {
329b7f92c5fSschwarze mandoc_msg(MANDOCERR_WHILE_INTO,
330a5a5f808Sschwarze curp->line, pos, NULL);
331b7f92c5fSschwarze curp->loop = loop = NULL;
332b7f92c5fSschwarze break;
333b7f92c5fSschwarze }
334b7f92c5fSschwarze if (line_result & ROFF_LOOPCONT)
335b7f92c5fSschwarze loop = curp->loop;
336b7f92c5fSschwarze else {
337b7f92c5fSschwarze curp->loop = loop = NULL;
338b7f92c5fSschwarze inloop = 0;
339b7f92c5fSschwarze }
340b7f92c5fSschwarze break;
341b7f92c5fSschwarze default:
342b7f92c5fSschwarze abort();
343b7f92c5fSschwarze }
344b7f92c5fSschwarze
345b7f92c5fSschwarze /* Process the main instruction from the roff parser. */
346b7f92c5fSschwarze
347b7f92c5fSschwarze switch (line_result & ROFF_MASK) {
348b7f92c5fSschwarze case ROFF_IGN:
349b7f92c5fSschwarze break;
350b7f92c5fSschwarze case ROFF_CONT:
3516b86842eSschwarze if (curp->man->meta.macroset == MACROSET_NONE)
352b7f92c5fSschwarze choose_parser(curp);
3536b86842eSschwarze if ((curp->man->meta.macroset == MACROSET_MDOC ?
354b7f92c5fSschwarze mdoc_parseln(curp->man, curp->line, ln.buf, of) :
355b7f92c5fSschwarze man_parseln(curp->man, curp->line, ln.buf, of)
356b7f92c5fSschwarze ) == 2)
357b7f92c5fSschwarze goto out;
358b7f92c5fSschwarze break;
359b7f92c5fSschwarze case ROFF_RERUN:
360b7f92c5fSschwarze goto rerun;
36149aff9f8Sschwarze case ROFF_REPARSE:
3623dc5225dSschwarze if (++curp->reparse_count > REPARSE_LIMIT) {
363b7f92c5fSschwarze /* Abort and return to the top level. */
36441b72316Sschwarze result = ROFF_IGN;
365a5a5f808Sschwarze mandoc_msg(MANDOCERR_ROFFLOOP,
366a35fc07aSschwarze curp->line, pos, NULL);
367b7f92c5fSschwarze goto out;
368b7f92c5fSschwarze }
36941b72316Sschwarze result = mparse_buf_r(curp, ln, of, 0);
370b7f92c5fSschwarze if (line_result & ROFF_USERCALL) {
3713dc5225dSschwarze roff_userret(curp->roff);
372b7f92c5fSschwarze /* Continue normally. */
373b7f92c5fSschwarze if (result & ROFF_USERRET)
374b7f92c5fSschwarze result = ROFF_CONT;
3753dc5225dSschwarze }
376b7f92c5fSschwarze if (start == 0 && result != ROFF_CONT)
37741b72316Sschwarze goto out;
378b7f92c5fSschwarze break;
37949aff9f8Sschwarze case ROFF_SO:
380cdea9283Sschwarze if ( ! (curp->options & MPARSE_SO) &&
381cdea9283Sschwarze (i >= blk.sz || blk.buf[i] == '\0')) {
3826b86842eSschwarze curp->man->meta.sodest =
3836b86842eSschwarze mandoc_strdup(ln.buf + of);
38441b72316Sschwarze goto out;
385310147f5Sschwarze }
386723ae0efSschwarze if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
387f7d9ce8dSschwarze mparse_readfd(curp, fd, ln.buf + of);
3887a6e7816Sschwarze close(fd);
3891be61736Sschwarze } else {
39018f7d3b8Sschwarze mandoc_msg(MANDOCERR_SO_FAIL,
39118f7d3b8Sschwarze curp->line, of, ".so %s: %s",
39218f7d3b8Sschwarze ln.buf + of, strerror(errno));
393f7d9ce8dSschwarze ln.sz = mandoc_asprintf(&cp,
394f7d9ce8dSschwarze ".sp\nSee the file %s.\n.sp",
395f7d9ce8dSschwarze ln.buf + of);
396f7d9ce8dSschwarze free(ln.buf);
397f7d9ce8dSschwarze ln.buf = cp;
398f7d9ce8dSschwarze of = 0;
399f7d9ce8dSschwarze mparse_buf_r(curp, ln, of, 0);
40085527626Sschwarze }
401b7f92c5fSschwarze break;
402a35fc07aSschwarze default:
403b7f92c5fSschwarze abort();
404a35fc07aSschwarze }
405a35fc07aSschwarze
406a35fc07aSschwarze /* Start the next input line. */
407a35fc07aSschwarze
408b7f92c5fSschwarze if (loop != NULL &&
409b7f92c5fSschwarze (line_result & ROFF_LOOPMASK) == ROFF_IGN)
410b7f92c5fSschwarze loop = loop->next;
411b7f92c5fSschwarze
412b7f92c5fSschwarze if (loop != NULL) {
413b7f92c5fSschwarze if ((line_result & ROFF_APPEND) == 0)
414b7f92c5fSschwarze *ln.buf = '\0';
415b7f92c5fSschwarze if (ln.sz < loop->sz)
416b7f92c5fSschwarze resize_buf(&ln, loop->sz);
417b7f92c5fSschwarze (void)strlcat(ln.buf, loop->buf, ln.sz);
418b7f92c5fSschwarze of = 0;
419b7f92c5fSschwarze goto rerun;
420b7f92c5fSschwarze }
421b7f92c5fSschwarze
422b7f92c5fSschwarze pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0;
423a35fc07aSschwarze }
42441b72316Sschwarze out:
425b7f92c5fSschwarze if (inloop) {
426b7f92c5fSschwarze if (result != ROFF_USERRET)
427a5a5f808Sschwarze mandoc_msg(MANDOCERR_WHILE_OUTOF,
428b7f92c5fSschwarze curp->line, pos, NULL);
429b7f92c5fSschwarze curp->loop = NULL;
430b7f92c5fSschwarze }
431a35fc07aSschwarze free(ln.buf);
43241b72316Sschwarze if (firstln != curp->secondary)
43341b72316Sschwarze free_buf_list(firstln);
43441b72316Sschwarze return result;
435a35fc07aSschwarze }
436a35fc07aSschwarze
437a35fc07aSschwarze static int
read_whole_file(struct mparse * curp,int fd,struct buf * fb,int * with_mmap)438e501e731Sschwarze read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap)
439a35fc07aSschwarze {
440a35fc07aSschwarze struct stat st;
441d74fe132Sschwarze gzFile gz;
442a35fc07aSschwarze size_t off;
443a35fc07aSschwarze ssize_t ssz;
4441485d9f7Sschwarze int gzerrnum, retval;
445a35fc07aSschwarze
446e8cab092Sschwarze if (fstat(fd, &st) == -1) {
447ecd1ed85Sschwarze mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno));
448ecd1ed85Sschwarze return -1;
449e8cab092Sschwarze }
450a35fc07aSschwarze
451a35fc07aSschwarze /*
452a35fc07aSschwarze * If we're a regular file, try just reading in the whole entry
453a35fc07aSschwarze * via mmap(). This is faster than reading it into blocks, and
454a35fc07aSschwarze * since each file is only a few bytes to begin with, I'm not
455a35fc07aSschwarze * concerned that this is going to tank any machines.
456a35fc07aSschwarze */
457a35fc07aSschwarze
458d74fe132Sschwarze if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
4593fcf3a03Sschwarze if (st.st_size > 0x7fffffff) {
460a5a5f808Sschwarze mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
461ecd1ed85Sschwarze return -1;
462a35fc07aSschwarze }
463a35fc07aSschwarze *with_mmap = 1;
464a35fc07aSschwarze fb->sz = (size_t)st.st_size;
465b6ac0686Sschwarze fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
466a35fc07aSschwarze if (fb->buf != MAP_FAILED)
467ecd1ed85Sschwarze return 0;
468a35fc07aSschwarze }
469a35fc07aSschwarze
470d74fe132Sschwarze if (curp->gzip) {
4711485d9f7Sschwarze /*
4721485d9f7Sschwarze * Duplicating the file descriptor is required
4731485d9f7Sschwarze * because we will have to call gzclose(3)
4741485d9f7Sschwarze * to free memory used internally by zlib,
4751485d9f7Sschwarze * but that will also close the file descriptor,
4761485d9f7Sschwarze * which this function must not do.
4771485d9f7Sschwarze */
4781485d9f7Sschwarze if ((fd = dup(fd)) == -1) {
479ecd1ed85Sschwarze mandoc_msg(MANDOCERR_DUP, 0, 0,
480ecd1ed85Sschwarze "%s", strerror(errno));
481ecd1ed85Sschwarze return -1;
4821485d9f7Sschwarze }
483e8cab092Sschwarze if ((gz = gzdopen(fd, "rb")) == NULL) {
484ecd1ed85Sschwarze mandoc_msg(MANDOCERR_GZDOPEN, 0, 0,
485ecd1ed85Sschwarze "%s", strerror(errno));
4861485d9f7Sschwarze close(fd);
487ecd1ed85Sschwarze return -1;
488e8cab092Sschwarze }
489d74fe132Sschwarze } else
490d74fe132Sschwarze gz = NULL;
491d74fe132Sschwarze
492a35fc07aSschwarze /*
493a35fc07aSschwarze * If this isn't a regular file (like, say, stdin), then we must
494a35fc07aSschwarze * go the old way and just read things in bit by bit.
495a35fc07aSschwarze */
496a35fc07aSschwarze
497a35fc07aSschwarze *with_mmap = 0;
498a35fc07aSschwarze off = 0;
499ecd1ed85Sschwarze retval = -1;
500a35fc07aSschwarze fb->sz = 0;
501a35fc07aSschwarze fb->buf = NULL;
502a35fc07aSschwarze for (;;) {
503a35fc07aSschwarze if (off == fb->sz) {
504a35fc07aSschwarze if (fb->sz == (1U << 31)) {
505a5a5f808Sschwarze mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
506a35fc07aSschwarze break;
507a35fc07aSschwarze }
508a35fc07aSschwarze resize_buf(fb, 65536);
509a35fc07aSschwarze }
510d74fe132Sschwarze ssz = curp->gzip ?
511d74fe132Sschwarze gzread(gz, fb->buf + (int)off, fb->sz - off) :
512d74fe132Sschwarze read(fd, fb->buf + (int)off, fb->sz - off);
513a35fc07aSschwarze if (ssz == 0) {
514a35fc07aSschwarze fb->sz = off;
515ecd1ed85Sschwarze retval = 0;
5161485d9f7Sschwarze break;
517a35fc07aSschwarze }
518e8cab092Sschwarze if (ssz == -1) {
5191485d9f7Sschwarze if (curp->gzip)
5201485d9f7Sschwarze (void)gzerror(gz, &gzerrnum);
521ecd1ed85Sschwarze mandoc_msg(MANDOCERR_READ, 0, 0, "%s",
5221485d9f7Sschwarze curp->gzip && gzerrnum != Z_ERRNO ?
5231485d9f7Sschwarze zError(gzerrnum) : strerror(errno));
524e8cab092Sschwarze break;
525e8cab092Sschwarze }
526a35fc07aSschwarze off += (size_t)ssz;
527a35fc07aSschwarze }
528a35fc07aSschwarze
5291485d9f7Sschwarze if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK)
530ecd1ed85Sschwarze mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s",
5311485d9f7Sschwarze gzerrnum == Z_ERRNO ? strerror(errno) :
5321485d9f7Sschwarze zError(gzerrnum));
533ecd1ed85Sschwarze if (retval == -1) {
534a35fc07aSschwarze free(fb->buf);
535a35fc07aSschwarze fb->buf = NULL;
5361485d9f7Sschwarze }
5371485d9f7Sschwarze return retval;
538a35fc07aSschwarze }
539a35fc07aSschwarze
540a35fc07aSschwarze static void
mparse_end(struct mparse * curp)541a35fc07aSschwarze mparse_end(struct mparse *curp)
542a35fc07aSschwarze {
5436b86842eSschwarze if (curp->man->meta.macroset == MACROSET_NONE)
5446b86842eSschwarze curp->man->meta.macroset = MACROSET_MAN;
5456b86842eSschwarze if (curp->man->meta.macroset == MACROSET_MDOC)
546f2d5c709Sschwarze mdoc_endparse(curp->man);
547f2d5c709Sschwarze else
548df927bb6Sschwarze man_endparse(curp->man);
549a35fc07aSschwarze roff_endparse(curp->roff);
550a35fc07aSschwarze }
551a35fc07aSschwarze
552b8bc906aSschwarze /*
553b8bc906aSschwarze * Read the whole file into memory and call the parsers.
554b8bc906aSschwarze * Called recursively when an .so request is encountered.
555b8bc906aSschwarze */
556b8bc906aSschwarze void
mparse_readfd(struct mparse * curp,int fd,const char * filename)557b8bc906aSschwarze mparse_readfd(struct mparse *curp, int fd, const char *filename)
558a35fc07aSschwarze {
5592943e6e6Sschwarze static int recursion_depth;
5602943e6e6Sschwarze
561b8bc906aSschwarze struct buf blk;
562b8bc906aSschwarze struct buf *save_primary;
563ee646987Sschwarze const char *save_filename, *cp;
564b8bc906aSschwarze size_t offset;
565b8bc906aSschwarze int save_filenc, save_lineno;
566b8bc906aSschwarze int with_mmap;
567b8bc906aSschwarze
568b8bc906aSschwarze if (recursion_depth > 64) {
569a5a5f808Sschwarze mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL);
570b6ac0686Sschwarze return;
571ee646987Sschwarze } else if (recursion_depth == 0 &&
572ee646987Sschwarze (cp = strrchr(filename, '.')) != NULL &&
573ee646987Sschwarze cp[1] >= '1' && cp[1] <= '9')
574ee646987Sschwarze curp->man->filesec = cp[1];
575ee646987Sschwarze else
576ee646987Sschwarze curp->man->filesec = '\0';
577ee646987Sschwarze
578ecd1ed85Sschwarze if (read_whole_file(curp, fd, &blk, &with_mmap) == -1)
579b8bc906aSschwarze return;
580a35fc07aSschwarze
581b8bc906aSschwarze /*
582b8bc906aSschwarze * Save some properties of the parent file.
583b8bc906aSschwarze */
584b8bc906aSschwarze
585b8bc906aSschwarze save_primary = curp->primary;
586b8bc906aSschwarze save_filenc = curp->filenc;
587b8bc906aSschwarze save_lineno = curp->line;
588b8bc906aSschwarze save_filename = mandoc_msg_getinfilename();
589b8bc906aSschwarze
590f0d7487dSschwarze curp->primary = &blk;
591b8bc906aSschwarze curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1);
592b6ac0686Sschwarze curp->line = 1;
593b8bc906aSschwarze mandoc_msg_setinfilename(filename);
594b6ac0686Sschwarze
5957232fc26Sschwarze /* Skip an UTF-8 byte order mark. */
5967232fc26Sschwarze if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
5977232fc26Sschwarze (unsigned char)blk.buf[0] == 0xef &&
5987232fc26Sschwarze (unsigned char)blk.buf[1] == 0xbb &&
5997232fc26Sschwarze (unsigned char)blk.buf[2] == 0xbf) {
600cdea9283Sschwarze offset = 3;
6017232fc26Sschwarze curp->filenc &= ~MPARSE_LATIN1;
602cdea9283Sschwarze } else
603cdea9283Sschwarze offset = 0;
6047232fc26Sschwarze
605b8bc906aSschwarze recursion_depth++;
606cdea9283Sschwarze mparse_buf_r(curp, blk, offset, 1);
607df927bb6Sschwarze if (--recursion_depth == 0)
608b6ac0686Sschwarze mparse_end(curp);
609b6ac0686Sschwarze
61065be529eSschwarze /*
611b8bc906aSschwarze * Clean up and restore saved parent properties.
61265be529eSschwarze */
613a35fc07aSschwarze
614b6ac0686Sschwarze if (with_mmap)
615b6ac0686Sschwarze munmap(blk.buf, blk.sz);
616b6ac0686Sschwarze else
617b6ac0686Sschwarze free(blk.buf);
618b8bc906aSschwarze
619b8bc906aSschwarze curp->primary = save_primary;
620b8bc906aSschwarze curp->filenc = save_filenc;
621b8bc906aSschwarze curp->line = save_lineno;
622b8bc906aSschwarze if (save_filename != NULL)
623b8bc906aSschwarze mandoc_msg_setinfilename(save_filename);
624a35fc07aSschwarze }
625a35fc07aSschwarze
626723ae0efSschwarze int
mparse_open(struct mparse * curp,const char * file)627723ae0efSschwarze mparse_open(struct mparse *curp, const char *file)
628d395d87cSschwarze {
629d395d87cSschwarze char *cp;
6305c29163cSschwarze int fd, save_errno;
631d395d87cSschwarze
632d74fe132Sschwarze cp = strrchr(file, '.');
633d74fe132Sschwarze curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
63465be529eSschwarze
635d74fe132Sschwarze /* First try to use the filename as it is. */
63665be529eSschwarze
637723ae0efSschwarze if ((fd = open(file, O_RDONLY)) != -1)
638723ae0efSschwarze return fd;
63965be529eSschwarze
640d74fe132Sschwarze /*
641d74fe132Sschwarze * If that doesn't work and the filename doesn't
642d74fe132Sschwarze * already end in .gz, try appending .gz.
643d74fe132Sschwarze */
64465be529eSschwarze
645d74fe132Sschwarze if ( ! curp->gzip) {
6465c29163cSschwarze save_errno = errno;
64765be529eSschwarze mandoc_asprintf(&cp, "%s.gz", file);
6484552f795Sschwarze fd = open(cp, O_RDONLY);
649fc1cb77bSschwarze free(cp);
6505c29163cSschwarze errno = save_errno;
651723ae0efSschwarze if (fd != -1) {
652d74fe132Sschwarze curp->gzip = 1;
653723ae0efSschwarze return fd;
654d74fe132Sschwarze }
655d74fe132Sschwarze }
656d74fe132Sschwarze
657d74fe132Sschwarze /* Neither worked, give up. */
658d74fe132Sschwarze
659723ae0efSschwarze return -1;
660d395d87cSschwarze }
66165be529eSschwarze
662a35fc07aSschwarze struct mparse *
mparse_alloc(int options,enum mandoc_os os_e,const char * os_s)663e501e731Sschwarze mparse_alloc(int options, enum mandoc_os os_e, const char *os_s)
664a35fc07aSschwarze {
665a35fc07aSschwarze struct mparse *curp;
666a35fc07aSschwarze
667a35fc07aSschwarze curp = mandoc_calloc(1, sizeof(struct mparse));
668a35fc07aSschwarze
669fee846f0Sschwarze curp->options = options;
670f3476b07Sschwarze curp->os_s = os_s;
671a35fc07aSschwarze
67291305757Sschwarze curp->roff = roff_alloc(options);
67391305757Sschwarze curp->man = roff_man_alloc(curp->roff, curp->os_s,
674405987fcSschwarze curp->options & MPARSE_QUICK ? 1 : 0);
675d0370668Sschwarze if (curp->options & MPARSE_MDOC) {
6766b86842eSschwarze curp->man->meta.macroset = MACROSET_MDOC;
6776050a3daSschwarze if (curp->man->mdocmac == NULL)
6786050a3daSschwarze curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
679405987fcSschwarze } else if (curp->options & MPARSE_MAN) {
6806b86842eSschwarze curp->man->meta.macroset = MACROSET_MAN;
6816050a3daSschwarze if (curp->man->manmac == NULL)
6826050a3daSschwarze curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
683d0370668Sschwarze }
6846b86842eSschwarze curp->man->meta.first->tok = TOKEN_NONE;
685f3476b07Sschwarze curp->man->meta.os_e = os_e;
6860ac7e6ecSschwarze tag_alloc();
687526e306bSschwarze return curp;
688a35fc07aSschwarze }
689a35fc07aSschwarze
690a35fc07aSschwarze void
mparse_reset(struct mparse * curp)691a35fc07aSschwarze mparse_reset(struct mparse *curp)
692a35fc07aSschwarze {
6934c8dba62Sschwarze tag_free();
694a35fc07aSschwarze roff_reset(curp->roff);
695405987fcSschwarze roff_man_reset(curp->man);
69641b72316Sschwarze free_buf_list(curp->secondary);
69741b72316Sschwarze curp->secondary = NULL;
698610cce7fSschwarze curp->gzip = 0;
6994c8dba62Sschwarze tag_alloc();
700a35fc07aSschwarze }
701a35fc07aSschwarze
702a35fc07aSschwarze void
mparse_free(struct mparse * curp)703a35fc07aSschwarze mparse_free(struct mparse *curp)
704a35fc07aSschwarze {
7050ac7e6ecSschwarze tag_free();
7066050a3daSschwarze roffhash_free(curp->man->mdocmac);
7076050a3daSschwarze roffhash_free(curp->man->manmac);
708405987fcSschwarze roff_man_free(curp->man);
709a35fc07aSschwarze roff_free(curp->roff);
71041b72316Sschwarze free_buf_list(curp->secondary);
711a35fc07aSschwarze free(curp);
712a35fc07aSschwarze }
713a35fc07aSschwarze
7146b86842eSschwarze struct roff_meta *
mparse_result(struct mparse * curp)7156b86842eSschwarze mparse_result(struct mparse *curp)
716a35fc07aSschwarze {
71783d65a5aSschwarze roff_state_reset(curp->man);
7186b86842eSschwarze if (curp->options & MPARSE_VALIDATE) {
7196b86842eSschwarze if (curp->man->meta.macroset == MACROSET_MDOC)
7206b86842eSschwarze mdoc_validate(curp->man);
7216b86842eSschwarze else
7226b86842eSschwarze man_validate(curp->man);
7236e2a0df9Sschwarze tag_postprocess(curp->man, curp->man->meta.first);
724310147f5Sschwarze }
7256b86842eSschwarze return &curp->man->meta;
726a35fc07aSschwarze }
727a35fc07aSschwarze
728a35fc07aSschwarze void
mparse_copy(const struct mparse * p)72941b72316Sschwarze mparse_copy(const struct mparse *p)
730ca0ce676Sschwarze {
73141b72316Sschwarze struct buf *buf;
732ca0ce676Sschwarze
73341b72316Sschwarze for (buf = p->secondary; buf != NULL; buf = buf->next)
73441b72316Sschwarze puts(buf->buf);
735ca0ce676Sschwarze }
736