1544c191cSchristos /* Id: mdoc.c,v 1.274 2018/12/31 07:46:07 schwarze Exp */
24154958bSjoerg /*
348741257Sjoerg * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4544c191cSchristos * Copyright (c) 2010, 2012-2018 Ingo Schwarze <schwarze@openbsd.org>
54154958bSjoerg *
64154958bSjoerg * Permission to use, copy, modify, and distribute this software for any
74154958bSjoerg * purpose with or without fee is hereby granted, provided that the above
84154958bSjoerg * copyright notice and this permission notice appear in all copies.
94154958bSjoerg *
109ff1f2acSchristos * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
114154958bSjoerg * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
129ff1f2acSchristos * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
134154958bSjoerg * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
144154958bSjoerg * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
154154958bSjoerg * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
164154958bSjoerg * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
174154958bSjoerg */
18d5e63c8dSjoerg #include "config.h"
19d5e63c8dSjoerg
204154958bSjoerg #include <sys/types.h>
214154958bSjoerg
224154958bSjoerg #include <assert.h>
23fec65c98Schristos #include <ctype.h>
244154958bSjoerg #include <stdarg.h>
254154958bSjoerg #include <stdio.h>
264154958bSjoerg #include <stdlib.h>
274154958bSjoerg #include <string.h>
287bcc2a5fSjoerg #include <time.h>
294154958bSjoerg
30fec65c98Schristos #include "mandoc_aux.h"
319ff1f2acSchristos #include "mandoc.h"
329ff1f2acSchristos #include "roff.h"
339ff1f2acSchristos #include "mdoc.h"
343514411fSjoerg #include "libmandoc.h"
35*61075eb3Swiz #include "roff_int.h"
369ff1f2acSchristos #include "libmdoc.h"
374154958bSjoerg
384154958bSjoerg const char *const __mdoc_argnames[MDOC_ARG_MAX] = {
394154958bSjoerg "split", "nosplit", "ragged",
404154958bSjoerg "unfilled", "literal", "file",
414154958bSjoerg "offset", "bullet", "dash",
424154958bSjoerg "hyphen", "item", "enum",
434154958bSjoerg "tag", "diag", "hang",
444154958bSjoerg "ohang", "inset", "column",
454154958bSjoerg "width", "compact", "std",
464154958bSjoerg "filled", "words", "emphasis",
474154958bSjoerg "symbolic", "nested", "centered"
484154958bSjoerg };
494154958bSjoerg const char * const *mdoc_argnames = __mdoc_argnames;
504154958bSjoerg
519ff1f2acSchristos static int mdoc_ptext(struct roff_man *, int, char *, int);
529ff1f2acSchristos static int mdoc_pmacro(struct roff_man *, int, char *, int);
534154958bSjoerg
54fec65c98Schristos
554154958bSjoerg /*
564154958bSjoerg * Main parse routine. Parses a single line -- really just hands off to
570a84adc5Sjoerg * the macro (mdoc_pmacro()) or text parser (mdoc_ptext()).
584154958bSjoerg */
594154958bSjoerg int
mdoc_parseln(struct roff_man * mdoc,int ln,char * buf,int offs)609ff1f2acSchristos mdoc_parseln(struct roff_man *mdoc, int ln, char *buf, int offs)
614154958bSjoerg {
624154958bSjoerg
639ff1f2acSchristos if (mdoc->last->type != ROFFT_EQN || ln > mdoc->last->line)
6470f041f9Sjoerg mdoc->flags |= MDOC_NEWLINE;
6582361f10Sjoerg
6682361f10Sjoerg /*
6782361f10Sjoerg * Let the roff nS register switch SYNOPSIS mode early,
6882361f10Sjoerg * such that the parser knows at all times
6982361f10Sjoerg * whether this mode is on or off.
7082361f10Sjoerg * Note that this mode is also switched by the Sh macro.
7182361f10Sjoerg */
7270f041f9Sjoerg if (roff_getreg(mdoc->roff, "nS"))
7370f041f9Sjoerg mdoc->flags |= MDOC_SYNOPSIS;
7482361f10Sjoerg else
7570f041f9Sjoerg mdoc->flags &= ~MDOC_SYNOPSIS;
7682361f10Sjoerg
779ff1f2acSchristos return roff_getcontrol(mdoc->roff, buf, &offs) ?
7870f041f9Sjoerg mdoc_pmacro(mdoc, ln, buf, offs) :
799ff1f2acSchristos mdoc_ptext(mdoc, ln, buf, offs);
804154958bSjoerg }
814154958bSjoerg
82fec65c98Schristos void
mdoc_tail_alloc(struct roff_man * mdoc,int line,int pos,enum roff_tok tok)83c9bcef03Schristos mdoc_tail_alloc(struct roff_man *mdoc, int line, int pos, enum roff_tok tok)
844154958bSjoerg {
859ff1f2acSchristos struct roff_node *p;
864154958bSjoerg
879ff1f2acSchristos p = roff_node_alloc(mdoc, line, pos, ROFFT_TAIL, tok);
889ff1f2acSchristos roff_node_append(mdoc, p);
899ff1f2acSchristos mdoc->next = ROFF_NEXT_CHILD;
904154958bSjoerg }
914154958bSjoerg
929ff1f2acSchristos struct roff_node *
mdoc_endbody_alloc(struct roff_man * mdoc,int line,int pos,enum roff_tok tok,struct roff_node * body)93c9bcef03Schristos mdoc_endbody_alloc(struct roff_man *mdoc, int line, int pos,
94c9bcef03Schristos enum roff_tok tok, struct roff_node *body)
954154958bSjoerg {
969ff1f2acSchristos struct roff_node *p;
9782361f10Sjoerg
989508192eSchristos body->flags |= NODE_ENDED;
999508192eSchristos body->parent->flags |= NODE_ENDED;
1009ff1f2acSchristos p = roff_node_alloc(mdoc, line, pos, ROFFT_BODY, tok);
101fec65c98Schristos p->body = body;
10270f041f9Sjoerg p->norm = body->norm;
1039508192eSchristos p->end = ENDBODY_SPACE;
1049ff1f2acSchristos roff_node_append(mdoc, p);
1059ff1f2acSchristos mdoc->next = ROFF_NEXT_SIBLING;
1069ff1f2acSchristos return p;
10782361f10Sjoerg }
10882361f10Sjoerg
1099ff1f2acSchristos struct roff_node *
mdoc_block_alloc(struct roff_man * mdoc,int line,int pos,enum roff_tok tok,struct mdoc_arg * args)1109ff1f2acSchristos mdoc_block_alloc(struct roff_man *mdoc, int line, int pos,
111c9bcef03Schristos enum roff_tok tok, struct mdoc_arg *args)
1124154958bSjoerg {
1139ff1f2acSchristos struct roff_node *p;
1144154958bSjoerg
1159ff1f2acSchristos p = roff_node_alloc(mdoc, line, pos, ROFFT_BLOCK, tok);
1164154958bSjoerg p->args = args;
1174154958bSjoerg if (p->args)
1184154958bSjoerg (args->refcnt)++;
119c0d9444aSjoerg
120c0d9444aSjoerg switch (tok) {
121fec65c98Schristos case MDOC_Bd:
122fec65c98Schristos case MDOC_Bf:
123fec65c98Schristos case MDOC_Bl:
124fec65c98Schristos case MDOC_En:
125fec65c98Schristos case MDOC_Rs:
126c0d9444aSjoerg p->norm = mandoc_calloc(1, sizeof(union mdoc_data));
127c0d9444aSjoerg break;
128c0d9444aSjoerg default:
129c0d9444aSjoerg break;
130c0d9444aSjoerg }
1319ff1f2acSchristos roff_node_append(mdoc, p);
1329ff1f2acSchristos mdoc->next = ROFF_NEXT_CHILD;
1339ff1f2acSchristos return p;
1344154958bSjoerg }
1354154958bSjoerg
136fec65c98Schristos void
mdoc_elem_alloc(struct roff_man * mdoc,int line,int pos,enum roff_tok tok,struct mdoc_arg * args)1379ff1f2acSchristos mdoc_elem_alloc(struct roff_man *mdoc, int line, int pos,
138c9bcef03Schristos enum roff_tok tok, struct mdoc_arg *args)
1394154958bSjoerg {
1409ff1f2acSchristos struct roff_node *p;
1414154958bSjoerg
1429ff1f2acSchristos p = roff_node_alloc(mdoc, line, pos, ROFFT_ELEM, tok);
1434154958bSjoerg p->args = args;
1444154958bSjoerg if (p->args)
1454154958bSjoerg (args->refcnt)++;
146c0d9444aSjoerg
147c0d9444aSjoerg switch (tok) {
148fec65c98Schristos case MDOC_An:
149c0d9444aSjoerg p->norm = mandoc_calloc(1, sizeof(union mdoc_data));
150c0d9444aSjoerg break;
151c0d9444aSjoerg default:
152c0d9444aSjoerg break;
153c0d9444aSjoerg }
1549ff1f2acSchristos roff_node_append(mdoc, p);
1559ff1f2acSchristos mdoc->next = ROFF_NEXT_CHILD;
1564154958bSjoerg }
1574154958bSjoerg
1584154958bSjoerg /*
1594154958bSjoerg * Parse free-form text, that is, a line that does not begin with the
1604154958bSjoerg * control character.
1614154958bSjoerg */
1624154958bSjoerg static int
mdoc_ptext(struct roff_man * mdoc,int line,char * buf,int offs)1639ff1f2acSchristos mdoc_ptext(struct roff_man *mdoc, int line, char *buf, int offs)
1644154958bSjoerg {
1659ff1f2acSchristos struct roff_node *n;
166c9bcef03Schristos const char *cp, *sp;
1670a84adc5Sjoerg char *c, *ws, *end;
1680a84adc5Sjoerg
16970f041f9Sjoerg n = mdoc->last;
1707574e07eSjoerg
1717574e07eSjoerg /*
1729508192eSchristos * If a column list contains plain text, assume an implicit item
1739508192eSchristos * macro. This can happen one or more times at the beginning
1749508192eSchristos * of such a list, intermixed with non-It mdoc macros and with
1759508192eSchristos * nodes generated on the roff level, for example by tbl.
1767574e07eSjoerg */
1777574e07eSjoerg
1789508192eSchristos if ((n->tok == MDOC_Bl && n->type == ROFFT_BODY &&
1799508192eSchristos n->end == ENDBODY_NOT && n->norm->Bl.type == LIST_column) ||
1809508192eSchristos (n->parent != NULL && n->parent->tok == MDOC_Bl &&
1819508192eSchristos n->parent->norm->Bl.type == LIST_column)) {
18270f041f9Sjoerg mdoc->flags |= MDOC_FREECOL;
183544c191cSchristos (*mdoc_macro(MDOC_It)->fp)(mdoc, MDOC_It,
184544c191cSchristos line, offs, &offs, buf);
1859ff1f2acSchristos return 1;
1867574e07eSjoerg }
1877574e07eSjoerg
1884154958bSjoerg /*
1890a84adc5Sjoerg * Search for the beginning of unescaped trailing whitespace (ws)
1900a84adc5Sjoerg * and for the first character not to be output (end).
1914154958bSjoerg */
1924154958bSjoerg
1930a84adc5Sjoerg /* FIXME: replace with strcspn(). */
1940a84adc5Sjoerg ws = NULL;
1950a84adc5Sjoerg for (c = end = buf + offs; *c; c++) {
1960a84adc5Sjoerg switch (*c) {
1970a84adc5Sjoerg case ' ':
1980a84adc5Sjoerg if (NULL == ws)
1990a84adc5Sjoerg ws = c;
2000a84adc5Sjoerg continue;
2010a84adc5Sjoerg case '\t':
2020a84adc5Sjoerg /*
2030a84adc5Sjoerg * Always warn about trailing tabs,
2040a84adc5Sjoerg * even outside literal context,
2050a84adc5Sjoerg * where they should be put on the next line.
2060a84adc5Sjoerg */
2070a84adc5Sjoerg if (NULL == ws)
2080a84adc5Sjoerg ws = c;
2090a84adc5Sjoerg /*
2100a84adc5Sjoerg * Strip trailing tabs in literal context only;
2110a84adc5Sjoerg * outside, they affect the next line.
2120a84adc5Sjoerg */
213544c191cSchristos if (mdoc->flags & ROFF_NOFILL)
2144154958bSjoerg continue;
2150a84adc5Sjoerg break;
2160a84adc5Sjoerg case '\\':
2170a84adc5Sjoerg /* Skip the escaped character, too, if any. */
2180a84adc5Sjoerg if (c[1])
2190a84adc5Sjoerg c++;
2200a84adc5Sjoerg /* FALLTHROUGH */
2210a84adc5Sjoerg default:
2220a84adc5Sjoerg ws = NULL;
2234154958bSjoerg break;
2244154958bSjoerg }
2250a84adc5Sjoerg end = c + 1;
2260a84adc5Sjoerg }
2270a84adc5Sjoerg *end = '\0';
2284154958bSjoerg
2290a84adc5Sjoerg if (ws)
230544c191cSchristos mandoc_msg(MANDOCERR_SPACE_EOL, line, (int)(ws - buf), NULL);
2310a84adc5Sjoerg
232c9bcef03Schristos /*
233c9bcef03Schristos * Blank lines are allowed in no-fill mode
234c9bcef03Schristos * and cancel preceding \c,
235c9bcef03Schristos * but add a single vertical space elsewhere.
236c9bcef03Schristos */
237c9bcef03Schristos
238544c191cSchristos if (buf[offs] == '\0' && (mdoc->flags & ROFF_NOFILL) == 0) {
239c9bcef03Schristos switch (mdoc->last->type) {
240c9bcef03Schristos case ROFFT_TEXT:
241c9bcef03Schristos sp = mdoc->last->string;
242c9bcef03Schristos cp = end = strchr(sp, '\0') - 2;
243c9bcef03Schristos if (cp < sp || cp[0] != '\\' || cp[1] != 'c')
244c9bcef03Schristos break;
245c9bcef03Schristos while (cp > sp && cp[-1] == '\\')
246c9bcef03Schristos cp--;
247c9bcef03Schristos if ((end - cp) % 2)
248c9bcef03Schristos break;
249c9bcef03Schristos *end = '\0';
250c9bcef03Schristos return 1;
251c9bcef03Schristos default:
252c9bcef03Schristos break;
253c9bcef03Schristos }
254544c191cSchristos mandoc_msg(MANDOCERR_FI_BLANK, line, (int)(c - buf), NULL);
255c9bcef03Schristos roff_elem_alloc(mdoc, line, offs, ROFF_sp);
2569508192eSchristos mdoc->last->flags |= NODE_VALID | NODE_ENDED;
2579ff1f2acSchristos mdoc->next = ROFF_NEXT_SIBLING;
2589ff1f2acSchristos return 1;
2594154958bSjoerg }
2604154958bSjoerg
2619ff1f2acSchristos roff_word_alloc(mdoc, line, offs, buf+offs);
2620a84adc5Sjoerg
263544c191cSchristos if (mdoc->flags & ROFF_NOFILL)
2649ff1f2acSchristos return 1;
2650a84adc5Sjoerg
2660a84adc5Sjoerg /*
2670a84adc5Sjoerg * End-of-sentence check. If the last character is an unescaped
2680a84adc5Sjoerg * EOS character, then flag the node as being the end of a
2690a84adc5Sjoerg * sentence. The front-end will know how to interpret this.
2700a84adc5Sjoerg */
2710a84adc5Sjoerg
2720a84adc5Sjoerg assert(buf < end);
2730a84adc5Sjoerg
27470f041f9Sjoerg if (mandoc_eos(buf+offs, (size_t)(end-buf-offs)))
2759508192eSchristos mdoc->last->flags |= NODE_EOS;
2769508192eSchristos
2779508192eSchristos for (c = buf + offs; c != NULL; c = strchr(c + 1, '.')) {
2789508192eSchristos if (c - buf < offs + 2)
2799508192eSchristos continue;
280c9bcef03Schristos if (end - c < 3)
2819508192eSchristos break;
282c9bcef03Schristos if (c[1] != ' ' ||
283c9bcef03Schristos isalnum((unsigned char)c[-2]) == 0 ||
284c9bcef03Schristos isalnum((unsigned char)c[-1]) == 0 ||
285c9bcef03Schristos (c[-2] == 'n' && c[-1] == 'c') ||
286c9bcef03Schristos (c[-2] == 'v' && c[-1] == 's'))
287c9bcef03Schristos continue;
288c9bcef03Schristos c += 2;
289c9bcef03Schristos if (*c == ' ')
290c9bcef03Schristos c++;
291c9bcef03Schristos if (*c == ' ')
292c9bcef03Schristos c++;
293c9bcef03Schristos if (isupper((unsigned char)(*c)))
294544c191cSchristos mandoc_msg(MANDOCERR_EOS, line, (int)(c - buf), NULL);
2959508192eSchristos }
2969508192eSchristos
2979ff1f2acSchristos return 1;
2980a84adc5Sjoerg }
2994154958bSjoerg
3004154958bSjoerg /*
3014154958bSjoerg * Parse a macro line, that is, a line beginning with the control
3024154958bSjoerg * character.
3034154958bSjoerg */
30482361f10Sjoerg static int
mdoc_pmacro(struct roff_man * mdoc,int ln,char * buf,int offs)3059ff1f2acSchristos mdoc_pmacro(struct roff_man *mdoc, int ln, char *buf, int offs)
3064154958bSjoerg {
3079ff1f2acSchristos struct roff_node *n;
308fec65c98Schristos const char *cp;
309c9bcef03Schristos size_t sz;
310c9bcef03Schristos enum roff_tok tok;
311c9bcef03Schristos int sv;
312c9bcef03Schristos
313c9bcef03Schristos /* Determine the line macro. */
3144154958bSjoerg
31548741257Sjoerg sv = offs;
316c9bcef03Schristos tok = TOKEN_NONE;
317c9bcef03Schristos for (sz = 0; sz < 4 && strchr(" \t\\", buf[offs]) == NULL; sz++)
318c9bcef03Schristos offs++;
319c9bcef03Schristos if (sz == 2 || sz == 3)
320c9bcef03Schristos tok = roffhash_find(mdoc->mdocmac, buf + sv, sz);
3219ff1f2acSchristos if (tok == TOKEN_NONE) {
322544c191cSchristos mandoc_msg(MANDOCERR_MACRO, ln, sv, "%s", buf + sv - 1);
3239ff1f2acSchristos return 1;
3244154958bSjoerg }
3254154958bSjoerg
326fec65c98Schristos /* Skip a leading escape sequence or tab. */
3274154958bSjoerg
328fec65c98Schristos switch (buf[offs]) {
329fec65c98Schristos case '\\':
330fec65c98Schristos cp = buf + offs + 1;
331fec65c98Schristos mandoc_escape(&cp, NULL, NULL);
332fec65c98Schristos offs = cp - buf;
333fec65c98Schristos break;
334fec65c98Schristos case '\t':
33548741257Sjoerg offs++;
336fec65c98Schristos break;
337fec65c98Schristos default:
338fec65c98Schristos break;
339fec65c98Schristos }
340c0d9444aSjoerg
341c0d9444aSjoerg /* Jump to the next non-whitespace word. */
3424154958bSjoerg
343c9bcef03Schristos while (buf[offs] == ' ')
34448741257Sjoerg offs++;
3454154958bSjoerg
3460a84adc5Sjoerg /*
3470a84adc5Sjoerg * Trailing whitespace. Note that tabs are allowed to be passed
3480a84adc5Sjoerg * into the parser as "text", so we only warn about spaces here.
3490a84adc5Sjoerg */
350d5e63c8dSjoerg
35148741257Sjoerg if ('\0' == buf[offs] && ' ' == buf[offs - 1])
352544c191cSchristos mandoc_msg(MANDOCERR_SPACE_EOL, ln, offs - 1, NULL);
353d5e63c8dSjoerg
3544154958bSjoerg /*
3557574e07eSjoerg * If an initial macro or a list invocation, divert directly
3567574e07eSjoerg * into macro processing.
3574154958bSjoerg */
3587574e07eSjoerg
3599508192eSchristos n = mdoc->last;
3609508192eSchristos if (n == NULL || tok == MDOC_It || tok == MDOC_El) {
361544c191cSchristos (*mdoc_macro(tok)->fp)(mdoc, tok, ln, sv, &offs, buf);
3629ff1f2acSchristos return 1;
3637574e07eSjoerg }
3647574e07eSjoerg
3657574e07eSjoerg /*
3669508192eSchristos * If a column list contains a non-It macro, assume an implicit
3679508192eSchristos * item macro. This can happen one or more times at the
3689508192eSchristos * beginning of such a list, intermixed with text lines and
3699508192eSchristos * with nodes generated on the roff level, for example by tbl.
3707574e07eSjoerg */
3717574e07eSjoerg
3729508192eSchristos if ((n->tok == MDOC_Bl && n->type == ROFFT_BODY &&
3739508192eSchristos n->end == ENDBODY_NOT && n->norm->Bl.type == LIST_column) ||
3749508192eSchristos (n->parent != NULL && n->parent->tok == MDOC_Bl &&
3759508192eSchristos n->parent->norm->Bl.type == LIST_column)) {
37670f041f9Sjoerg mdoc->flags |= MDOC_FREECOL;
377544c191cSchristos (*mdoc_macro(MDOC_It)->fp)(mdoc, MDOC_It, ln, sv, &sv, buf);
3789ff1f2acSchristos return 1;
3797574e07eSjoerg }
3807574e07eSjoerg
3817574e07eSjoerg /* Normal processing of a macro. */
3827574e07eSjoerg
383544c191cSchristos (*mdoc_macro(tok)->fp)(mdoc, tok, ln, sv, &offs, buf);
384fec65c98Schristos
385fec65c98Schristos /* In quick mode (for mandocdb), abort after the NAME section. */
386fec65c98Schristos
387fec65c98Schristos if (mdoc->quick && MDOC_Sh == tok &&
388fec65c98Schristos SEC_NAME != mdoc->last->sec)
3899ff1f2acSchristos return 2;
3904154958bSjoerg
3919ff1f2acSchristos return 1;
3924154958bSjoerg }
3934154958bSjoerg
39448741257Sjoerg enum mdelim
mdoc_isdelim(const char * p)39548741257Sjoerg mdoc_isdelim(const char *p)
39648741257Sjoerg {
3974154958bSjoerg
39848741257Sjoerg if ('\0' == p[0])
3999ff1f2acSchristos return DELIM_NONE;
40048741257Sjoerg
40148741257Sjoerg if ('\0' == p[1])
40248741257Sjoerg switch (p[0]) {
403fec65c98Schristos case '(':
404fec65c98Schristos case '[':
4059ff1f2acSchristos return DELIM_OPEN;
406fec65c98Schristos case '|':
4079ff1f2acSchristos return DELIM_MIDDLE;
408fec65c98Schristos case '.':
409fec65c98Schristos case ',':
410fec65c98Schristos case ';':
411fec65c98Schristos case ':':
412fec65c98Schristos case '?':
413fec65c98Schristos case '!':
414fec65c98Schristos case ')':
415fec65c98Schristos case ']':
4169ff1f2acSchristos return DELIM_CLOSE;
41748741257Sjoerg default:
4189ff1f2acSchristos return DELIM_NONE;
41948741257Sjoerg }
42048741257Sjoerg
42148741257Sjoerg if ('\\' != p[0])
4229ff1f2acSchristos return DELIM_NONE;
42348741257Sjoerg
42448741257Sjoerg if (0 == strcmp(p + 1, "."))
4259ff1f2acSchristos return DELIM_CLOSE;
42670f041f9Sjoerg if (0 == strcmp(p + 1, "fR|\\fP"))
4279ff1f2acSchristos return DELIM_MIDDLE;
42848741257Sjoerg
4299ff1f2acSchristos return DELIM_NONE;
43048741257Sjoerg }
431