/* xref: /openbsd-src/usr.bin/mandoc/mandoc.c (revision ddce81d18ead1cead306cecb446175faf94d13e9) */
/*	$OpenBSD: mandoc.c,v 1.76 2018/10/25 01:21:30 schwarze Exp $ */
/*
 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
18 #include <sys/types.h>
19 
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <time.h>
28 
29 #include "mandoc_aux.h"
30 #include "mandoc.h"
31 #include "roff.h"
32 #include "libmandoc.h"
33 
/* Parse the date string p according to a strptime(3) format. */
static	int	 a2time(time_t *, const char *, const char *);
/* Format a time value as a newly allocated "Month day, year" string. */
static	char	*time2a(time_t);
36 
37 
38 enum mandoc_esc
39 mandoc_escape(const char **end, const char **start, int *sz)
40 {
41 	const char	*local_start;
42 	int		 local_sz, c, i;
43 	char		 term;
44 	enum mandoc_esc	 gly;
45 
46 	/*
47 	 * When the caller doesn't provide return storage,
48 	 * use local storage.
49 	 */
50 
51 	if (NULL == start)
52 		start = &local_start;
53 	if (NULL == sz)
54 		sz = &local_sz;
55 
56 	/*
57 	 * Beyond the backslash, at least one input character
58 	 * is part of the escape sequence.  With one exception
59 	 * (see below), that character won't be returned.
60 	 */
61 
62 	gly = ESCAPE_ERROR;
63 	*start = ++*end;
64 	*sz = 0;
65 	term = '\0';
66 
67 	switch ((*start)[-1]) {
68 	/*
69 	 * First the glyphs.  There are several different forms of
70 	 * these, but each eventually returns a substring of the glyph
71 	 * name.
72 	 */
73 	case '(':
74 		gly = ESCAPE_SPECIAL;
75 		*sz = 2;
76 		break;
77 	case '[':
78 		gly = ESCAPE_SPECIAL;
79 		term = ']';
80 		break;
81 	case 'C':
82 		if ('\'' != **start)
83 			return ESCAPE_ERROR;
84 		*start = ++*end;
85 		gly = ESCAPE_SPECIAL;
86 		term = '\'';
87 		break;
88 
89 	/*
90 	 * Escapes taking no arguments at all.
91 	 */
92 	case 'd':
93 	case 'u':
94 	case ',':
95 	case '/':
96 		return ESCAPE_IGNORE;
97 	case 'p':
98 		return ESCAPE_BREAK;
99 
100 	/*
101 	 * The \z escape is supposed to output the following
102 	 * character without advancing the cursor position.
103 	 * Since we are mostly dealing with terminal mode,
104 	 * let us just skip the next character.
105 	 */
106 	case 'z':
107 		return ESCAPE_SKIPCHAR;
108 
109 	/*
110 	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
111 	 * 'X' is the trigger.  These have opaque sub-strings.
112 	 */
113 	case 'F':
114 	case 'g':
115 	case 'k':
116 	case 'M':
117 	case 'm':
118 	case 'n':
119 	case 'V':
120 	case 'Y':
121 		gly = ESCAPE_IGNORE;
122 		/* FALLTHROUGH */
123 	case 'f':
124 		if (ESCAPE_ERROR == gly)
125 			gly = ESCAPE_FONT;
126 		switch (**start) {
127 		case '(':
128 			*start = ++*end;
129 			*sz = 2;
130 			break;
131 		case '[':
132 			*start = ++*end;
133 			term = ']';
134 			break;
135 		default:
136 			*sz = 1;
137 			break;
138 		}
139 		break;
140 	case '*':
141 		if (strncmp(*start, "(.T", 3) != 0)
142 			abort();
143 		gly = ESCAPE_DEVICE;
144 		*start = ++*end;
145 		*sz = 2;
146 		break;
147 
148 	/*
149 	 * These escapes are of the form \X'Y', where 'X' is the trigger
150 	 * and 'Y' is any string.  These have opaque sub-strings.
151 	 * The \B and \w escapes are handled in roff.c, roff_res().
152 	 */
153 	case 'A':
154 	case 'b':
155 	case 'D':
156 	case 'R':
157 	case 'X':
158 	case 'Z':
159 		gly = ESCAPE_IGNORE;
160 		/* FALLTHROUGH */
161 	case 'o':
162 		if (**start == '\0')
163 			return ESCAPE_ERROR;
164 		if (gly == ESCAPE_ERROR)
165 			gly = ESCAPE_OVERSTRIKE;
166 		term = **start;
167 		*start = ++*end;
168 		break;
169 
170 	/*
171 	 * These escapes are of the form \X'N', where 'X' is the trigger
172 	 * and 'N' resolves to a numerical expression.
173 	 */
174 	case 'h':
175 	case 'H':
176 	case 'L':
177 	case 'l':
178 	case 'S':
179 	case 'v':
180 	case 'x':
181 		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
182 			if ('\0' != **start)
183 				++*end;
184 			return ESCAPE_ERROR;
185 		}
186 		switch ((*start)[-1]) {
187 		case 'h':
188 			gly = ESCAPE_HORIZ;
189 			break;
190 		case 'l':
191 			gly = ESCAPE_HLINE;
192 			break;
193 		default:
194 			gly = ESCAPE_IGNORE;
195 			break;
196 		}
197 		term = **start;
198 		*start = ++*end;
199 		break;
200 
201 	/*
202 	 * Special handling for the numbered character escape.
203 	 * XXX Do any other escapes need similar handling?
204 	 */
205 	case 'N':
206 		if ('\0' == **start)
207 			return ESCAPE_ERROR;
208 		(*end)++;
209 		if (isdigit((unsigned char)**start)) {
210 			*sz = 1;
211 			return ESCAPE_IGNORE;
212 		}
213 		(*start)++;
214 		while (isdigit((unsigned char)**end))
215 			(*end)++;
216 		*sz = *end - *start;
217 		if ('\0' != **end)
218 			(*end)++;
219 		return ESCAPE_NUMBERED;
220 
221 	/*
222 	 * Sizes get a special category of their own.
223 	 */
224 	case 's':
225 		gly = ESCAPE_IGNORE;
226 
227 		/* See +/- counts as a sign. */
228 		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
229 			*start = ++*end;
230 
231 		switch (**end) {
232 		case '(':
233 			*start = ++*end;
234 			*sz = 2;
235 			break;
236 		case '[':
237 			*start = ++*end;
238 			term = ']';
239 			break;
240 		case '\'':
241 			*start = ++*end;
242 			term = '\'';
243 			break;
244 		case '3':
245 		case '2':
246 		case '1':
247 			*sz = (*end)[-1] == 's' &&
248 			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
249 			break;
250 		default:
251 			*sz = 1;
252 			break;
253 		}
254 
255 		break;
256 
257 	/*
258 	 * Anything else is assumed to be a glyph.
259 	 * In this case, pass back the character after the backslash.
260 	 */
261 	default:
262 		gly = ESCAPE_SPECIAL;
263 		*start = --*end;
264 		*sz = 1;
265 		break;
266 	}
267 
268 	assert(ESCAPE_ERROR != gly);
269 
270 	/*
271 	 * Read up to the terminating character,
272 	 * paying attention to nested escapes.
273 	 */
274 
275 	if ('\0' != term) {
276 		while (**end != term) {
277 			switch (**end) {
278 			case '\0':
279 				return ESCAPE_ERROR;
280 			case '\\':
281 				(*end)++;
282 				if (ESCAPE_ERROR ==
283 				    mandoc_escape(end, NULL, NULL))
284 					return ESCAPE_ERROR;
285 				break;
286 			default:
287 				(*end)++;
288 				break;
289 			}
290 		}
291 		*sz = (*end)++ - *start;
292 	} else {
293 		assert(*sz > 0);
294 		if ((size_t)*sz > strlen(*start))
295 			return ESCAPE_ERROR;
296 		*end += *sz;
297 	}
298 
299 	/* Run post-processors. */
300 
301 	switch (gly) {
302 	case ESCAPE_FONT:
303 		if (*sz == 2) {
304 			if (**start == 'C') {
305 				if ((*start)[1] == 'W' ||
306 				    (*start)[1] == 'R') {
307 					gly = ESCAPE_FONTCW;
308 					break;
309 				}
310 				/*
311 				 * Treat other constant-width font modes
312 				 * just like regular font modes.
313 				 */
314 				(*start)++;
315 				(*sz)--;
316 			} else {
317 				if ((*start)[0] == 'B' && (*start)[1] == 'I')
318 					gly = ESCAPE_FONTBI;
319 				break;
320 			}
321 		} else if (*sz != 1) {
322 			if (*sz == 0)
323 				gly = ESCAPE_FONTPREV;
324 			break;
325 		}
326 
327 		switch (**start) {
328 		case '3':
329 		case 'B':
330 			gly = ESCAPE_FONTBOLD;
331 			break;
332 		case '2':
333 		case 'I':
334 			gly = ESCAPE_FONTITALIC;
335 			break;
336 		case 'P':
337 			gly = ESCAPE_FONTPREV;
338 			break;
339 		case '1':
340 		case 'R':
341 			gly = ESCAPE_FONTROMAN;
342 			break;
343 		}
344 		break;
345 	case ESCAPE_SPECIAL:
346 		if (**start == 'c') {
347 			if (*sz == 1) {
348 				gly = ESCAPE_NOSPACE;
349 				break;
350 			}
351 			if (*sz < 6 || *sz > 7 ||
352 			    strncmp(*start, "char", 4) != 0 ||
353 			    (int)strspn(*start + 4, "0123456789") + 4 < *sz)
354 				break;
355 			c = 0;
356 			for (i = 4; i < *sz; i++)
357 				c = 10 * c + ((*start)[i] - '0');
358 			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
359 				break;
360 			*start += 4;
361 			*sz -= 4;
362 			gly = ESCAPE_NUMBERED;
363 			break;
364 		}
365 
366 		/*
367 		 * Unicode escapes are defined in groff as \[u0000]
368 		 * to \[u10FFFF], where the contained value must be
369 		 * a valid Unicode codepoint.  Here, however, only
370 		 * check the length and range.
371 		 */
372 		if (**start != 'u' || *sz < 5 || *sz > 7)
373 			break;
374 		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
375 			break;
376 		if (*sz == 6 && (*start)[1] == '0')
377 			break;
378 		if (*sz == 5 && (*start)[1] == 'D' &&
379 		    strchr("89ABCDEF", (*start)[2]) != NULL)
380 			break;
381 		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
382 		    + 1 == *sz)
383 			gly = ESCAPE_UNICODE;
384 		break;
385 	default:
386 		break;
387 	}
388 
389 	return gly;
390 }
391 
392 /*
393  * Parse a quoted or unquoted roff-style request or macro argument.
394  * Return a pointer to the parsed argument, which is either the original
395  * pointer or advanced by one byte in case the argument is quoted.
396  * NUL-terminate the argument in place.
397  * Collapse pairs of quotes inside quoted arguments.
398  * Advance the argument pointer to the next argument,
399  * or to the NUL byte terminating the argument line.
400  */
401 char *
402 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
403 {
404 	char	 *start, *cp;
405 	int	  quoted, pairs, white;
406 
407 	/* Quoting can only start with a new word. */
408 	start = *cpp;
409 	quoted = 0;
410 	if ('"' == *start) {
411 		quoted = 1;
412 		start++;
413 	}
414 
415 	pairs = 0;
416 	white = 0;
417 	for (cp = start; '\0' != *cp; cp++) {
418 
419 		/*
420 		 * Move the following text left
421 		 * after quoted quotes and after "\\" and "\t".
422 		 */
423 		if (pairs)
424 			cp[-pairs] = cp[0];
425 
426 		if ('\\' == cp[0]) {
427 			/*
428 			 * In copy mode, translate double to single
429 			 * backslashes and backslash-t to literal tabs.
430 			 */
431 			switch (cp[1]) {
432 			case 't':
433 				cp[0] = '\t';
434 				/* FALLTHROUGH */
435 			case '\\':
436 				pairs++;
437 				cp++;
438 				break;
439 			case ' ':
440 				/* Skip escaped blanks. */
441 				if (0 == quoted)
442 					cp++;
443 				break;
444 			default:
445 				break;
446 			}
447 		} else if (0 == quoted) {
448 			if (' ' == cp[0]) {
449 				/* Unescaped blanks end unquoted args. */
450 				white = 1;
451 				break;
452 			}
453 		} else if ('"' == cp[0]) {
454 			if ('"' == cp[1]) {
455 				/* Quoted quotes collapse. */
456 				pairs++;
457 				cp++;
458 			} else {
459 				/* Unquoted quotes end quoted args. */
460 				quoted = 2;
461 				break;
462 			}
463 		}
464 	}
465 
466 	/* Quoted argument without a closing quote. */
467 	if (1 == quoted)
468 		mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
469 
470 	/* NUL-terminate this argument and move to the next one. */
471 	if (pairs)
472 		cp[-pairs] = '\0';
473 	if ('\0' != *cp) {
474 		*cp++ = '\0';
475 		while (' ' == *cp)
476 			cp++;
477 	}
478 	*pos += (int)(cp - start) + (quoted ? 1 : 0);
479 	*cpp = cp;
480 
481 	if ('\0' == *cp && (white || ' ' == cp[-1]))
482 		mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
483 
484 	return start;
485 }
486 
/*
 * Parse the date string p according to the strptime(3) format fmt.
 * The whole string has to match the format; trailing characters
 * make the conversion fail.
 * On success, store the result of mktime(3) in *t and return 1.
 * On failure, return 0 and leave *t untouched.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 tm;
	const char	*rest;
	time_t		 res;

	/* Fields not set by the format default to zero. */
	memset(&tm, 0, sizeof(tm));

	rest = strptime(p, fmt, &tm);
	if (rest == NULL || *rest != '\0')
		return 0;

	/*
	 * mktime(3) returns -1 when the broken-down time cannot be
	 * represented; do not report that as a successfully parsed date.
	 */
	res = mktime(&tm);
	if (res == (time_t)-1)
		return 0;

	*t = res;
	return 1;
}
503 
/*
 * Format the time t, interpreted as local time, as "Month day, year",
 * for example "October 25, 2018".
 * Return a buffer obtained from mandoc_malloc() that the caller
 * has to free(3), or NULL if the time cannot be converted or
 * does not fit the expected field widths.
 */
static char *
time2a(time_t t)
{
	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	enum {
		MONTH_SZ = 10,
		DAY_SZ = 4,
		YEAR_SZ = 4
	};
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	tm = localtime(&t);
	if (tm == NULL)
		return NULL;

	/*
	 * NOTE(review): the result of mandoc_malloc() is used
	 * unchecked; presumably it never returns NULL - confirm.
	 */
	p = buf = mandoc_malloc(MONTH_SZ + DAY_SZ + YEAR_SZ + 1);

	if ((ssz = strftime(p, MONTH_SZ + 1, "%B ", tm)) == 0)
		goto fail;
	p += ssz;

	/*
	 * The output format is just "%d" here, not "%2d" or "%02d".
	 * That's also the reason why we can't just format the
	 * date as a whole with "%B %e, %Y" or "%B %d, %Y".
	 * Besides, the present approach is less prone to buffer
	 * overflows, in case anybody should ever introduce the bug
	 * of looking at LC_TIME.
	 */

	/*
	 * snprintf(3) returns the length the complete output would
	 * have had; treat truncation like an error rather than
	 * advancing p beyond what was actually written.
	 */
	isz = snprintf(p, DAY_SZ + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > DAY_SZ)
		goto fail;
	p += isz;

	if (strftime(p, YEAR_SZ + 1, "%Y", tm) == 0)
		goto fail;
	return buf;

fail:
	free(buf);
	return NULL;
}
550 
551 char *
552 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
553 {
554 	char		*cp;
555 	time_t		 t;
556 
557 	/* No date specified: use today's date. */
558 
559 	if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
560 		mandoc_msg(MANDOCERR_DATE_MISSING, man->parse, ln, pos, NULL);
561 		return time2a(time(NULL));
562 	}
563 
564 	/* Valid mdoc(7) date format. */
565 
566 	if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
567 	    a2time(&t, "%b %d, %Y", in)) {
568 		cp = time2a(t);
569 		if (t > time(NULL) + 86400)
570 			mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse,
571 			    ln, pos, cp);
572 		else if (*in != '$' && strcmp(in, cp) != 0)
573 			mandoc_msg(MANDOCERR_DATE_NORM, man->parse,
574 			    ln, pos, cp);
575 		return cp;
576 	}
577 
578 	/* In man(7), do not warn about the legacy format. */
579 
580 	if (a2time(&t, "%Y-%m-%d", in) == 0)
581 		mandoc_msg(MANDOCERR_DATE_BAD, man->parse, ln, pos, in);
582 	else if (t > time(NULL) + 86400)
583 		mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, ln, pos, in);
584 	else if (man->macroset == MACROSET_MDOC)
585 		mandoc_vmsg(MANDOCERR_DATE_LEGACY, man->parse,
586 		    ln, pos, "Dd %s", in);
587 
588 	/* Use any non-mdoc(7) date verbatim. */
589 
590 	return mandoc_strdup(in);
591 }
592 
/*
 * Decide whether the sz bytes starting at p end a sentence.
 *
 * The text is scanned backwards from its end.  Closing quotes,
 * brackets and parentheses as well as the punctuation characters
 * '.', '!' and '?' are skipped until the first other character
 * is found.
 *
 * Returns 1 if punctuation was found and either no closing
 * delimiter follows it or the character preceding the whole
 * trailing group is alphanumeric; returns 0 otherwise, in
 * particular for sz == 0.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t	 i;
	int	 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Count the index down instead of a pointer, such that
	 * no pointer before the start of the buffer is ever formed
	 * and sz is not narrowed to an int.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1]));
		}
	}

	/* Nothing but punctuation and closing delimiters. */
	return found && !enclosed;
}
631 
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 *
 * Return -1 if sz exceeds 31 bytes, if the string is empty,
 * or if it is not completely consumed by strtol(3).
 * Otherwise, return the value, clamped to the range from
 * INT_MIN to INT_MAX.
 *
 * Note that negative numbers are converted and returned as such,
 * so a result of -1 does not by itself prove that the input was
 * invalid.  Leading white space and a sign are accepted because
 * strtol(3) accepts them.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 buf[32];
	char		*ep;
	long		 v;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(buf))
		return -1;

	/* strtol(3) needs a NUL-terminated string: work on a copy. */
	memcpy(buf, p, sz);
	buf[sz] = '\0';

	/*
	 * Out-of-range input makes strtol(3) return LONG_MIN or
	 * LONG_MAX, which the clamping below maps into the int range.
	 */
	errno = 0;
	v = strtol(buf, &ep, base);

	if (buf[0] == '\0' || *ep != '\0')
		return -1;

	if (v > INT_MAX)
		v = INT_MAX;
	if (v < INT_MIN)
		v = INT_MIN;

	return (int)v;
}
662