xref: /openbsd-src/usr.bin/mandoc/mandoc.c (revision ae3cb403620ab940fbaabb3055fac045a63d56b7)
1 /*	$OpenBSD: mandoc.c,v 1.71 2017/07/03 13:40:00 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011-2015, 2017 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include <sys/types.h>
19 
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <time.h>
28 
29 #include "mandoc_aux.h"
30 #include "mandoc.h"
31 #include "roff.h"
32 #include "libmandoc.h"
33 
34 static	int	 a2time(time_t *, const char *, const char *);
35 static	char	*time2a(time_t);
36 
37 
38 enum mandoc_esc
39 mandoc_escape(const char **end, const char **start, int *sz)
40 {
41 	const char	*local_start;
42 	int		 local_sz;
43 	char		 term;
44 	enum mandoc_esc	 gly;
45 
46 	/*
47 	 * When the caller doesn't provide return storage,
48 	 * use local storage.
49 	 */
50 
51 	if (NULL == start)
52 		start = &local_start;
53 	if (NULL == sz)
54 		sz = &local_sz;
55 
56 	/*
57 	 * Beyond the backslash, at least one input character
58 	 * is part of the escape sequence.  With one exception
59 	 * (see below), that character won't be returned.
60 	 */
61 
62 	gly = ESCAPE_ERROR;
63 	*start = ++*end;
64 	*sz = 0;
65 	term = '\0';
66 
67 	switch ((*start)[-1]) {
68 	/*
69 	 * First the glyphs.  There are several different forms of
70 	 * these, but each eventually returns a substring of the glyph
71 	 * name.
72 	 */
73 	case '(':
74 		gly = ESCAPE_SPECIAL;
75 		*sz = 2;
76 		break;
77 	case '[':
78 		gly = ESCAPE_SPECIAL;
79 		term = ']';
80 		break;
81 	case 'C':
82 		if ('\'' != **start)
83 			return ESCAPE_ERROR;
84 		*start = ++*end;
85 		gly = ESCAPE_SPECIAL;
86 		term = '\'';
87 		break;
88 
89 	/*
90 	 * Escapes taking no arguments at all.
91 	 */
92 	case 'd':
93 	case 'u':
94 	case ',':
95 	case '/':
96 		return ESCAPE_IGNORE;
97 	case 'p':
98 		return ESCAPE_BREAK;
99 
100 	/*
101 	 * The \z escape is supposed to output the following
102 	 * character without advancing the cursor position.
103 	 * Since we are mostly dealing with terminal mode,
104 	 * let us just skip the next character.
105 	 */
106 	case 'z':
107 		return ESCAPE_SKIPCHAR;
108 
109 	/*
110 	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
111 	 * 'X' is the trigger.  These have opaque sub-strings.
112 	 */
113 	case 'F':
114 	case 'g':
115 	case 'k':
116 	case 'M':
117 	case 'm':
118 	case 'n':
119 	case 'V':
120 	case 'Y':
121 		gly = ESCAPE_IGNORE;
122 		/* FALLTHROUGH */
123 	case 'f':
124 		if (ESCAPE_ERROR == gly)
125 			gly = ESCAPE_FONT;
126 		switch (**start) {
127 		case '(':
128 			*start = ++*end;
129 			*sz = 2;
130 			break;
131 		case '[':
132 			*start = ++*end;
133 			term = ']';
134 			break;
135 		default:
136 			*sz = 1;
137 			break;
138 		}
139 		break;
140 
141 	/*
142 	 * These escapes are of the form \X'Y', where 'X' is the trigger
143 	 * and 'Y' is any string.  These have opaque sub-strings.
144 	 * The \B and \w escapes are handled in roff.c, roff_res().
145 	 */
146 	case 'A':
147 	case 'b':
148 	case 'D':
149 	case 'R':
150 	case 'X':
151 	case 'Z':
152 		gly = ESCAPE_IGNORE;
153 		/* FALLTHROUGH */
154 	case 'o':
155 		if (**start == '\0')
156 			return ESCAPE_ERROR;
157 		if (gly == ESCAPE_ERROR)
158 			gly = ESCAPE_OVERSTRIKE;
159 		term = **start;
160 		*start = ++*end;
161 		break;
162 
163 	/*
164 	 * These escapes are of the form \X'N', where 'X' is the trigger
165 	 * and 'N' resolves to a numerical expression.
166 	 */
167 	case 'h':
168 	case 'H':
169 	case 'L':
170 	case 'l':
171 	case 'S':
172 	case 'v':
173 	case 'x':
174 		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
175 			if ('\0' != **start)
176 				++*end;
177 			return ESCAPE_ERROR;
178 		}
179 		switch ((*start)[-1]) {
180 		case 'h':
181 			gly = ESCAPE_HORIZ;
182 			break;
183 		case 'l':
184 			gly = ESCAPE_HLINE;
185 			break;
186 		default:
187 			gly = ESCAPE_IGNORE;
188 			break;
189 		}
190 		term = **start;
191 		*start = ++*end;
192 		break;
193 
194 	/*
195 	 * Special handling for the numbered character escape.
196 	 * XXX Do any other escapes need similar handling?
197 	 */
198 	case 'N':
199 		if ('\0' == **start)
200 			return ESCAPE_ERROR;
201 		(*end)++;
202 		if (isdigit((unsigned char)**start)) {
203 			*sz = 1;
204 			return ESCAPE_IGNORE;
205 		}
206 		(*start)++;
207 		while (isdigit((unsigned char)**end))
208 			(*end)++;
209 		*sz = *end - *start;
210 		if ('\0' != **end)
211 			(*end)++;
212 		return ESCAPE_NUMBERED;
213 
214 	/*
215 	 * Sizes get a special category of their own.
216 	 */
217 	case 's':
218 		gly = ESCAPE_IGNORE;
219 
220 		/* See +/- counts as a sign. */
221 		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
222 			*start = ++*end;
223 
224 		switch (**end) {
225 		case '(':
226 			*start = ++*end;
227 			*sz = 2;
228 			break;
229 		case '[':
230 			*start = ++*end;
231 			term = ']';
232 			break;
233 		case '\'':
234 			*start = ++*end;
235 			term = '\'';
236 			break;
237 		case '3':
238 		case '2':
239 		case '1':
240 			*sz = (*end)[-1] == 's' &&
241 			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
242 			break;
243 		default:
244 			*sz = 1;
245 			break;
246 		}
247 
248 		break;
249 
250 	/*
251 	 * Anything else is assumed to be a glyph.
252 	 * In this case, pass back the character after the backslash.
253 	 */
254 	default:
255 		gly = ESCAPE_SPECIAL;
256 		*start = --*end;
257 		*sz = 1;
258 		break;
259 	}
260 
261 	assert(ESCAPE_ERROR != gly);
262 
263 	/*
264 	 * Read up to the terminating character,
265 	 * paying attention to nested escapes.
266 	 */
267 
268 	if ('\0' != term) {
269 		while (**end != term) {
270 			switch (**end) {
271 			case '\0':
272 				return ESCAPE_ERROR;
273 			case '\\':
274 				(*end)++;
275 				if (ESCAPE_ERROR ==
276 				    mandoc_escape(end, NULL, NULL))
277 					return ESCAPE_ERROR;
278 				break;
279 			default:
280 				(*end)++;
281 				break;
282 			}
283 		}
284 		*sz = (*end)++ - *start;
285 	} else {
286 		assert(*sz > 0);
287 		if ((size_t)*sz > strlen(*start))
288 			return ESCAPE_ERROR;
289 		*end += *sz;
290 	}
291 
292 	/* Run post-processors. */
293 
294 	switch (gly) {
295 	case ESCAPE_FONT:
296 		if (2 == *sz) {
297 			if ('C' == **start) {
298 				/*
299 				 * Treat constant-width font modes
300 				 * just like regular font modes.
301 				 */
302 				(*start)++;
303 				(*sz)--;
304 			} else {
305 				if ('B' == (*start)[0] && 'I' == (*start)[1])
306 					gly = ESCAPE_FONTBI;
307 				break;
308 			}
309 		} else if (1 != *sz)
310 			break;
311 
312 		switch (**start) {
313 		case '3':
314 		case 'B':
315 			gly = ESCAPE_FONTBOLD;
316 			break;
317 		case '2':
318 		case 'I':
319 			gly = ESCAPE_FONTITALIC;
320 			break;
321 		case 'P':
322 			gly = ESCAPE_FONTPREV;
323 			break;
324 		case '1':
325 		case 'R':
326 			gly = ESCAPE_FONTROMAN;
327 			break;
328 		}
329 		break;
330 	case ESCAPE_SPECIAL:
331 		if (1 == *sz && 'c' == **start)
332 			gly = ESCAPE_NOSPACE;
333 		/*
334 		 * Unicode escapes are defined in groff as \[u0000]
335 		 * to \[u10FFFF], where the contained value must be
336 		 * a valid Unicode codepoint.  Here, however, only
337 		 * check the length and range.
338 		 */
339 		if (**start != 'u' || *sz < 5 || *sz > 7)
340 			break;
341 		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
342 			break;
343 		if (*sz == 6 && (*start)[1] == '0')
344 			break;
345 		if (*sz == 5 && (*start)[1] == 'D' &&
346 		    strchr("89ABCDEF", (*start)[2]) != NULL)
347 			break;
348 		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
349 		    + 1 == *sz)
350 			gly = ESCAPE_UNICODE;
351 		break;
352 	default:
353 		break;
354 	}
355 
356 	return gly;
357 }
358 
359 /*
360  * Parse a quoted or unquoted roff-style request or macro argument.
361  * Return a pointer to the parsed argument, which is either the original
362  * pointer or advanced by one byte in case the argument is quoted.
363  * NUL-terminate the argument in place.
364  * Collapse pairs of quotes inside quoted arguments.
365  * Advance the argument pointer to the next argument,
366  * or to the NUL byte terminating the argument line.
367  */
368 char *
369 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
370 {
371 	char	 *start, *cp;
372 	int	  quoted, pairs, white;
373 
374 	/* Quoting can only start with a new word. */
375 	start = *cpp;
376 	quoted = 0;
377 	if ('"' == *start) {
378 		quoted = 1;
379 		start++;
380 	}
381 
382 	pairs = 0;
383 	white = 0;
384 	for (cp = start; '\0' != *cp; cp++) {
385 
386 		/*
387 		 * Move the following text left
388 		 * after quoted quotes and after "\\" and "\t".
389 		 */
390 		if (pairs)
391 			cp[-pairs] = cp[0];
392 
393 		if ('\\' == cp[0]) {
394 			/*
395 			 * In copy mode, translate double to single
396 			 * backslashes and backslash-t to literal tabs.
397 			 */
398 			switch (cp[1]) {
399 			case 't':
400 				cp[0] = '\t';
401 				/* FALLTHROUGH */
402 			case '\\':
403 				pairs++;
404 				cp++;
405 				break;
406 			case ' ':
407 				/* Skip escaped blanks. */
408 				if (0 == quoted)
409 					cp++;
410 				break;
411 			default:
412 				break;
413 			}
414 		} else if (0 == quoted) {
415 			if (' ' == cp[0]) {
416 				/* Unescaped blanks end unquoted args. */
417 				white = 1;
418 				break;
419 			}
420 		} else if ('"' == cp[0]) {
421 			if ('"' == cp[1]) {
422 				/* Quoted quotes collapse. */
423 				pairs++;
424 				cp++;
425 			} else {
426 				/* Unquoted quotes end quoted args. */
427 				quoted = 2;
428 				break;
429 			}
430 		}
431 	}
432 
433 	/* Quoted argument without a closing quote. */
434 	if (1 == quoted)
435 		mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
436 
437 	/* NUL-terminate this argument and move to the next one. */
438 	if (pairs)
439 		cp[-pairs] = '\0';
440 	if ('\0' != *cp) {
441 		*cp++ = '\0';
442 		while (' ' == *cp)
443 			cp++;
444 	}
445 	*pos += (int)(cp - start) + (quoted ? 1 : 0);
446 	*cpp = cp;
447 
448 	if ('\0' == *cp && (white || ' ' == cp[-1]))
449 		mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
450 
451 	return start;
452 }
453 
/*
 * Convert the string p to a time_t according to the strptime(3)
 * format fmt.  The whole string has to match the format.
 * Store the result in *t and return 1 on success;
 * return 0 and leave *t alone if p does not match.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/* Fields not set by the format are left at zero. */
	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);

	/* No match at all, or trailing garbage after the date. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
470 
/*
 * Format the local calendar date of t as "Month D, YYYY",
 * for example "July 3, 2017".
 * Return a buffer obtained from mandoc_malloc() that the caller is
 * expected to free(3), or NULL if the time cannot be converted or
 * one of the components does not fit into the reserved space.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	tm = localtime(&t);
	if (tm == NULL)
		return NULL;

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */

	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0)
		goto fail;
	p += ssz;

	/*
	 * The output format is just "%d" here, not "%2d" or "%02d".
	 * That's also the reason why we can't just format the
	 * date as a whole with "%B %e, %Y" or "%B %d, %Y".
	 * Besides, the present approach is less prone to buffer
	 * overflows, in case anybody should ever introduce the bug
	 * of looking at LC_TIME.
	 */

	/*
	 * snprintf(3) returns the length the output would have had
	 * without truncation, so a result above 4 means the day did
	 * not fit; advancing p by it would leave the reserved space.
	 */

	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (strftime(p, 4 + 1, "%Y", tm) == 0)
		goto fail;
	return buf;

fail:
	free(buf);
	return NULL;
}
517 
518 char *
519 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
520 {
521 	char		*cp;
522 	time_t		 t;
523 
524 	/* No date specified: use today's date. */
525 
526 	if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
527 		mandoc_msg(MANDOCERR_DATE_MISSING, man->parse, ln, pos, NULL);
528 		return time2a(time(NULL));
529 	}
530 
531 	/* Valid mdoc(7) date format. */
532 
533 	if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
534 	    a2time(&t, "%b %d, %Y", in)) {
535 		cp = time2a(t);
536 		if (t > time(NULL) + 86400)
537 			mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse,
538 			    ln, pos, cp);
539 		return cp;
540 	}
541 
542 	/* In man(7), do not warn about the legacy format. */
543 
544 	if (a2time(&t, "%Y-%m-%d", in) == 0)
545 		mandoc_msg(MANDOCERR_DATE_BAD, man->parse, ln, pos, in);
546 	else if (t > time(NULL) + 86400)
547 		mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, ln, pos, in);
548 	else if (man->macroset == MACROSET_MDOC)
549 		mandoc_vmsg(MANDOCERR_DATE_LEGACY, man->parse,
550 		    ln, pos, "Dd %s", in);
551 
552 	/* Use any non-mdoc(7) date verbatim. */
553 
554 	return mandoc_strdup(in);
555 }
556 
/*
 * Check whether the sz bytes starting at p end a sentence.
 *
 * The buffer is scanned backwards.  Closing delimiters (double quote,
 * single quote, ']' and ')') seen before any end-of-sentence
 * punctuation mark the sentence as enclosed; '.', '!' and '?' count
 * as end-of-sentence punctuation.  The first byte of any other kind
 * decides the result.
 *
 * Return 1 if the text ends a sentence and 0 otherwise;
 * an empty buffer never ends a sentence.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;
	unsigned char	 c;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	/*
	 * Walk backwards using an index rather than a pointer, such
	 * that no pointer before the start of the buffer is formed.
	 */

	enclosed = found = 0;
	for (i = sz; i > 0; i--) {
		c = (unsigned char)p[i - 1];
		switch (c) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			/*
			 * Enclosed punctuation only counts when
			 * directly preceded by a letter or digit.
			 */
			return found && (!enclosed || isalnum(c));
		}
	}

	/* Nothing but punctuation and closing delimiters. */

	return found && !enclosed;
}
595 
/*
 * Convert the first sz bytes of p to an int, using strtol(3)
 * with the given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty,
 * or if it is not a number in its entirety.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX.
 * Negative numbers are returned unchanged, so a valid input of "-1"
 * cannot be told apart from the error return.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 copy[32];
	char		*stop;
	long		 val;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(copy))
		return -1;

	/* strtol(3) needs a NUL-terminated string. */
	memcpy(copy, p, sz);
	copy[sz] = '\0';

	errno = 0;
	val = strtol(copy, &stop, base);

	/* Empty input or trailing bytes that are not part of a number. */
	if (*copy == '\0' || *stop != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}
626