xref: /openbsd-src/usr.bin/mandoc/mandoc.c (revision 6f05df2d9be0954bec42d51d943d77bd250fb664)
1 /*	$OpenBSD: mandoc.c,v 1.56 2014/11/28 19:25:03 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include <sys/types.h>
19 
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <time.h>
28 
29 #include "mandoc.h"
30 #include "mandoc_aux.h"
31 #include "libmandoc.h"
32 
33 #define DATESIZE 32
34 
35 static	int	 a2time(time_t *, const char *, const char *);
36 static	char	*time2a(time_t);
37 
38 
39 enum mandoc_esc
40 mandoc_escape(const char **end, const char **start, int *sz)
41 {
42 	const char	*local_start;
43 	int		 local_sz;
44 	char		 term;
45 	enum mandoc_esc	 gly;
46 
47 	/*
48 	 * When the caller doesn't provide return storage,
49 	 * use local storage.
50 	 */
51 
52 	if (NULL == start)
53 		start = &local_start;
54 	if (NULL == sz)
55 		sz = &local_sz;
56 
57 	/*
58 	 * Beyond the backslash, at least one input character
59 	 * is part of the escape sequence.  With one exception
60 	 * (see below), that character won't be returned.
61 	 */
62 
63 	gly = ESCAPE_ERROR;
64 	*start = ++*end;
65 	*sz = 0;
66 	term = '\0';
67 
68 	switch ((*start)[-1]) {
69 	/*
70 	 * First the glyphs.  There are several different forms of
71 	 * these, but each eventually returns a substring of the glyph
72 	 * name.
73 	 */
74 	case '(':
75 		gly = ESCAPE_SPECIAL;
76 		*sz = 2;
77 		break;
78 	case '[':
79 		gly = ESCAPE_SPECIAL;
80 		term = ']';
81 		break;
82 	case 'C':
83 		if ('\'' != **start)
84 			return(ESCAPE_ERROR);
85 		*start = ++*end;
86 		gly = ESCAPE_SPECIAL;
87 		term = '\'';
88 		break;
89 
90 	/*
91 	 * Escapes taking no arguments at all.
92 	 */
93 	case 'd':
94 		/* FALLTHROUGH */
95 	case 'u':
96 		return(ESCAPE_IGNORE);
97 
98 	/*
99 	 * The \z escape is supposed to output the following
100 	 * character without advancing the cursor position.
101 	 * Since we are mostly dealing with terminal mode,
102 	 * let us just skip the next character.
103 	 */
104 	case 'z':
105 		return(ESCAPE_SKIPCHAR);
106 
107 	/*
108 	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
109 	 * 'X' is the trigger.  These have opaque sub-strings.
110 	 */
111 	case 'F':
112 		/* FALLTHROUGH */
113 	case 'g':
114 		/* FALLTHROUGH */
115 	case 'k':
116 		/* FALLTHROUGH */
117 	case 'M':
118 		/* FALLTHROUGH */
119 	case 'm':
120 		/* FALLTHROUGH */
121 	case 'n':
122 		/* FALLTHROUGH */
123 	case 'V':
124 		/* FALLTHROUGH */
125 	case 'Y':
126 		gly = ESCAPE_IGNORE;
127 		/* FALLTHROUGH */
128 	case 'f':
129 		if (ESCAPE_ERROR == gly)
130 			gly = ESCAPE_FONT;
131 		switch (**start) {
132 		case '(':
133 			*start = ++*end;
134 			*sz = 2;
135 			break;
136 		case '[':
137 			*start = ++*end;
138 			term = ']';
139 			break;
140 		default:
141 			*sz = 1;
142 			break;
143 		}
144 		break;
145 
146 	/*
147 	 * These escapes are of the form \X'Y', where 'X' is the trigger
148 	 * and 'Y' is any string.  These have opaque sub-strings.
149 	 * The \B and \w escapes are handled in roff.c, roff_res().
150 	 */
151 	case 'A':
152 		/* FALLTHROUGH */
153 	case 'b':
154 		/* FALLTHROUGH */
155 	case 'D':
156 		/* FALLTHROUGH */
157 	case 'o':
158 		/* FALLTHROUGH */
159 	case 'R':
160 		/* FALLTHROUGH */
161 	case 'X':
162 		/* FALLTHROUGH */
163 	case 'Z':
164 		if ('\0' == **start)
165 			return(ESCAPE_ERROR);
166 		gly = ESCAPE_IGNORE;
167 		term = **start;
168 		*start = ++*end;
169 		break;
170 
171 	/*
172 	 * These escapes are of the form \X'N', where 'X' is the trigger
173 	 * and 'N' resolves to a numerical expression.
174 	 */
175 	case 'h':
176 		/* FALLTHROUGH */
177 	case 'H':
178 		/* FALLTHROUGH */
179 	case 'L':
180 		/* FALLTHROUGH */
181 	case 'l':
182 		/* FALLTHROUGH */
183 	case 'S':
184 		/* FALLTHROUGH */
185 	case 'v':
186 		/* FALLTHROUGH */
187 	case 'x':
188 		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
189 			if ('\0' != **start)
190 				++*end;
191 			return(ESCAPE_ERROR);
192 		}
193 		gly = ESCAPE_IGNORE;
194 		term = **start;
195 		*start = ++*end;
196 		break;
197 
198 	/*
199 	 * Special handling for the numbered character escape.
200 	 * XXX Do any other escapes need similar handling?
201 	 */
202 	case 'N':
203 		if ('\0' == **start)
204 			return(ESCAPE_ERROR);
205 		(*end)++;
206 		if (isdigit((unsigned char)**start)) {
207 			*sz = 1;
208 			return(ESCAPE_IGNORE);
209 		}
210 		(*start)++;
211 		while (isdigit((unsigned char)**end))
212 			(*end)++;
213 		*sz = *end - *start;
214 		if ('\0' != **end)
215 			(*end)++;
216 		return(ESCAPE_NUMBERED);
217 
218 	/*
219 	 * Sizes get a special category of their own.
220 	 */
221 	case 's':
222 		gly = ESCAPE_IGNORE;
223 
224 		/* See +/- counts as a sign. */
225 		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
226 			(*end)++;
227 
228 		switch (**end) {
229 		case '(':
230 			*start = ++*end;
231 			*sz = 2;
232 			break;
233 		case '[':
234 			*start = ++*end;
235 			term = ']';
236 			break;
237 		case '\'':
238 			*start = ++*end;
239 			term = '\'';
240 			break;
241 		default:
242 			*sz = 1;
243 			break;
244 		}
245 
246 		break;
247 
248 	/*
249 	 * Anything else is assumed to be a glyph.
250 	 * In this case, pass back the character after the backslash.
251 	 */
252 	default:
253 		gly = ESCAPE_SPECIAL;
254 		*start = --*end;
255 		*sz = 1;
256 		break;
257 	}
258 
259 	assert(ESCAPE_ERROR != gly);
260 
261 	/*
262 	 * Read up to the terminating character,
263 	 * paying attention to nested escapes.
264 	 */
265 
266 	if ('\0' != term) {
267 		while (**end != term) {
268 			switch (**end) {
269 			case '\0':
270 				return(ESCAPE_ERROR);
271 			case '\\':
272 				(*end)++;
273 				if (ESCAPE_ERROR ==
274 				    mandoc_escape(end, NULL, NULL))
275 					return(ESCAPE_ERROR);
276 				break;
277 			default:
278 				(*end)++;
279 				break;
280 			}
281 		}
282 		*sz = (*end)++ - *start;
283 	} else {
284 		assert(*sz > 0);
285 		if ((size_t)*sz > strlen(*start))
286 			return(ESCAPE_ERROR);
287 		*end += *sz;
288 	}
289 
290 	/* Run post-processors. */
291 
292 	switch (gly) {
293 	case ESCAPE_FONT:
294 		if (2 == *sz) {
295 			if ('C' == **start) {
296 				/*
297 				 * Treat constant-width font modes
298 				 * just like regular font modes.
299 				 */
300 				(*start)++;
301 				(*sz)--;
302 			} else {
303 				if ('B' == (*start)[0] && 'I' == (*start)[1])
304 					gly = ESCAPE_FONTBI;
305 				break;
306 			}
307 		} else if (1 != *sz)
308 			break;
309 
310 		switch (**start) {
311 		case '3':
312 			/* FALLTHROUGH */
313 		case 'B':
314 			gly = ESCAPE_FONTBOLD;
315 			break;
316 		case '2':
317 			/* FALLTHROUGH */
318 		case 'I':
319 			gly = ESCAPE_FONTITALIC;
320 			break;
321 		case 'P':
322 			gly = ESCAPE_FONTPREV;
323 			break;
324 		case '1':
325 			/* FALLTHROUGH */
326 		case 'R':
327 			gly = ESCAPE_FONTROMAN;
328 			break;
329 		}
330 		break;
331 	case ESCAPE_SPECIAL:
332 		if (1 == *sz && 'c' == **start)
333 			gly = ESCAPE_NOSPACE;
334 		/*
335 		 * Unicode escapes are defined in groff as \[u0000]
336 		 * to \[u10FFFF], where the contained value must be
337 		 * a valid Unicode codepoint.  Here, however, only
338 		 * check the length and range.
339 		 */
340 		if (**start != 'u' || *sz < 5 || *sz > 7)
341 			break;
342 		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
343 			break;
344 		if (*sz == 6 && (*start)[1] == '0')
345 			break;
346 		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
347 		    + 1 == *sz)
348 			gly = ESCAPE_UNICODE;
349 		break;
350 	default:
351 		break;
352 	}
353 
354 	return(gly);
355 }
356 
357 /*
358  * Parse a quoted or unquoted roff-style request or macro argument.
359  * Return a pointer to the parsed argument, which is either the original
360  * pointer or advanced by one byte in case the argument is quoted.
361  * NUL-terminate the argument in place.
362  * Collapse pairs of quotes inside quoted arguments.
363  * Advance the argument pointer to the next argument,
364  * or to the NUL byte terminating the argument line.
365  */
366 char *
367 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
368 {
369 	char	 *start, *cp;
370 	int	  quoted, pairs, white;
371 
372 	/* Quoting can only start with a new word. */
373 	start = *cpp;
374 	quoted = 0;
375 	if ('"' == *start) {
376 		quoted = 1;
377 		start++;
378 	}
379 
380 	pairs = 0;
381 	white = 0;
382 	for (cp = start; '\0' != *cp; cp++) {
383 
384 		/*
385 		 * Move the following text left
386 		 * after quoted quotes and after "\\" and "\t".
387 		 */
388 		if (pairs)
389 			cp[-pairs] = cp[0];
390 
391 		if ('\\' == cp[0]) {
392 			/*
393 			 * In copy mode, translate double to single
394 			 * backslashes and backslash-t to literal tabs.
395 			 */
396 			switch (cp[1]) {
397 			case 't':
398 				cp[0] = '\t';
399 				/* FALLTHROUGH */
400 			case '\\':
401 				pairs++;
402 				cp++;
403 				break;
404 			case ' ':
405 				/* Skip escaped blanks. */
406 				if (0 == quoted)
407 					cp++;
408 				break;
409 			default:
410 				break;
411 			}
412 		} else if (0 == quoted) {
413 			if (' ' == cp[0]) {
414 				/* Unescaped blanks end unquoted args. */
415 				white = 1;
416 				break;
417 			}
418 		} else if ('"' == cp[0]) {
419 			if ('"' == cp[1]) {
420 				/* Quoted quotes collapse. */
421 				pairs++;
422 				cp++;
423 			} else {
424 				/* Unquoted quotes end quoted args. */
425 				quoted = 2;
426 				break;
427 			}
428 		}
429 	}
430 
431 	/* Quoted argument without a closing quote. */
432 	if (1 == quoted)
433 		mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
434 
435 	/* NUL-terminate this argument and move to the next one. */
436 	if (pairs)
437 		cp[-pairs] = '\0';
438 	if ('\0' != *cp) {
439 		*cp++ = '\0';
440 		while (' ' == *cp)
441 			cp++;
442 	}
443 	*pos += (int)(cp - start) + (quoted ? 1 : 0);
444 	*cpp = cp;
445 
446 	if ('\0' == *cp && (white || ' ' == cp[-1]))
447 		mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
448 
449 	return(start);
450 }
451 
/*
 * Try to parse the complete string p according to the strptime(3)
 * format fmt.  If the format matches and no input is left over,
 * store the mktime(3) conversion of the result in *t and return 1.
 * Otherwise, return 0 and leave *t untouched.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 broken;
	const char	*rest;

	/* Fields not set by the format have to start out as zero. */
	memset(&broken, 0, sizeof(broken));

	rest = strptime(p, fmt, &broken);
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&broken);
	return(1);
}
468 
/*
 * Format the time t in the local time zone as "Month D, YYYY".
 * Return a string allocated with mandoc_malloc() that the caller
 * has to free, or NULL if the time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) fails if the time cannot be represented. */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += (int)ssz;

	/*
	 * snprintf(3) returns the length that would have been
	 * written, so treat truncation as a failure, too,
	 * rather than advancing beyond the reserved space.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
503 
504 char *
505 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
506 {
507 	char		*out;
508 	time_t		 t;
509 
510 	if (NULL == in || '\0' == *in ||
511 	    0 == strcmp(in, "$" "Mdocdate$")) {
512 		mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL);
513 		time(&t);
514 	}
515 	else if (a2time(&t, "%Y-%m-%d", in))
516 		t = 0;
517 	else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
518 	    !a2time(&t, "%b %d, %Y", in)) {
519 		mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in);
520 		t = 0;
521 	}
522 	out = t ? time2a(t) : NULL;
523 	return(out ? out : mandoc_strdup(in));
524 }
525 
/*
 * Check whether the sz bytes starting at p end a sentence.
 * Scanning backwards, closing delimiters (quotes, `]', `)') and
 * end-of-sentence punctuation (`.', `!', `?') are skipped until
 * another character or the beginning of the buffer is found.
 * Return 1 if punctuation was found and either no closing
 * delimiter follows it or the character preceding the whole
 * run is alphanumeric; return 0 otherwise.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	/*
	 * Count an index down rather than a pointer, such that no
	 * pointer before the beginning of the buffer is ever formed
	 * and such that sizes beyond INT_MAX are not truncated.
	 * An empty buffer skips the loop and yields 0.
	 */

	enclosed = found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
			/* FALLTHROUGH */
		case '\'':
			/* FALLTHROUGH */
		case ']':
			/* FALLTHROUGH */
		case ')':
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
			/* FALLTHROUGH */
		case '!':
			/* FALLTHROUGH */
		case '?':
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	return(found && !enclosed);
}
568 
/*
 * Convert the first sz bytes of p to an integer in the given base.
 * Return -1 if more than 31 bytes are given, if the input is empty,
 * or if strtol(3) does not consume all of it.
 * Otherwise, return the value, clamped to the range of int;
 * note that this includes negative values when the input is signed.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 tmp[32];
	char		*endp;
	long		 val;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(tmp))
		return(-1);

	/* The input need not be terminated, so work on a copy. */
	memcpy(tmp, p, sz);
	tmp[sz] = '\0';

	errno = 0;
	val = strtol(tmp, &endp, base);

	if ('\0' == tmp[0] || '\0' != *endp)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);
	return((int)val);
}
599