xref: /openbsd-src/usr.bin/mandoc/mandoc.c (revision 91f110e064cd7c194e59e019b83bb7496c1c84d4)
1 /*	$Id: mandoc.c,v 1.45 2014/03/21 22:17:01 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include <sys/types.h>
19 
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <time.h>
28 
29 #include "mandoc.h"
30 #include "mandoc_aux.h"
31 #include "libmandoc.h"
32 
33 #define DATESIZE 32
34 
35 static	int	 a2time(time_t *, const char *, const char *);
36 static	char	*time2a(time_t);
37 
38 
39 enum mandoc_esc
40 mandoc_escape(const char **end, const char **start, int *sz)
41 {
42 	const char	*local_start;
43 	int		 local_sz;
44 	char		 term;
45 	enum mandoc_esc	 gly;
46 
47 	/*
48 	 * When the caller doesn't provide return storage,
49 	 * use local storage.
50 	 */
51 
52 	if (NULL == start)
53 		start = &local_start;
54 	if (NULL == sz)
55 		sz = &local_sz;
56 
57 	/*
58 	 * Beyond the backslash, at least one input character
59 	 * is part of the escape sequence.  With one exception
60 	 * (see below), that character won't be returned.
61 	 */
62 
63 	gly = ESCAPE_ERROR;
64 	*start = ++*end;
65 	*sz = 0;
66 	term = '\0';
67 
68 	switch ((*start)[-1]) {
69 	/*
70 	 * First the glyphs.  There are several different forms of
71 	 * these, but each eventually returns a substring of the glyph
72 	 * name.
73 	 */
74 	case ('('):
75 		gly = ESCAPE_SPECIAL;
76 		*sz = 2;
77 		break;
78 	case ('['):
79 		gly = ESCAPE_SPECIAL;
80 		/*
81 		 * Unicode escapes are defined in groff as \[uXXXX] to
82 		 * \[u10FFFF], where the contained value must be a valid
83 		 * Unicode codepoint.  Here, however, only check whether
84 		 * it's not a zero-width escape.
85 		 */
86 		if ('u' == (*start)[0] && ']' != (*start)[1])
87 			gly = ESCAPE_UNICODE;
88 		term = ']';
89 		break;
90 	case ('C'):
91 		if ('\'' != **start)
92 			return(ESCAPE_ERROR);
93 		*start = ++*end;
94 		if ('u' == (*start)[0] && '\'' != (*start)[1])
95 			gly = ESCAPE_UNICODE;
96 		else
97 			gly = ESCAPE_SPECIAL;
98 		term = '\'';
99 		break;
100 
101 	/*
102 	 * Escapes taking no arguments at all.
103 	 */
104 	case ('d'):
105 		/* FALLTHROUGH */
106 	case ('u'):
107 		return(ESCAPE_IGNORE);
108 
109 	/*
110 	 * The \z escape is supposed to output the following
111 	 * character without advancing the cursor position.
112 	 * Since we are mostly dealing with terminal mode,
113 	 * let us just skip the next character.
114 	 */
115 	case ('z'):
116 		return(ESCAPE_SKIPCHAR);
117 
118 	/*
119 	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
120 	 * 'X' is the trigger.  These have opaque sub-strings.
121 	 */
122 	case ('F'):
123 		/* FALLTHROUGH */
124 	case ('g'):
125 		/* FALLTHROUGH */
126 	case ('k'):
127 		/* FALLTHROUGH */
128 	case ('M'):
129 		/* FALLTHROUGH */
130 	case ('m'):
131 		/* FALLTHROUGH */
132 	case ('n'):
133 		/* FALLTHROUGH */
134 	case ('V'):
135 		/* FALLTHROUGH */
136 	case ('Y'):
137 		gly = ESCAPE_IGNORE;
138 		/* FALLTHROUGH */
139 	case ('f'):
140 		if (ESCAPE_ERROR == gly)
141 			gly = ESCAPE_FONT;
142 		switch (**start) {
143 		case ('('):
144 			*start = ++*end;
145 			*sz = 2;
146 			break;
147 		case ('['):
148 			*start = ++*end;
149 			term = ']';
150 			break;
151 		default:
152 			*sz = 1;
153 			break;
154 		}
155 		break;
156 
157 	/*
158 	 * These escapes are of the form \X'Y', where 'X' is the trigger
159 	 * and 'Y' is any string.  These have opaque sub-strings.
160 	 */
161 	case ('A'):
162 		/* FALLTHROUGH */
163 	case ('b'):
164 		/* FALLTHROUGH */
165 	case ('B'):
166 		/* FALLTHROUGH */
167 	case ('D'):
168 		/* FALLTHROUGH */
169 	case ('o'):
170 		/* FALLTHROUGH */
171 	case ('R'):
172 		/* FALLTHROUGH */
173 	case ('w'):
174 		/* FALLTHROUGH */
175 	case ('X'):
176 		/* FALLTHROUGH */
177 	case ('Z'):
178 		if ('\'' != **start)
179 			return(ESCAPE_ERROR);
180 		gly = ESCAPE_IGNORE;
181 		*start = ++*end;
182 		term = '\'';
183 		break;
184 
185 	/*
186 	 * These escapes are of the form \X'N', where 'X' is the trigger
187 	 * and 'N' resolves to a numerical expression.
188 	 */
189 	case ('h'):
190 		/* FALLTHROUGH */
191 	case ('H'):
192 		/* FALLTHROUGH */
193 	case ('L'):
194 		/* FALLTHROUGH */
195 	case ('l'):
196 		/* FALLTHROUGH */
197 	case ('S'):
198 		/* FALLTHROUGH */
199 	case ('v'):
200 		/* FALLTHROUGH */
201 	case ('x'):
202 		if ('\'' != **start)
203 			return(ESCAPE_ERROR);
204 		gly = ESCAPE_IGNORE;
205 		*start = ++*end;
206 		term = '\'';
207 		break;
208 
209 	/*
210 	 * Special handling for the numbered character escape.
211 	 * XXX Do any other escapes need similar handling?
212 	 */
213 	case ('N'):
214 		if ('\0' == **start)
215 			return(ESCAPE_ERROR);
216 		(*end)++;
217 		if (isdigit((unsigned char)**start)) {
218 			*sz = 1;
219 			return(ESCAPE_IGNORE);
220 		}
221 		(*start)++;
222 		while (isdigit((unsigned char)**end))
223 			(*end)++;
224 		*sz = *end - *start;
225 		if ('\0' != **end)
226 			(*end)++;
227 		return(ESCAPE_NUMBERED);
228 
229 	/*
230 	 * Sizes get a special category of their own.
231 	 */
232 	case ('s'):
233 		gly = ESCAPE_IGNORE;
234 
235 		/* See +/- counts as a sign. */
236 		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
237 			(*end)++;
238 
239 		switch (**end) {
240 		case ('('):
241 			*start = ++*end;
242 			*sz = 2;
243 			break;
244 		case ('['):
245 			*start = ++*end;
246 			term = ']';
247 			break;
248 		case ('\''):
249 			*start = ++*end;
250 			term = '\'';
251 			break;
252 		default:
253 			*sz = 1;
254 			break;
255 		}
256 
257 		break;
258 
259 	/*
260 	 * Anything else is assumed to be a glyph.
261 	 * In this case, pass back the character after the backslash.
262 	 */
263 	default:
264 		gly = ESCAPE_SPECIAL;
265 		*start = --*end;
266 		*sz = 1;
267 		break;
268 	}
269 
270 	assert(ESCAPE_ERROR != gly);
271 
272 	/*
273 	 * Read up to the terminating character,
274 	 * paying attention to nested escapes.
275 	 */
276 
277 	if ('\0' != term) {
278 		while (**end != term) {
279 			switch (**end) {
280 			case ('\0'):
281 				return(ESCAPE_ERROR);
282 			case ('\\'):
283 				(*end)++;
284 				if (ESCAPE_ERROR ==
285 				    mandoc_escape(end, NULL, NULL))
286 					return(ESCAPE_ERROR);
287 				break;
288 			default:
289 				(*end)++;
290 				break;
291 			}
292 		}
293 		*sz = (*end)++ - *start;
294 	} else {
295 		assert(*sz > 0);
296 		if ((size_t)*sz > strlen(*start))
297 			return(ESCAPE_ERROR);
298 		*end += *sz;
299 	}
300 
301 	/* Run post-processors. */
302 
303 	switch (gly) {
304 	case (ESCAPE_FONT):
305 		if (2 == *sz) {
306 			if ('C' == **start) {
307 				/*
308 				 * Treat constant-width font modes
309 				 * just like regular font modes.
310 				 */
311 				(*start)++;
312 				(*sz)--;
313 			} else {
314 				if ('B' == (*start)[0] && 'I' == (*start)[1])
315 					gly = ESCAPE_FONTBI;
316 				break;
317 			}
318 		} else if (1 != *sz)
319 			break;
320 
321 		switch (**start) {
322 		case ('3'):
323 			/* FALLTHROUGH */
324 		case ('B'):
325 			gly = ESCAPE_FONTBOLD;
326 			break;
327 		case ('2'):
328 			/* FALLTHROUGH */
329 		case ('I'):
330 			gly = ESCAPE_FONTITALIC;
331 			break;
332 		case ('P'):
333 			gly = ESCAPE_FONTPREV;
334 			break;
335 		case ('1'):
336 			/* FALLTHROUGH */
337 		case ('R'):
338 			gly = ESCAPE_FONTROMAN;
339 			break;
340 		}
341 		break;
342 	case (ESCAPE_SPECIAL):
343 		if (1 == *sz && 'c' == **start)
344 			gly = ESCAPE_NOSPACE;
345 		break;
346 	default:
347 		break;
348 	}
349 
350 	return(gly);
351 }
352 
353 /*
354  * Parse a quoted or unquoted roff-style request or macro argument.
355  * Return a pointer to the parsed argument, which is either the original
356  * pointer or advanced by one byte in case the argument is quoted.
357  * NUL-terminate the argument in place.
358  * Collapse pairs of quotes inside quoted arguments.
359  * Advance the argument pointer to the next argument,
360  * or to the NUL byte terminating the argument line.
361  */
362 char *
363 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
364 {
365 	char	 *start, *cp;
366 	int	  quoted, pairs, white;
367 
368 	/* Quoting can only start with a new word. */
369 	start = *cpp;
370 	quoted = 0;
371 	if ('"' == *start) {
372 		quoted = 1;
373 		start++;
374 	}
375 
376 	pairs = 0;
377 	white = 0;
378 	for (cp = start; '\0' != *cp; cp++) {
379 
380 		/*
381 		 * Move the following text left
382 		 * after quoted quotes and after "\\" and "\t".
383 		 */
384 		if (pairs)
385 			cp[-pairs] = cp[0];
386 
387 		if ('\\' == cp[0]) {
388 			/*
389 			 * In copy mode, translate double to single
390 			 * backslashes and backslash-t to literal tabs.
391 			 */
392 			switch (cp[1]) {
393 			case ('t'):
394 				cp[0] = '\t';
395 				/* FALLTHROUGH */
396 			case ('\\'):
397 				pairs++;
398 				cp++;
399 				break;
400 			case (' '):
401 				/* Skip escaped blanks. */
402 				if (0 == quoted)
403 					cp++;
404 				break;
405 			default:
406 				break;
407 			}
408 		} else if (0 == quoted) {
409 			if (' ' == cp[0]) {
410 				/* Unescaped blanks end unquoted args. */
411 				white = 1;
412 				break;
413 			}
414 		} else if ('"' == cp[0]) {
415 			if ('"' == cp[1]) {
416 				/* Quoted quotes collapse. */
417 				pairs++;
418 				cp++;
419 			} else {
420 				/* Unquoted quotes end quoted args. */
421 				quoted = 2;
422 				break;
423 			}
424 		}
425 	}
426 
427 	/* Quoted argument without a closing quote. */
428 	if (1 == quoted)
429 		mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
430 
431 	/* NUL-terminate this argument and move to the next one. */
432 	if (pairs)
433 		cp[-pairs] = '\0';
434 	if ('\0' != *cp) {
435 		*cp++ = '\0';
436 		while (' ' == *cp)
437 			cp++;
438 	}
439 	*pos += (int)(cp - start) + (quoted ? 1 : 0);
440 	*cpp = cp;
441 
442 	if ('\0' == *cp && (white || ' ' == cp[-1]))
443 		mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
444 
445 	return(start);
446 }
447 
/*
 * Try to parse the complete string p according to the strptime(3)
 * format fmt.  If the format matches and consumes all of p, store
 * the mktime(3) conversion of the result in *t and return 1.
 * Otherwise, leave *t alone and return 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/* Fields not set by the format have to be zero for mktime(3). */
	memset(&parsed, 0, sizeof(parsed));

	/* Reject both a failed parse and trailing garbage. */
	rest = strptime(p, fmt, &parsed);
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
464 
/*
 * Format the time t in local time as "Month day, year",
 * for example "March 21, 2014".
 * Return a string allocated with mandoc_malloc() that the caller
 * has to free, or NULL if the time cannot be converted or
 * the formatted text does not fit the reserved space.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) returns NULL for times it cannot represent. */
	if (NULL == (tm = localtime(&t)))
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	/* strftime(3) returns 0 when the result does not fit. */
	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += (int)ssz;

	/*
	 * snprintf(3) returns the untruncated length; do not advance
	 * beyond the space reserved for the day in case of truncation.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
499 
500 char *
501 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
502 {
503 	char		*out;
504 	time_t		 t;
505 
506 	if (NULL == in || '\0' == *in ||
507 	    0 == strcmp(in, "$" "Mdocdate$")) {
508 		mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
509 		time(&t);
510 	}
511 	else if (a2time(&t, "%Y-%m-%d", in))
512 		t = 0;
513 	else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
514 	    !a2time(&t, "%b %d, %Y", in)) {
515 		mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
516 		t = 0;
517 	}
518 	out = t ? time2a(t) : NULL;
519 	return(out ? out : mandoc_strdup(in));
520 }
521 
/*
 * Decide whether the sz bytes starting at p end a sentence.
 * The buffer is scanned backwards from its last byte.
 * Returns 1 if end-of-sentence punctuation ('.', '!' or '?') is
 * found at the end, possibly followed by closing delimiters
 * ('"', '\'', ']' or ')'), and 0 otherwise.  If closing delimiters
 * follow the punctuation, the byte preceding the punctuation
 * must be alphanumeric.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	/*
	 * Count the index down instead of a pointer, such that no
	 * pointer before the start of the buffer is ever formed
	 * and large sizes are not truncated by a cast to int.
	 * An empty buffer skips the loop and yields 0.
	 */
	enclosed = found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			/* The first other byte settles the question. */
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	/* Nothing but punctuation and closing delimiters. */
	return(found && !enclosed);
}
564 
/*
 * Convert the first sz bytes of p to an int using strtol(3)
 * with the given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty, or if
 * it is not completely consumed by the conversion.
 * Results outside the range of int are clamped to INT_MIN or
 * INT_MAX; negative numbers are returned as such, so a result
 * of -1 can also stem from the input "-1".
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 digits[32];
	char		*stop;
	long		 val;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(digits))
		return(-1);

	/* strtol(3) needs a NUL-terminated copy of the input. */
	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	/* Reject empty input and trailing garbage. */
	if ('\0' == digits[0] || '\0' != *stop)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	return(val < INT_MIN ? INT_MIN : (int)val);
}
595