xref: /openbsd-src/usr.bin/mandoc/mandoc.c (revision 48950c12d106c85f315112191a0228d7b83b9510)
1 /*	$Id: mandoc.c,v 1.35 2012/07/07 18:27:36 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011, 2012 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include <sys/types.h>
19 
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <time.h>
28 
29 #include "mandoc.h"
30 #include "libmandoc.h"
31 
32 #define DATESIZE 32
33 
34 static	int	 a2time(time_t *, const char *, const char *);
35 static	char	*time2a(time_t);
36 
37 
38 enum mandoc_esc
39 mandoc_escape(const char **end, const char **start, int *sz)
40 {
41 	const char	*local_start;
42 	int		 local_sz;
43 	char		 term;
44 	enum mandoc_esc	 gly;
45 
46 	/*
47 	 * When the caller doesn't provide return storage,
48 	 * use local storage.
49 	 */
50 
51 	if (NULL == start)
52 		start = &local_start;
53 	if (NULL == sz)
54 		sz = &local_sz;
55 
56 	/*
57 	 * Beyond the backslash, at least one input character
58 	 * is part of the escape sequence.  With one exception
59 	 * (see below), that character won't be returned.
60 	 */
61 
62 	gly = ESCAPE_ERROR;
63 	*start = ++*end;
64 	*sz = 0;
65 	term = '\0';
66 
67 	switch ((*start)[-1]) {
68 	/*
69 	 * First the glyphs.  There are several different forms of
70 	 * these, but each eventually returns a substring of the glyph
71 	 * name.
72 	 */
73 	case ('('):
74 		gly = ESCAPE_SPECIAL;
75 		*sz = 2;
76 		break;
77 	case ('['):
78 		gly = ESCAPE_SPECIAL;
79 		/*
80 		 * Unicode escapes are defined in groff as \[uXXXX] to
81 		 * \[u10FFFF], where the contained value must be a valid
82 		 * Unicode codepoint.  Here, however, only check whether
83 		 * it's not a zero-width escape.
84 		 */
85 		if ('u' == (*start)[0] && ']' != (*start)[1])
86 			gly = ESCAPE_UNICODE;
87 		term = ']';
88 		break;
89 	case ('C'):
90 		if ('\'' != **start)
91 			return(ESCAPE_ERROR);
92 		gly = ESCAPE_SPECIAL;
93 		*start = ++*end;
94 		term = '\'';
95 		break;
96 
97 	/*
98 	 * The \z escape is supposed to output the following
99 	 * character without advancing the cursor position.
100 	 * Since we are mostly dealing with terminal mode,
101 	 * let us just skip the next character.
102 	 */
103 	case ('z'):
104 		return(ESCAPE_SKIPCHAR);
105 
106 	/*
107 	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
108 	 * 'X' is the trigger.  These have opaque sub-strings.
109 	 */
110 	case ('F'):
111 		/* FALLTHROUGH */
112 	case ('g'):
113 		/* FALLTHROUGH */
114 	case ('k'):
115 		/* FALLTHROUGH */
116 	case ('M'):
117 		/* FALLTHROUGH */
118 	case ('m'):
119 		/* FALLTHROUGH */
120 	case ('n'):
121 		/* FALLTHROUGH */
122 	case ('V'):
123 		/* FALLTHROUGH */
124 	case ('Y'):
125 		gly = ESCAPE_IGNORE;
126 		/* FALLTHROUGH */
127 	case ('f'):
128 		if (ESCAPE_ERROR == gly)
129 			gly = ESCAPE_FONT;
130 		switch (**start) {
131 		case ('('):
132 			*start = ++*end;
133 			*sz = 2;
134 			break;
135 		case ('['):
136 			*start = ++*end;
137 			term = ']';
138 			break;
139 		default:
140 			*sz = 1;
141 			break;
142 		}
143 		break;
144 
145 	/*
146 	 * These escapes are of the form \X'Y', where 'X' is the trigger
147 	 * and 'Y' is any string.  These have opaque sub-strings.
148 	 */
149 	case ('A'):
150 		/* FALLTHROUGH */
151 	case ('b'):
152 		/* FALLTHROUGH */
153 	case ('D'):
154 		/* FALLTHROUGH */
155 	case ('o'):
156 		/* FALLTHROUGH */
157 	case ('R'):
158 		/* FALLTHROUGH */
159 	case ('X'):
160 		/* FALLTHROUGH */
161 	case ('Z'):
162 		if ('\'' != **start)
163 			return(ESCAPE_ERROR);
164 		gly = ESCAPE_IGNORE;
165 		*start = ++*end;
166 		term = '\'';
167 		break;
168 
169 	/*
170 	 * These escapes are of the form \X'N', where 'X' is the trigger
171 	 * and 'N' resolves to a numerical expression.
172 	 */
173 	case ('B'):
174 		/* FALLTHROUGH */
175 	case ('h'):
176 		/* FALLTHROUGH */
177 	case ('H'):
178 		/* FALLTHROUGH */
179 	case ('L'):
180 		/* FALLTHROUGH */
181 	case ('l'):
182 		gly = ESCAPE_NUMBERED;
183 		/* FALLTHROUGH */
184 	case ('S'):
185 		/* FALLTHROUGH */
186 	case ('v'):
187 		/* FALLTHROUGH */
188 	case ('w'):
189 		/* FALLTHROUGH */
190 	case ('x'):
191 		if ('\'' != **start)
192 			return(ESCAPE_ERROR);
193 		if (ESCAPE_ERROR == gly)
194 			gly = ESCAPE_IGNORE;
195 		*start = ++*end;
196 		term = '\'';
197 		break;
198 
199 	/*
200 	 * Special handling for the numbered character escape.
201 	 * XXX Do any other escapes need similar handling?
202 	 */
203 	case ('N'):
204 		if ('\0' == **start)
205 			return(ESCAPE_ERROR);
206 		(*end)++;
207 		if (isdigit((unsigned char)**start)) {
208 			*sz = 1;
209 			return(ESCAPE_IGNORE);
210 		}
211 		(*start)++;
212 		while (isdigit((unsigned char)**end))
213 			(*end)++;
214 		*sz = *end - *start;
215 		if ('\0' != **end)
216 			(*end)++;
217 		return(ESCAPE_NUMBERED);
218 
219 	/*
220 	 * Sizes get a special category of their own.
221 	 */
222 	case ('s'):
223 		gly = ESCAPE_IGNORE;
224 
225 		/* See +/- counts as a sign. */
226 		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
227 			(*end)++;
228 
229 		switch (**end) {
230 		case ('('):
231 			*start = ++*end;
232 			*sz = 2;
233 			break;
234 		case ('['):
235 			*start = ++*end;
236 			term = ']';
237 			break;
238 		case ('\''):
239 			*start = ++*end;
240 			term = '\'';
241 			break;
242 		default:
243 			*sz = 1;
244 			break;
245 		}
246 
247 		break;
248 
249 	/*
250 	 * Anything else is assumed to be a glyph.
251 	 * In this case, pass back the character after the backslash.
252 	 */
253 	default:
254 		gly = ESCAPE_SPECIAL;
255 		*start = --*end;
256 		*sz = 1;
257 		break;
258 	}
259 
260 	assert(ESCAPE_ERROR != gly);
261 
262 	/*
263 	 * Read up to the terminating character,
264 	 * paying attention to nested escapes.
265 	 */
266 
267 	if ('\0' != term) {
268 		while (**end != term) {
269 			switch (**end) {
270 			case ('\0'):
271 				return(ESCAPE_ERROR);
272 			case ('\\'):
273 				(*end)++;
274 				if (ESCAPE_ERROR ==
275 				    mandoc_escape(end, NULL, NULL))
276 					return(ESCAPE_ERROR);
277 				break;
278 			default:
279 				(*end)++;
280 				break;
281 			}
282 		}
283 		*sz = (*end)++ - *start;
284 	} else {
285 		assert(*sz > 0);
286 		if ((size_t)*sz > strlen(*start))
287 			return(ESCAPE_ERROR);
288 		*end += *sz;
289 	}
290 
291 	/* Run post-processors. */
292 
293 	switch (gly) {
294 	case (ESCAPE_FONT):
295 		/*
296 		 * Pretend that the constant-width font modes are the
297 		 * same as the regular font modes.
298 		 */
299 		if (2 == *sz && 'C' == **start) {
300 			(*start)++;
301 			(*sz)--;
302 		} else if (1 != *sz)
303 			break;
304 
305 		switch (**start) {
306 		case ('3'):
307 			/* FALLTHROUGH */
308 		case ('B'):
309 			gly = ESCAPE_FONTBOLD;
310 			break;
311 		case ('2'):
312 			/* FALLTHROUGH */
313 		case ('I'):
314 			gly = ESCAPE_FONTITALIC;
315 			break;
316 		case ('P'):
317 			gly = ESCAPE_FONTPREV;
318 			break;
319 		case ('1'):
320 			/* FALLTHROUGH */
321 		case ('R'):
322 			gly = ESCAPE_FONTROMAN;
323 			break;
324 		}
325 		break;
326 	case (ESCAPE_SPECIAL):
327 		if (1 == *sz && 'c' == **start)
328 			gly = ESCAPE_NOSPACE;
329 		break;
330 	default:
331 		break;
332 	}
333 
334 	return(gly);
335 }
336 
337 void *
338 mandoc_calloc(size_t num, size_t size)
339 {
340 	void		*ptr;
341 
342 	ptr = calloc(num, size);
343 	if (NULL == ptr) {
344 		perror(NULL);
345 		exit((int)MANDOCLEVEL_SYSERR);
346 	}
347 
348 	return(ptr);
349 }
350 
351 
352 void *
353 mandoc_malloc(size_t size)
354 {
355 	void		*ptr;
356 
357 	ptr = malloc(size);
358 	if (NULL == ptr) {
359 		perror(NULL);
360 		exit((int)MANDOCLEVEL_SYSERR);
361 	}
362 
363 	return(ptr);
364 }
365 
366 
367 void *
368 mandoc_realloc(void *ptr, size_t size)
369 {
370 
371 	ptr = realloc(ptr, size);
372 	if (NULL == ptr) {
373 		perror(NULL);
374 		exit((int)MANDOCLEVEL_SYSERR);
375 	}
376 
377 	return(ptr);
378 }
379 
/*
 * Return a newly allocated, NUL-terminated copy of exactly the
 * first sz bytes at ptr.  The caller must make sure that at least
 * sz bytes are readable at ptr; unlike strndup(3), copying does not
 * stop early at a NUL byte.  Allocation is done with mandoc_malloc().
 */
char *
mandoc_strndup(const char *ptr, size_t sz)
{
	char		*p;

	p = mandoc_malloc(sz + 1);
	memcpy(p, ptr, sz);
	/*
	 * Index with the size_t itself: narrowing to int would
	 * yield a bogus (possibly negative) index for huge sizes.
	 */
	p[sz] = '\0';
	return(p);
}
390 
391 char *
392 mandoc_strdup(const char *ptr)
393 {
394 	char		*p;
395 
396 	p = strdup(ptr);
397 	if (NULL == p) {
398 		perror(NULL);
399 		exit((int)MANDOCLEVEL_SYSERR);
400 	}
401 
402 	return(p);
403 }
404 
405 /*
406  * Parse a quoted or unquoted roff-style request or macro argument.
407  * Return a pointer to the parsed argument, which is either the original
408  * pointer or advanced by one byte in case the argument is quoted.
409  * Null-terminate the argument in place.
410  * Collapse pairs of quotes inside quoted arguments.
411  * Advance the argument pointer to the next argument,
412  * or to the null byte terminating the argument line.
413  */
414 char *
415 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
416 {
417 	char	 *start, *cp;
418 	int	  quoted, pairs, white;
419 
420 	/* Quoting can only start with a new word. */
421 	start = *cpp;
422 	quoted = 0;
423 	if ('"' == *start) {
424 		quoted = 1;
425 		start++;
426 	}
427 
428 	pairs = 0;
429 	white = 0;
430 	for (cp = start; '\0' != *cp; cp++) {
431 		/* Move left after quoted quotes and escaped backslashes. */
432 		if (pairs)
433 			cp[-pairs] = cp[0];
434 		if ('\\' == cp[0]) {
435 			if ('\\' == cp[1]) {
436 				/* Poor man's copy mode. */
437 				pairs++;
438 				cp++;
439 			} else if (0 == quoted && ' ' == cp[1])
440 				/* Skip escaped blanks. */
441 				cp++;
442 		} else if (0 == quoted) {
443 			if (' ' == cp[0]) {
444 				/* Unescaped blanks end unquoted args. */
445 				white = 1;
446 				break;
447 			}
448 		} else if ('"' == cp[0]) {
449 			if ('"' == cp[1]) {
450 				/* Quoted quotes collapse. */
451 				pairs++;
452 				cp++;
453 			} else {
454 				/* Unquoted quotes end quoted args. */
455 				quoted = 2;
456 				break;
457 			}
458 		}
459 	}
460 
461 	/* Quoted argument without a closing quote. */
462 	if (1 == quoted)
463 		mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
464 
465 	/* Null-terminate this argument and move to the next one. */
466 	if (pairs)
467 		cp[-pairs] = '\0';
468 	if ('\0' != *cp) {
469 		*cp++ = '\0';
470 		while (' ' == *cp)
471 			cp++;
472 	}
473 	*pos += (int)(cp - start) + (quoted ? 1 : 0);
474 	*cpp = cp;
475 
476 	if ('\0' == *cp && (white || ' ' == cp[-1]))
477 		mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
478 
479 	return(start);
480 }
481 
/*
 * Try to parse the complete string p according to the strptime(3)
 * format fmt.  On success, store the result of mktime(3) in *t and
 * return 1.  Return 0, leaving *t untouched, if p does not match
 * the format or if unparsed characters remain after the match.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/* Fields not set by the format must start out as zero. */
	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
498 
/*
 * Format the time t, taken as local time, as "Month day, year",
 * for example "July 7, 2012".
 * Return a string allocated with mandoc_malloc() that the caller
 * is responsible for freeing, or NULL if the time cannot be
 * converted or the text does not fit into the reserved space.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) may fail for times it cannot represent. */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += ssz;

	/*
	 * snprintf(3) returns the length it wanted to write;
	 * treat truncation as failure, too, such that p is never
	 * advanced beyond what was actually written.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
533 
534 char *
535 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
536 {
537 	char		*out;
538 	time_t		 t;
539 
540 	if (NULL == in || '\0' == *in ||
541 	    0 == strcmp(in, "$" "Mdocdate$")) {
542 		mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
543 		time(&t);
544 	}
545 	else if (a2time(&t, "%Y-%m-%d", in))
546 		t = 0;
547 	else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
548 	    !a2time(&t, "%b %d, %Y", in)) {
549 		mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
550 		t = 0;
551 	}
552 	out = t ? time2a(t) : NULL;
553 	return(out ? out : mandoc_strdup(in));
554 }
555 
/*
 * Decide whether the sz bytes at p end a sentence.
 *
 * The text is scanned backwards from its last byte.  Closing
 * delimiters (double quote, single quote, `]', `)') and the
 * punctuation characters `.', `!', `?' are skipped; the first
 * other byte ends the scan.  A closing delimiter seen before any
 * punctuation sets the "enclosed" state, which the caller may
 * also preset by passing a non-zero "enclosed" argument.
 *
 * Returns 1 if punctuation was found and either the enclosed state
 * is not set or the byte ending the scan is alphanumeric; when the
 * scan consumes all of the text, punctuation must have been found
 * and the enclosed state must not be set.  Returns 0 otherwise,
 * in particular for empty input.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t	 i;
	int	 found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 *
	 * Walk backwards using an index rather than a pointer,
	 * such that no pointer in front of the buffer is ever formed
	 * and sz is never narrowed to int.
	 */

	found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	/* Nothing but delimiters and punctuation, or empty input. */
	return(found && !enclosed);
}
598 
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty,
 * or if it is not completely consumed by the conversion.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX.
 * Note that valid negative numbers are returned as they are,
 * so -1 may also be the result of converting "-1".
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 digits[32];
	char		*stop;
	long		 val;

	/* One byte of the buffer is needed for the terminator. */
	if (sz >= sizeof(digits))
		return(-1);

	/* The input need not be terminated, so work on a copy. */
	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	if ('\0' == digits[0] || '\0' != *stop)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);

	return((int)val);
}
629