1 /* $Vendor-Id: mandoc.c,v 1.59 2011/09/18 14:14:15 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <errno.h> 27 #include <limits.h> 28 #include <stdlib.h> 29 #include <stdio.h> 30 #include <string.h> 31 #include <time.h> 32 33 #include "mandoc.h" 34 #include "libmandoc.h" 35 36 #define DATESIZE 32 37 38 static int a2time(time_t *, const char *, const char *); 39 static char *time2a(time_t); 40 static int numescape(const char *); 41 42 /* 43 * Pass over recursive numerical expressions. This context of this 44 * function is important: it's only called within character-terminating 45 * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial 46 * recursion: we don't care about what's in these blocks. 47 * This returns the number of characters skipped or -1 if an error 48 * occurs (the caller should bail). 49 */ 50 static int 51 numescape(const char *start) 52 { 53 int i; 54 size_t sz; 55 const char *cp; 56 57 i = 0; 58 59 /* The expression consists of a subexpression. */ 60 61 if ('\\' == start[i]) { 62 cp = &start[++i]; 63 /* 64 * Read past the end of the subexpression. 65 * Bail immediately on errors. 66 */ 67 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) 68 return(-1); 69 return(i + cp - &start[i]); 70 } 71 72 if ('(' != start[i++]) 73 return(0); 74 75 /* 76 * A parenthesised subexpression. Read until the closing 77 * parenthesis, making sure to handle any nested subexpressions 78 * that might ruin our parse. 79 */ 80 81 while (')' != start[i]) { 82 sz = strcspn(&start[i], ")\\"); 83 i += (int)sz; 84 85 if ('\0' == start[i]) 86 return(-1); 87 else if ('\\' != start[i]) 88 continue; 89 90 cp = &start[++i]; 91 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) 92 return(-1); 93 i += cp - &start[i]; 94 } 95 96 /* Read past the terminating ')'. */ 97 return(++i); 98 } 99 100 enum mandoc_esc 101 mandoc_escape(const char **end, const char **start, int *sz) 102 { 103 char c, term, numeric; 104 int i, lim, ssz, rlim; 105 const char *cp, *rstart; 106 enum mandoc_esc gly; 107 108 cp = *end; 109 rstart = cp; 110 if (start) 111 *start = rstart; 112 i = lim = 0; 113 gly = ESCAPE_ERROR; 114 term = numeric = '\0'; 115 116 switch ((c = cp[i++])) { 117 /* 118 * First the glyphs. There are several different forms of 119 * these, but each eventually returns a substring of the glyph 120 * name. 121 */ 122 case ('('): 123 gly = ESCAPE_SPECIAL; 124 lim = 2; 125 break; 126 case ('['): 127 gly = ESCAPE_SPECIAL; 128 /* 129 * Unicode escapes are defined in groff as \[uXXXX] to 130 * \[u10FFFF], where the contained value must be a valid 131 * Unicode codepoint. Here, however, only check whether 132 * it's not a zero-width escape. 133 */ 134 if ('u' == cp[i] && ']' != cp[i + 1]) 135 gly = ESCAPE_UNICODE; 136 term = ']'; 137 break; 138 case ('C'): 139 if ('\'' != cp[i]) 140 return(ESCAPE_ERROR); 141 gly = ESCAPE_SPECIAL; 142 term = '\''; 143 break; 144 145 /* 146 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 147 * 'X' is the trigger. These have opaque sub-strings. 148 */ 149 case ('F'): 150 /* FALLTHROUGH */ 151 case ('g'): 152 /* FALLTHROUGH */ 153 case ('k'): 154 /* FALLTHROUGH */ 155 case ('M'): 156 /* FALLTHROUGH */ 157 case ('m'): 158 /* FALLTHROUGH */ 159 case ('n'): 160 /* FALLTHROUGH */ 161 case ('V'): 162 /* FALLTHROUGH */ 163 case ('Y'): 164 if (ESCAPE_ERROR == gly) 165 gly = ESCAPE_IGNORE; 166 /* FALLTHROUGH */ 167 case ('f'): 168 if (ESCAPE_ERROR == gly) 169 gly = ESCAPE_FONT; 170 171 rstart= &cp[i]; 172 if (start) 173 *start = rstart; 174 175 switch (cp[i++]) { 176 case ('('): 177 lim = 2; 178 break; 179 case ('['): 180 term = ']'; 181 break; 182 default: 183 lim = 1; 184 i--; 185 break; 186 } 187 break; 188 189 /* 190 * These escapes are of the form \X'Y', where 'X' is the trigger 191 * and 'Y' is any string. These have opaque sub-strings. 192 */ 193 case ('A'): 194 /* FALLTHROUGH */ 195 case ('b'): 196 /* FALLTHROUGH */ 197 case ('D'): 198 /* FALLTHROUGH */ 199 case ('o'): 200 /* FALLTHROUGH */ 201 case ('R'): 202 /* FALLTHROUGH */ 203 case ('X'): 204 /* FALLTHROUGH */ 205 case ('Z'): 206 if ('\'' != cp[i++]) 207 return(ESCAPE_ERROR); 208 gly = ESCAPE_IGNORE; 209 term = '\''; 210 break; 211 212 /* 213 * These escapes are of the form \X'N', where 'X' is the trigger 214 * and 'N' resolves to a numerical expression. 215 */ 216 case ('B'): 217 /* FALLTHROUGH */ 218 case ('h'): 219 /* FALLTHROUGH */ 220 case ('H'): 221 /* FALLTHROUGH */ 222 case ('L'): 223 /* FALLTHROUGH */ 224 case ('l'): 225 /* FALLTHROUGH */ 226 case ('N'): 227 if (ESCAPE_ERROR == gly) 228 gly = ESCAPE_NUMBERED; 229 /* FALLTHROUGH */ 230 case ('S'): 231 /* FALLTHROUGH */ 232 case ('v'): 233 /* FALLTHROUGH */ 234 case ('w'): 235 /* FALLTHROUGH */ 236 case ('x'): 237 if (ESCAPE_ERROR == gly) 238 gly = ESCAPE_IGNORE; 239 if ('\'' != cp[i++]) 240 return(ESCAPE_ERROR); 241 term = numeric = '\''; 242 break; 243 244 /* 245 * Sizes get a special category of their own. 246 */ 247 case ('s'): 248 gly = ESCAPE_IGNORE; 249 250 rstart = &cp[i]; 251 if (start) 252 *start = rstart; 253 254 /* See +/- counts as a sign. */ 255 c = cp[i]; 256 if ('+' == c || '-' == c || ASCII_HYPH == c) 257 ++i; 258 259 switch (cp[i++]) { 260 case ('('): 261 lim = 2; 262 break; 263 case ('['): 264 term = numeric = ']'; 265 break; 266 case ('\''): 267 term = numeric = '\''; 268 break; 269 default: 270 lim = 1; 271 i--; 272 break; 273 } 274 275 /* See +/- counts as a sign. */ 276 c = cp[i]; 277 if ('+' == c || '-' == c || ASCII_HYPH == c) 278 ++i; 279 280 break; 281 282 /* 283 * Anything else is assumed to be a glyph. 284 */ 285 default: 286 gly = ESCAPE_SPECIAL; 287 lim = 1; 288 i--; 289 break; 290 } 291 292 assert(ESCAPE_ERROR != gly); 293 294 rstart = &cp[i]; 295 if (start) 296 *start = rstart; 297 298 /* 299 * If a terminating block has been specified, we need to 300 * handle the case of recursion, which could have their 301 * own terminating blocks that mess up our parse. This, by the 302 * way, means that the "start" and "size" values will be 303 * effectively meaningless. 304 */ 305 306 ssz = 0; 307 if (numeric && -1 == (ssz = numescape(&cp[i]))) 308 return(ESCAPE_ERROR); 309 310 i += ssz; 311 rlim = -1; 312 313 /* 314 * We have a character terminator. Try to read up to that 315 * character. If we can't (i.e., we hit the nil), then return 316 * an error; if we can, calculate our length, read past the 317 * terminating character, and exit. 318 */ 319 320 if ('\0' != term) { 321 *end = strchr(&cp[i], term); 322 if ('\0' == *end) 323 return(ESCAPE_ERROR); 324 325 rlim = *end - &cp[i]; 326 if (sz) 327 *sz = rlim; 328 (*end)++; 329 goto out; 330 } 331 332 assert(lim > 0); 333 334 /* 335 * We have a numeric limit. If the string is shorter than that, 336 * stop and return an error. Else adjust our endpoint, length, 337 * and return the current glyph. 338 */ 339 340 if ((size_t)lim > strlen(&cp[i])) 341 return(ESCAPE_ERROR); 342 343 rlim = lim; 344 if (sz) 345 *sz = rlim; 346 347 *end = &cp[i] + lim; 348 349 out: 350 assert(rlim >= 0 && rstart); 351 352 /* Run post-processors. */ 353 354 switch (gly) { 355 case (ESCAPE_FONT): 356 if (1 != rlim) 357 break; 358 switch (*rstart) { 359 case ('3'): 360 /* FALLTHROUGH */ 361 case ('B'): 362 gly = ESCAPE_FONTBOLD; 363 break; 364 case ('2'): 365 /* FALLTHROUGH */ 366 case ('I'): 367 gly = ESCAPE_FONTITALIC; 368 break; 369 case ('P'): 370 gly = ESCAPE_FONTPREV; 371 break; 372 case ('1'): 373 /* FALLTHROUGH */ 374 case ('R'): 375 gly = ESCAPE_FONTROMAN; 376 break; 377 } 378 break; 379 case (ESCAPE_SPECIAL): 380 if (1 != rlim) 381 break; 382 if ('c' == *rstart) 383 gly = ESCAPE_NOSPACE; 384 break; 385 default: 386 break; 387 } 388 389 return(gly); 390 } 391 392 void * 393 mandoc_calloc(size_t num, size_t size) 394 { 395 void *ptr; 396 397 ptr = calloc(num, size); 398 if (NULL == ptr) { 399 perror(NULL); 400 exit((int)MANDOCLEVEL_SYSERR); 401 } 402 403 return(ptr); 404 } 405 406 407 void * 408 mandoc_malloc(size_t size) 409 { 410 void *ptr; 411 412 ptr = malloc(size); 413 if (NULL == ptr) { 414 perror(NULL); 415 exit((int)MANDOCLEVEL_SYSERR); 416 } 417 418 return(ptr); 419 } 420 421 422 void * 423 mandoc_realloc(void *ptr, size_t size) 424 { 425 426 ptr = realloc(ptr, size); 427 if (NULL == ptr) { 428 perror(NULL); 429 exit((int)MANDOCLEVEL_SYSERR); 430 } 431 432 return(ptr); 433 } 434 435 char * 436 mandoc_strndup(const char *ptr, size_t sz) 437 { 438 char *p; 439 440 p = mandoc_malloc(sz + 1); 441 memcpy(p, ptr, sz); 442 p[(int)sz] = '\0'; 443 return(p); 444 } 445 446 char * 447 mandoc_strdup(const char *ptr) 448 { 449 char *p; 450 451 p = strdup(ptr); 452 if (NULL == p) { 453 perror(NULL); 454 exit((int)MANDOCLEVEL_SYSERR); 455 } 456 457 return(p); 458 } 459 460 /* 461 * Parse a quoted or unquoted roff-style request or macro argument. 462 * Return a pointer to the parsed argument, which is either the original 463 * pointer or advanced by one byte in case the argument is quoted. 464 * Null-terminate the argument in place. 465 * Collapse pairs of quotes inside quoted arguments. 466 * Advance the argument pointer to the next argument, 467 * or to the null byte terminating the argument line. 468 */ 469 char * 470 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 471 { 472 char *start, *cp; 473 int quoted, pairs, white; 474 475 /* Quoting can only start with a new word. */ 476 start = *cpp; 477 quoted = 0; 478 if ('"' == *start) { 479 quoted = 1; 480 start++; 481 } 482 483 pairs = 0; 484 white = 0; 485 for (cp = start; '\0' != *cp; cp++) { 486 /* Move left after quoted quotes and escaped backslashes. */ 487 if (pairs) 488 cp[-pairs] = cp[0]; 489 if ('\\' == cp[0]) { 490 if ('\\' == cp[1]) { 491 /* Poor man's copy mode. */ 492 pairs++; 493 cp++; 494 } else if (0 == quoted && ' ' == cp[1]) 495 /* Skip escaped blanks. */ 496 cp++; 497 } else if (0 == quoted) { 498 if (' ' == cp[0]) { 499 /* Unescaped blanks end unquoted args. */ 500 white = 1; 501 break; 502 } 503 } else if ('"' == cp[0]) { 504 if ('"' == cp[1]) { 505 /* Quoted quotes collapse. */ 506 pairs++; 507 cp++; 508 } else { 509 /* Unquoted quotes end quoted args. */ 510 quoted = 2; 511 break; 512 } 513 } 514 } 515 516 /* Quoted argument without a closing quote. */ 517 if (1 == quoted) 518 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); 519 520 /* Null-terminate this argument and move to the next one. */ 521 if (pairs) 522 cp[-pairs] = '\0'; 523 if ('\0' != *cp) { 524 *cp++ = '\0'; 525 while (' ' == *cp) 526 cp++; 527 } 528 *pos += (int)(cp - start) + (quoted ? 1 : 0); 529 *cpp = cp; 530 531 if ('\0' == *cp && (white || ' ' == cp[-1])) 532 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL); 533 534 return(start); 535 } 536 537 static int 538 a2time(time_t *t, const char *fmt, const char *p) 539 { 540 struct tm tm; 541 char *pp; 542 543 memset(&tm, 0, sizeof(struct tm)); 544 545 pp = NULL; 546 #ifdef HAVE_STRPTIME 547 pp = strptime(p, fmt, &tm); 548 #endif 549 if (NULL != pp && '\0' == *pp) { 550 *t = mktime(&tm); 551 return(1); 552 } 553 554 return(0); 555 } 556 557 static char * 558 time2a(time_t t) 559 { 560 struct tm *tm; 561 char *buf, *p; 562 size_t ssz; 563 int isz; 564 565 tm = localtime(&t); 566 567 /* 568 * Reserve space: 569 * up to 9 characters for the month (September) + blank 570 * up to 2 characters for the day + comma + blank 571 * 4 characters for the year and a terminating '\0' 572 */ 573 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 574 575 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 576 goto fail; 577 p += (int)ssz; 578 579 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 580 goto fail; 581 p += isz; 582 583 if (0 == strftime(p, 4 + 1, "%Y", tm)) 584 goto fail; 585 return(buf); 586 587 fail: 588 free(buf); 589 return(NULL); 590 } 591 592 char * 593 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 594 { 595 char *out; 596 time_t t; 597 598 if (NULL == in || '\0' == *in || 599 0 == strcmp(in, "$" "Mdocdate$")) { 600 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL); 601 time(&t); 602 } 603 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 604 !a2time(&t, "%b %d, %Y", in) && 605 !a2time(&t, "%Y-%m-%d", in)) { 606 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL); 607 t = 0; 608 } 609 out = t ? time2a(t) : NULL; 610 return(out ? out : mandoc_strdup(in)); 611 } 612 613 int 614 mandoc_eos(const char *p, size_t sz, int enclosed) 615 { 616 const char *q; 617 int found; 618 619 if (0 == sz) 620 return(0); 621 622 /* 623 * End-of-sentence recognition must include situations where 624 * some symbols, such as `)', allow prior EOS punctuation to 625 * propagate outward. 626 */ 627 628 found = 0; 629 for (q = p + (int)sz - 1; q >= p; q--) { 630 switch (*q) { 631 case ('\"'): 632 /* FALLTHROUGH */ 633 case ('\''): 634 /* FALLTHROUGH */ 635 case (']'): 636 /* FALLTHROUGH */ 637 case (')'): 638 if (0 == found) 639 enclosed = 1; 640 break; 641 case ('.'): 642 /* FALLTHROUGH */ 643 case ('!'): 644 /* FALLTHROUGH */ 645 case ('?'): 646 found = 1; 647 break; 648 default: 649 return(found && (!enclosed || isalnum((unsigned char)*q))); 650 } 651 } 652 653 return(found && !enclosed); 654 } 655 656 /* 657 * Find out whether a line is a macro line or not. If it is, adjust the 658 * current position and return one; if it isn't, return zero and don't 659 * change the current position. 660 */ 661 int 662 mandoc_getcontrol(const char *cp, int *ppos) 663 { 664 int pos; 665 666 pos = *ppos; 667 668 if ('\\' == cp[pos] && '.' == cp[pos + 1]) 669 pos += 2; 670 else if ('.' == cp[pos] || '\'' == cp[pos]) 671 pos++; 672 else 673 return(0); 674 675 while (' ' == cp[pos] || '\t' == cp[pos]) 676 pos++; 677 678 *ppos = pos; 679 return(1); 680 } 681 682 /* 683 * Convert a string to a long that may not be <0. 684 * If the string is invalid, or is less than 0, return -1. 685 */ 686 int 687 mandoc_strntoi(const char *p, size_t sz, int base) 688 { 689 char buf[32]; 690 char *ep; 691 long v; 692 693 if (sz > 31) 694 return(-1); 695 696 memcpy(buf, p, sz); 697 buf[(int)sz] = '\0'; 698 699 errno = 0; 700 v = strtol(buf, &ep, base); 701 702 if (buf[0] == '\0' || *ep != '\0') 703 return(-1); 704 705 if (v > INT_MAX) 706 v = INT_MAX; 707 if (v < INT_MIN) 708 v = INT_MIN; 709 710 return((int)v); 711 } 712