1 /* $Id: mandoc.c,v 1.28 2011/09/18 15:54:48 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc.h" 30 #include "libmandoc.h" 31 32 #define DATESIZE 32 33 34 static int a2time(time_t *, const char *, const char *); 35 static char *time2a(time_t); 36 static int numescape(const char *); 37 38 /* 39 * Pass over recursive numerical expressions. This context of this 40 * function is important: it's only called within character-terminating 41 * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial 42 * recursion: we don't care about what's in these blocks. 43 * This returns the number of characters skipped or -1 if an error 44 * occurs (the caller should bail). 45 */ 46 static int 47 numescape(const char *start) 48 { 49 int i; 50 size_t sz; 51 const char *cp; 52 53 i = 0; 54 55 /* The expression consists of a subexpression. */ 56 57 if ('\\' == start[i]) { 58 cp = &start[++i]; 59 /* 60 * Read past the end of the subexpression. 61 * Bail immediately on errors. 62 */ 63 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) 64 return(-1); 65 return(i + cp - &start[i]); 66 } 67 68 if ('(' != start[i++]) 69 return(0); 70 71 /* 72 * A parenthesised subexpression. Read until the closing 73 * parenthesis, making sure to handle any nested subexpressions 74 * that might ruin our parse. 75 */ 76 77 while (')' != start[i]) { 78 sz = strcspn(&start[i], ")\\"); 79 i += (int)sz; 80 81 if ('\0' == start[i]) 82 return(-1); 83 else if ('\\' != start[i]) 84 continue; 85 86 cp = &start[++i]; 87 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) 88 return(-1); 89 i += cp - &start[i]; 90 } 91 92 /* Read past the terminating ')'. */ 93 return(++i); 94 } 95 96 enum mandoc_esc 97 mandoc_escape(const char **end, const char **start, int *sz) 98 { 99 char c, term, numeric; 100 int i, lim, ssz, rlim; 101 const char *cp, *rstart; 102 enum mandoc_esc gly; 103 104 cp = *end; 105 rstart = cp; 106 if (start) 107 *start = rstart; 108 i = lim = 0; 109 gly = ESCAPE_ERROR; 110 term = numeric = '\0'; 111 112 switch ((c = cp[i++])) { 113 /* 114 * First the glyphs. There are several different forms of 115 * these, but each eventually returns a substring of the glyph 116 * name. 117 */ 118 case ('('): 119 gly = ESCAPE_SPECIAL; 120 lim = 2; 121 break; 122 case ('['): 123 gly = ESCAPE_SPECIAL; 124 /* 125 * Unicode escapes are defined in groff as \[uXXXX] to 126 * \[u10FFFF], where the contained value must be a valid 127 * Unicode codepoint. Here, however, only check whether 128 * it's not a zero-width escape. 129 */ 130 if ('u' == cp[i] && ']' != cp[i + 1]) 131 gly = ESCAPE_UNICODE; 132 term = ']'; 133 break; 134 case ('C'): 135 if ('\'' != cp[i]) 136 return(ESCAPE_ERROR); 137 gly = ESCAPE_SPECIAL; 138 term = '\''; 139 break; 140 141 /* 142 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 143 * 'X' is the trigger. These have opaque sub-strings. 144 */ 145 case ('F'): 146 /* FALLTHROUGH */ 147 case ('g'): 148 /* FALLTHROUGH */ 149 case ('k'): 150 /* FALLTHROUGH */ 151 case ('M'): 152 /* FALLTHROUGH */ 153 case ('m'): 154 /* FALLTHROUGH */ 155 case ('n'): 156 /* FALLTHROUGH */ 157 case ('V'): 158 /* FALLTHROUGH */ 159 case ('Y'): 160 if (ESCAPE_ERROR == gly) 161 gly = ESCAPE_IGNORE; 162 /* FALLTHROUGH */ 163 case ('f'): 164 if (ESCAPE_ERROR == gly) 165 gly = ESCAPE_FONT; 166 167 rstart= &cp[i]; 168 if (start) 169 *start = rstart; 170 171 switch (cp[i++]) { 172 case ('('): 173 lim = 2; 174 break; 175 case ('['): 176 term = ']'; 177 break; 178 default: 179 lim = 1; 180 i--; 181 break; 182 } 183 break; 184 185 /* 186 * These escapes are of the form \X'Y', where 'X' is the trigger 187 * and 'Y' is any string. These have opaque sub-strings. 188 */ 189 case ('A'): 190 /* FALLTHROUGH */ 191 case ('b'): 192 /* FALLTHROUGH */ 193 case ('D'): 194 /* FALLTHROUGH */ 195 case ('o'): 196 /* FALLTHROUGH */ 197 case ('R'): 198 /* FALLTHROUGH */ 199 case ('X'): 200 /* FALLTHROUGH */ 201 case ('Z'): 202 if ('\'' != cp[i++]) 203 return(ESCAPE_ERROR); 204 gly = ESCAPE_IGNORE; 205 term = '\''; 206 break; 207 208 /* 209 * These escapes are of the form \X'N', where 'X' is the trigger 210 * and 'N' resolves to a numerical expression. 211 */ 212 case ('B'): 213 /* FALLTHROUGH */ 214 case ('h'): 215 /* FALLTHROUGH */ 216 case ('H'): 217 /* FALLTHROUGH */ 218 case ('L'): 219 /* FALLTHROUGH */ 220 case ('l'): 221 /* FALLTHROUGH */ 222 case ('N'): 223 if (ESCAPE_ERROR == gly) 224 gly = ESCAPE_NUMBERED; 225 /* FALLTHROUGH */ 226 case ('S'): 227 /* FALLTHROUGH */ 228 case ('v'): 229 /* FALLTHROUGH */ 230 case ('w'): 231 /* FALLTHROUGH */ 232 case ('x'): 233 if (ESCAPE_ERROR == gly) 234 gly = ESCAPE_IGNORE; 235 if ('\'' != cp[i++]) 236 return(ESCAPE_ERROR); 237 term = numeric = '\''; 238 break; 239 240 /* 241 * Sizes get a special category of their own. 242 */ 243 case ('s'): 244 gly = ESCAPE_IGNORE; 245 246 rstart = &cp[i]; 247 if (start) 248 *start = rstart; 249 250 /* See +/- counts as a sign. */ 251 c = cp[i]; 252 if ('+' == c || '-' == c || ASCII_HYPH == c) 253 ++i; 254 255 switch (cp[i++]) { 256 case ('('): 257 lim = 2; 258 break; 259 case ('['): 260 term = numeric = ']'; 261 break; 262 case ('\''): 263 term = numeric = '\''; 264 break; 265 default: 266 lim = 1; 267 i--; 268 break; 269 } 270 271 /* See +/- counts as a sign. */ 272 c = cp[i]; 273 if ('+' == c || '-' == c || ASCII_HYPH == c) 274 ++i; 275 276 break; 277 278 /* 279 * Anything else is assumed to be a glyph. 280 */ 281 default: 282 gly = ESCAPE_SPECIAL; 283 lim = 1; 284 i--; 285 break; 286 } 287 288 assert(ESCAPE_ERROR != gly); 289 290 rstart = &cp[i]; 291 if (start) 292 *start = rstart; 293 294 /* 295 * If a terminating block has been specified, we need to 296 * handle the case of recursion, which could have their 297 * own terminating blocks that mess up our parse. This, by the 298 * way, means that the "start" and "size" values will be 299 * effectively meaningless. 300 */ 301 302 ssz = 0; 303 if (numeric && -1 == (ssz = numescape(&cp[i]))) 304 return(ESCAPE_ERROR); 305 306 i += ssz; 307 rlim = -1; 308 309 /* 310 * We have a character terminator. Try to read up to that 311 * character. If we can't (i.e., we hit the nil), then return 312 * an error; if we can, calculate our length, read past the 313 * terminating character, and exit. 314 */ 315 316 if ('\0' != term) { 317 *end = strchr(&cp[i], term); 318 if ('\0' == *end) 319 return(ESCAPE_ERROR); 320 321 rlim = *end - &cp[i]; 322 if (sz) 323 *sz = rlim; 324 (*end)++; 325 goto out; 326 } 327 328 assert(lim > 0); 329 330 /* 331 * We have a numeric limit. If the string is shorter than that, 332 * stop and return an error. Else adjust our endpoint, length, 333 * and return the current glyph. 334 */ 335 336 if ((size_t)lim > strlen(&cp[i])) 337 return(ESCAPE_ERROR); 338 339 rlim = lim; 340 if (sz) 341 *sz = rlim; 342 343 *end = &cp[i] + lim; 344 345 out: 346 assert(rlim >= 0 && rstart); 347 348 /* Run post-processors. */ 349 350 switch (gly) { 351 case (ESCAPE_FONT): 352 if (1 != rlim) 353 break; 354 switch (*rstart) { 355 case ('3'): 356 /* FALLTHROUGH */ 357 case ('B'): 358 gly = ESCAPE_FONTBOLD; 359 break; 360 case ('2'): 361 /* FALLTHROUGH */ 362 case ('I'): 363 gly = ESCAPE_FONTITALIC; 364 break; 365 case ('P'): 366 gly = ESCAPE_FONTPREV; 367 break; 368 case ('1'): 369 /* FALLTHROUGH */ 370 case ('R'): 371 gly = ESCAPE_FONTROMAN; 372 break; 373 } 374 break; 375 case (ESCAPE_SPECIAL): 376 if (1 != rlim) 377 break; 378 if ('c' == *rstart) 379 gly = ESCAPE_NOSPACE; 380 break; 381 default: 382 break; 383 } 384 385 return(gly); 386 } 387 388 void * 389 mandoc_calloc(size_t num, size_t size) 390 { 391 void *ptr; 392 393 ptr = calloc(num, size); 394 if (NULL == ptr) { 395 perror(NULL); 396 exit((int)MANDOCLEVEL_SYSERR); 397 } 398 399 return(ptr); 400 } 401 402 403 void * 404 mandoc_malloc(size_t size) 405 { 406 void *ptr; 407 408 ptr = malloc(size); 409 if (NULL == ptr) { 410 perror(NULL); 411 exit((int)MANDOCLEVEL_SYSERR); 412 } 413 414 return(ptr); 415 } 416 417 418 void * 419 mandoc_realloc(void *ptr, size_t size) 420 { 421 422 ptr = realloc(ptr, size); 423 if (NULL == ptr) { 424 perror(NULL); 425 exit((int)MANDOCLEVEL_SYSERR); 426 } 427 428 return(ptr); 429 } 430 431 char * 432 mandoc_strndup(const char *ptr, size_t sz) 433 { 434 char *p; 435 436 p = mandoc_malloc(sz + 1); 437 memcpy(p, ptr, sz); 438 p[(int)sz] = '\0'; 439 return(p); 440 } 441 442 char * 443 mandoc_strdup(const char *ptr) 444 { 445 char *p; 446 447 p = strdup(ptr); 448 if (NULL == p) { 449 perror(NULL); 450 exit((int)MANDOCLEVEL_SYSERR); 451 } 452 453 return(p); 454 } 455 456 /* 457 * Parse a quoted or unquoted roff-style request or macro argument. 458 * Return a pointer to the parsed argument, which is either the original 459 * pointer or advanced by one byte in case the argument is quoted. 460 * Null-terminate the argument in place. 461 * Collapse pairs of quotes inside quoted arguments. 462 * Advance the argument pointer to the next argument, 463 * or to the null byte terminating the argument line. 464 */ 465 char * 466 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 467 { 468 char *start, *cp; 469 int quoted, pairs, white; 470 471 /* Quoting can only start with a new word. */ 472 start = *cpp; 473 quoted = 0; 474 if ('"' == *start) { 475 quoted = 1; 476 start++; 477 } 478 479 pairs = 0; 480 white = 0; 481 for (cp = start; '\0' != *cp; cp++) { 482 /* Move left after quoted quotes and escaped backslashes. */ 483 if (pairs) 484 cp[-pairs] = cp[0]; 485 if ('\\' == cp[0]) { 486 if ('\\' == cp[1]) { 487 /* Poor man's copy mode. */ 488 pairs++; 489 cp++; 490 } else if (0 == quoted && ' ' == cp[1]) 491 /* Skip escaped blanks. */ 492 cp++; 493 } else if (0 == quoted) { 494 if (' ' == cp[0]) { 495 /* Unescaped blanks end unquoted args. */ 496 white = 1; 497 break; 498 } 499 } else if ('"' == cp[0]) { 500 if ('"' == cp[1]) { 501 /* Quoted quotes collapse. */ 502 pairs++; 503 cp++; 504 } else { 505 /* Unquoted quotes end quoted args. */ 506 quoted = 2; 507 break; 508 } 509 } 510 } 511 512 /* Quoted argument without a closing quote. */ 513 if (1 == quoted) 514 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); 515 516 /* Null-terminate this argument and move to the next one. */ 517 if (pairs) 518 cp[-pairs] = '\0'; 519 if ('\0' != *cp) { 520 *cp++ = '\0'; 521 while (' ' == *cp) 522 cp++; 523 } 524 *pos += (int)(cp - start) + (quoted ? 1 : 0); 525 *cpp = cp; 526 527 if ('\0' == *cp && (white || ' ' == cp[-1])) 528 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL); 529 530 return(start); 531 } 532 533 static int 534 a2time(time_t *t, const char *fmt, const char *p) 535 { 536 struct tm tm; 537 char *pp; 538 539 memset(&tm, 0, sizeof(struct tm)); 540 541 pp = strptime(p, fmt, &tm); 542 if (NULL != pp && '\0' == *pp) { 543 *t = mktime(&tm); 544 return(1); 545 } 546 547 return(0); 548 } 549 550 static char * 551 time2a(time_t t) 552 { 553 struct tm *tm; 554 char *buf, *p; 555 size_t ssz; 556 int isz; 557 558 tm = localtime(&t); 559 560 /* 561 * Reserve space: 562 * up to 9 characters for the month (September) + blank 563 * up to 2 characters for the day + comma + blank 564 * 4 characters for the year and a terminating '\0' 565 */ 566 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 567 568 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 569 goto fail; 570 p += (int)ssz; 571 572 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 573 goto fail; 574 p += isz; 575 576 if (0 == strftime(p, 4 + 1, "%Y", tm)) 577 goto fail; 578 return(buf); 579 580 fail: 581 free(buf); 582 return(NULL); 583 } 584 585 char * 586 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 587 { 588 char *out; 589 time_t t; 590 591 if (NULL == in || '\0' == *in || 592 0 == strcmp(in, "$" "Mdocdate$")) { 593 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL); 594 time(&t); 595 } 596 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 597 !a2time(&t, "%b %d, %Y", in) && 598 !a2time(&t, "%Y-%m-%d", in)) { 599 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL); 600 t = 0; 601 } 602 out = t ? time2a(t) : NULL; 603 return(out ? out : mandoc_strdup(in)); 604 } 605 606 int 607 mandoc_eos(const char *p, size_t sz, int enclosed) 608 { 609 const char *q; 610 int found; 611 612 if (0 == sz) 613 return(0); 614 615 /* 616 * End-of-sentence recognition must include situations where 617 * some symbols, such as `)', allow prior EOS punctuation to 618 * propagate outward. 619 */ 620 621 found = 0; 622 for (q = p + (int)sz - 1; q >= p; q--) { 623 switch (*q) { 624 case ('\"'): 625 /* FALLTHROUGH */ 626 case ('\''): 627 /* FALLTHROUGH */ 628 case (']'): 629 /* FALLTHROUGH */ 630 case (')'): 631 if (0 == found) 632 enclosed = 1; 633 break; 634 case ('.'): 635 /* FALLTHROUGH */ 636 case ('!'): 637 /* FALLTHROUGH */ 638 case ('?'): 639 found = 1; 640 break; 641 default: 642 return(found && (!enclosed || isalnum((unsigned char)*q))); 643 } 644 } 645 646 return(found && !enclosed); 647 } 648 649 /* 650 * Find out whether a line is a macro line or not. If it is, adjust the 651 * current position and return one; if it isn't, return zero and don't 652 * change the current position. 653 */ 654 int 655 mandoc_getcontrol(const char *cp, int *ppos) 656 { 657 int pos; 658 659 pos = *ppos; 660 661 if ('\\' == cp[pos] && '.' == cp[pos + 1]) 662 pos += 2; 663 else if ('.' == cp[pos] || '\'' == cp[pos]) 664 pos++; 665 else 666 return(0); 667 668 while (' ' == cp[pos] || '\t' == cp[pos]) 669 pos++; 670 671 *ppos = pos; 672 return(1); 673 } 674 675 /* 676 * Convert a string to a long that may not be <0. 677 * If the string is invalid, or is less than 0, return -1. 678 */ 679 int 680 mandoc_strntoi(const char *p, size_t sz, int base) 681 { 682 char buf[32]; 683 char *ep; 684 long v; 685 686 if (sz > 31) 687 return(-1); 688 689 memcpy(buf, p, sz); 690 buf[(int)sz] = '\0'; 691 692 errno = 0; 693 v = strtol(buf, &ep, base); 694 695 if (buf[0] == '\0' || *ep != '\0') 696 return(-1); 697 698 if (v > INT_MAX) 699 v = INT_MAX; 700 if (v < INT_MIN) 701 v = INT_MIN; 702 703 return((int)v); 704 } 705