1 /* $Id: mandoc.c,v 1.35 2012/07/07 18:27:36 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc.h" 30 #include "libmandoc.h" 31 32 #define DATESIZE 32 33 34 static int a2time(time_t *, const char *, const char *); 35 static char *time2a(time_t); 36 37 38 enum mandoc_esc 39 mandoc_escape(const char **end, const char **start, int *sz) 40 { 41 const char *local_start; 42 int local_sz; 43 char term; 44 enum mandoc_esc gly; 45 46 /* 47 * When the caller doesn't provide return storage, 48 * use local storage. 49 */ 50 51 if (NULL == start) 52 start = &local_start; 53 if (NULL == sz) 54 sz = &local_sz; 55 56 /* 57 * Beyond the backslash, at least one input character 58 * is part of the escape sequence. With one exception 59 * (see below), that character won't be returned. 60 */ 61 62 gly = ESCAPE_ERROR; 63 *start = ++*end; 64 *sz = 0; 65 term = '\0'; 66 67 switch ((*start)[-1]) { 68 /* 69 * First the glyphs. There are several different forms of 70 * these, but each eventually returns a substring of the glyph 71 * name. 72 */ 73 case ('('): 74 gly = ESCAPE_SPECIAL; 75 *sz = 2; 76 break; 77 case ('['): 78 gly = ESCAPE_SPECIAL; 79 /* 80 * Unicode escapes are defined in groff as \[uXXXX] to 81 * \[u10FFFF], where the contained value must be a valid 82 * Unicode codepoint. Here, however, only check whether 83 * it's not a zero-width escape. 84 */ 85 if ('u' == (*start)[0] && ']' != (*start)[1]) 86 gly = ESCAPE_UNICODE; 87 term = ']'; 88 break; 89 case ('C'): 90 if ('\'' != **start) 91 return(ESCAPE_ERROR); 92 gly = ESCAPE_SPECIAL; 93 *start = ++*end; 94 term = '\''; 95 break; 96 97 /* 98 * The \z escape is supposed to output the following 99 * character without advancing the cursor position. 100 * Since we are mostly dealing with terminal mode, 101 * let us just skip the next character. 102 */ 103 case ('z'): 104 return(ESCAPE_SKIPCHAR); 105 106 /* 107 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 108 * 'X' is the trigger. These have opaque sub-strings. 109 */ 110 case ('F'): 111 /* FALLTHROUGH */ 112 case ('g'): 113 /* FALLTHROUGH */ 114 case ('k'): 115 /* FALLTHROUGH */ 116 case ('M'): 117 /* FALLTHROUGH */ 118 case ('m'): 119 /* FALLTHROUGH */ 120 case ('n'): 121 /* FALLTHROUGH */ 122 case ('V'): 123 /* FALLTHROUGH */ 124 case ('Y'): 125 gly = ESCAPE_IGNORE; 126 /* FALLTHROUGH */ 127 case ('f'): 128 if (ESCAPE_ERROR == gly) 129 gly = ESCAPE_FONT; 130 switch (**start) { 131 case ('('): 132 *start = ++*end; 133 *sz = 2; 134 break; 135 case ('['): 136 *start = ++*end; 137 term = ']'; 138 break; 139 default: 140 *sz = 1; 141 break; 142 } 143 break; 144 145 /* 146 * These escapes are of the form \X'Y', where 'X' is the trigger 147 * and 'Y' is any string. These have opaque sub-strings. 148 */ 149 case ('A'): 150 /* FALLTHROUGH */ 151 case ('b'): 152 /* FALLTHROUGH */ 153 case ('D'): 154 /* FALLTHROUGH */ 155 case ('o'): 156 /* FALLTHROUGH */ 157 case ('R'): 158 /* FALLTHROUGH */ 159 case ('X'): 160 /* FALLTHROUGH */ 161 case ('Z'): 162 if ('\'' != **start) 163 return(ESCAPE_ERROR); 164 gly = ESCAPE_IGNORE; 165 *start = ++*end; 166 term = '\''; 167 break; 168 169 /* 170 * These escapes are of the form \X'N', where 'X' is the trigger 171 * and 'N' resolves to a numerical expression. 172 */ 173 case ('B'): 174 /* FALLTHROUGH */ 175 case ('h'): 176 /* FALLTHROUGH */ 177 case ('H'): 178 /* FALLTHROUGH */ 179 case ('L'): 180 /* FALLTHROUGH */ 181 case ('l'): 182 gly = ESCAPE_NUMBERED; 183 /* FALLTHROUGH */ 184 case ('S'): 185 /* FALLTHROUGH */ 186 case ('v'): 187 /* FALLTHROUGH */ 188 case ('w'): 189 /* FALLTHROUGH */ 190 case ('x'): 191 if ('\'' != **start) 192 return(ESCAPE_ERROR); 193 if (ESCAPE_ERROR == gly) 194 gly = ESCAPE_IGNORE; 195 *start = ++*end; 196 term = '\''; 197 break; 198 199 /* 200 * Special handling for the numbered character escape. 201 * XXX Do any other escapes need similar handling? 202 */ 203 case ('N'): 204 if ('\0' == **start) 205 return(ESCAPE_ERROR); 206 (*end)++; 207 if (isdigit((unsigned char)**start)) { 208 *sz = 1; 209 return(ESCAPE_IGNORE); 210 } 211 (*start)++; 212 while (isdigit((unsigned char)**end)) 213 (*end)++; 214 *sz = *end - *start; 215 if ('\0' != **end) 216 (*end)++; 217 return(ESCAPE_NUMBERED); 218 219 /* 220 * Sizes get a special category of their own. 221 */ 222 case ('s'): 223 gly = ESCAPE_IGNORE; 224 225 /* See +/- counts as a sign. */ 226 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 227 (*end)++; 228 229 switch (**end) { 230 case ('('): 231 *start = ++*end; 232 *sz = 2; 233 break; 234 case ('['): 235 *start = ++*end; 236 term = ']'; 237 break; 238 case ('\''): 239 *start = ++*end; 240 term = '\''; 241 break; 242 default: 243 *sz = 1; 244 break; 245 } 246 247 break; 248 249 /* 250 * Anything else is assumed to be a glyph. 251 * In this case, pass back the character after the backslash. 252 */ 253 default: 254 gly = ESCAPE_SPECIAL; 255 *start = --*end; 256 *sz = 1; 257 break; 258 } 259 260 assert(ESCAPE_ERROR != gly); 261 262 /* 263 * Read up to the terminating character, 264 * paying attention to nested escapes. 265 */ 266 267 if ('\0' != term) { 268 while (**end != term) { 269 switch (**end) { 270 case ('\0'): 271 return(ESCAPE_ERROR); 272 case ('\\'): 273 (*end)++; 274 if (ESCAPE_ERROR == 275 mandoc_escape(end, NULL, NULL)) 276 return(ESCAPE_ERROR); 277 break; 278 default: 279 (*end)++; 280 break; 281 } 282 } 283 *sz = (*end)++ - *start; 284 } else { 285 assert(*sz > 0); 286 if ((size_t)*sz > strlen(*start)) 287 return(ESCAPE_ERROR); 288 *end += *sz; 289 } 290 291 /* Run post-processors. */ 292 293 switch (gly) { 294 case (ESCAPE_FONT): 295 /* 296 * Pretend that the constant-width font modes are the 297 * same as the regular font modes. 298 */ 299 if (2 == *sz && 'C' == **start) { 300 (*start)++; 301 (*sz)--; 302 } else if (1 != *sz) 303 break; 304 305 switch (**start) { 306 case ('3'): 307 /* FALLTHROUGH */ 308 case ('B'): 309 gly = ESCAPE_FONTBOLD; 310 break; 311 case ('2'): 312 /* FALLTHROUGH */ 313 case ('I'): 314 gly = ESCAPE_FONTITALIC; 315 break; 316 case ('P'): 317 gly = ESCAPE_FONTPREV; 318 break; 319 case ('1'): 320 /* FALLTHROUGH */ 321 case ('R'): 322 gly = ESCAPE_FONTROMAN; 323 break; 324 } 325 break; 326 case (ESCAPE_SPECIAL): 327 if (1 == *sz && 'c' == **start) 328 gly = ESCAPE_NOSPACE; 329 break; 330 default: 331 break; 332 } 333 334 return(gly); 335 } 336 337 void * 338 mandoc_calloc(size_t num, size_t size) 339 { 340 void *ptr; 341 342 ptr = calloc(num, size); 343 if (NULL == ptr) { 344 perror(NULL); 345 exit((int)MANDOCLEVEL_SYSERR); 346 } 347 348 return(ptr); 349 } 350 351 352 void * 353 mandoc_malloc(size_t size) 354 { 355 void *ptr; 356 357 ptr = malloc(size); 358 if (NULL == ptr) { 359 perror(NULL); 360 exit((int)MANDOCLEVEL_SYSERR); 361 } 362 363 return(ptr); 364 } 365 366 367 void * 368 mandoc_realloc(void *ptr, size_t size) 369 { 370 371 ptr = realloc(ptr, size); 372 if (NULL == ptr) { 373 perror(NULL); 374 exit((int)MANDOCLEVEL_SYSERR); 375 } 376 377 return(ptr); 378 } 379 380 char * 381 mandoc_strndup(const char *ptr, size_t sz) 382 { 383 char *p; 384 385 p = mandoc_malloc(sz + 1); 386 memcpy(p, ptr, sz); 387 p[(int)sz] = '\0'; 388 return(p); 389 } 390 391 char * 392 mandoc_strdup(const char *ptr) 393 { 394 char *p; 395 396 p = strdup(ptr); 397 if (NULL == p) { 398 perror(NULL); 399 exit((int)MANDOCLEVEL_SYSERR); 400 } 401 402 return(p); 403 } 404 405 /* 406 * Parse a quoted or unquoted roff-style request or macro argument. 407 * Return a pointer to the parsed argument, which is either the original 408 * pointer or advanced by one byte in case the argument is quoted. 409 * Null-terminate the argument in place. 410 * Collapse pairs of quotes inside quoted arguments. 411 * Advance the argument pointer to the next argument, 412 * or to the null byte terminating the argument line. 413 */ 414 char * 415 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 416 { 417 char *start, *cp; 418 int quoted, pairs, white; 419 420 /* Quoting can only start with a new word. */ 421 start = *cpp; 422 quoted = 0; 423 if ('"' == *start) { 424 quoted = 1; 425 start++; 426 } 427 428 pairs = 0; 429 white = 0; 430 for (cp = start; '\0' != *cp; cp++) { 431 /* Move left after quoted quotes and escaped backslashes. */ 432 if (pairs) 433 cp[-pairs] = cp[0]; 434 if ('\\' == cp[0]) { 435 if ('\\' == cp[1]) { 436 /* Poor man's copy mode. */ 437 pairs++; 438 cp++; 439 } else if (0 == quoted && ' ' == cp[1]) 440 /* Skip escaped blanks. */ 441 cp++; 442 } else if (0 == quoted) { 443 if (' ' == cp[0]) { 444 /* Unescaped blanks end unquoted args. */ 445 white = 1; 446 break; 447 } 448 } else if ('"' == cp[0]) { 449 if ('"' == cp[1]) { 450 /* Quoted quotes collapse. */ 451 pairs++; 452 cp++; 453 } else { 454 /* Unquoted quotes end quoted args. */ 455 quoted = 2; 456 break; 457 } 458 } 459 } 460 461 /* Quoted argument without a closing quote. */ 462 if (1 == quoted) 463 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); 464 465 /* Null-terminate this argument and move to the next one. */ 466 if (pairs) 467 cp[-pairs] = '\0'; 468 if ('\0' != *cp) { 469 *cp++ = '\0'; 470 while (' ' == *cp) 471 cp++; 472 } 473 *pos += (int)(cp - start) + (quoted ? 1 : 0); 474 *cpp = cp; 475 476 if ('\0' == *cp && (white || ' ' == cp[-1])) 477 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL); 478 479 return(start); 480 } 481 482 static int 483 a2time(time_t *t, const char *fmt, const char *p) 484 { 485 struct tm tm; 486 char *pp; 487 488 memset(&tm, 0, sizeof(struct tm)); 489 490 pp = strptime(p, fmt, &tm); 491 if (NULL != pp && '\0' == *pp) { 492 *t = mktime(&tm); 493 return(1); 494 } 495 496 return(0); 497 } 498 499 static char * 500 time2a(time_t t) 501 { 502 struct tm *tm; 503 char *buf, *p; 504 size_t ssz; 505 int isz; 506 507 tm = localtime(&t); 508 509 /* 510 * Reserve space: 511 * up to 9 characters for the month (September) + blank 512 * up to 2 characters for the day + comma + blank 513 * 4 characters for the year and a terminating '\0' 514 */ 515 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 516 517 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 518 goto fail; 519 p += (int)ssz; 520 521 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 522 goto fail; 523 p += isz; 524 525 if (0 == strftime(p, 4 + 1, "%Y", tm)) 526 goto fail; 527 return(buf); 528 529 fail: 530 free(buf); 531 return(NULL); 532 } 533 534 char * 535 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 536 { 537 char *out; 538 time_t t; 539 540 if (NULL == in || '\0' == *in || 541 0 == strcmp(in, "$" "Mdocdate$")) { 542 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL); 543 time(&t); 544 } 545 else if (a2time(&t, "%Y-%m-%d", in)) 546 t = 0; 547 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 548 !a2time(&t, "%b %d, %Y", in)) { 549 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL); 550 t = 0; 551 } 552 out = t ? time2a(t) : NULL; 553 return(out ? out : mandoc_strdup(in)); 554 } 555 556 int 557 mandoc_eos(const char *p, size_t sz, int enclosed) 558 { 559 const char *q; 560 int found; 561 562 if (0 == sz) 563 return(0); 564 565 /* 566 * End-of-sentence recognition must include situations where 567 * some symbols, such as `)', allow prior EOS punctuation to 568 * propagate outward. 569 */ 570 571 found = 0; 572 for (q = p + (int)sz - 1; q >= p; q--) { 573 switch (*q) { 574 case ('\"'): 575 /* FALLTHROUGH */ 576 case ('\''): 577 /* FALLTHROUGH */ 578 case (']'): 579 /* FALLTHROUGH */ 580 case (')'): 581 if (0 == found) 582 enclosed = 1; 583 break; 584 case ('.'): 585 /* FALLTHROUGH */ 586 case ('!'): 587 /* FALLTHROUGH */ 588 case ('?'): 589 found = 1; 590 break; 591 default: 592 return(found && (!enclosed || isalnum((unsigned char)*q))); 593 } 594 } 595 596 return(found && !enclosed); 597 } 598 599 /* 600 * Convert a string to a long that may not be <0. 601 * If the string is invalid, or is less than 0, return -1. 602 */ 603 int 604 mandoc_strntoi(const char *p, size_t sz, int base) 605 { 606 char buf[32]; 607 char *ep; 608 long v; 609 610 if (sz > 31) 611 return(-1); 612 613 memcpy(buf, p, sz); 614 buf[(int)sz] = '\0'; 615 616 errno = 0; 617 v = strtol(buf, &ep, base); 618 619 if (buf[0] == '\0' || *ep != '\0') 620 return(-1); 621 622 if (v > INT_MAX) 623 v = INT_MAX; 624 if (v < INT_MIN) 625 v = INT_MIN; 626 627 return((int)v); 628 } 629