1 /* $OpenBSD: mandoc.c,v 1.75 2018/08/20 18:06:42 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc_aux.h" 30 #include "mandoc.h" 31 #include "roff.h" 32 #include "libmandoc.h" 33 34 static int a2time(time_t *, const char *, const char *); 35 static char *time2a(time_t); 36 37 38 enum mandoc_esc 39 mandoc_escape(const char **end, const char **start, int *sz) 40 { 41 const char *local_start; 42 int local_sz, c, i; 43 char term; 44 enum mandoc_esc gly; 45 46 /* 47 * When the caller doesn't provide return storage, 48 * use local storage. 49 */ 50 51 if (NULL == start) 52 start = &local_start; 53 if (NULL == sz) 54 sz = &local_sz; 55 56 /* 57 * Beyond the backslash, at least one input character 58 * is part of the escape sequence. With one exception 59 * (see below), that character won't be returned. 60 */ 61 62 gly = ESCAPE_ERROR; 63 *start = ++*end; 64 *sz = 0; 65 term = '\0'; 66 67 switch ((*start)[-1]) { 68 /* 69 * First the glyphs. There are several different forms of 70 * these, but each eventually returns a substring of the glyph 71 * name. 72 */ 73 case '(': 74 gly = ESCAPE_SPECIAL; 75 *sz = 2; 76 break; 77 case '[': 78 gly = ESCAPE_SPECIAL; 79 term = ']'; 80 break; 81 case 'C': 82 if ('\'' != **start) 83 return ESCAPE_ERROR; 84 *start = ++*end; 85 gly = ESCAPE_SPECIAL; 86 term = '\''; 87 break; 88 89 /* 90 * Escapes taking no arguments at all. 91 */ 92 case 'd': 93 case 'u': 94 case ',': 95 case '/': 96 return ESCAPE_IGNORE; 97 case 'p': 98 return ESCAPE_BREAK; 99 100 /* 101 * The \z escape is supposed to output the following 102 * character without advancing the cursor position. 103 * Since we are mostly dealing with terminal mode, 104 * let us just skip the next character. 105 */ 106 case 'z': 107 return ESCAPE_SKIPCHAR; 108 109 /* 110 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 111 * 'X' is the trigger. These have opaque sub-strings. 112 */ 113 case 'F': 114 case 'g': 115 case 'k': 116 case 'M': 117 case 'm': 118 case 'n': 119 case 'V': 120 case 'Y': 121 gly = ESCAPE_IGNORE; 122 /* FALLTHROUGH */ 123 case 'f': 124 if (ESCAPE_ERROR == gly) 125 gly = ESCAPE_FONT; 126 switch (**start) { 127 case '(': 128 *start = ++*end; 129 *sz = 2; 130 break; 131 case '[': 132 *start = ++*end; 133 term = ']'; 134 break; 135 default: 136 *sz = 1; 137 break; 138 } 139 break; 140 case '*': 141 if (strncmp(*start, "(.T", 3) != 0) 142 abort(); 143 gly = ESCAPE_DEVICE; 144 *start = ++*end; 145 *sz = 2; 146 break; 147 148 /* 149 * These escapes are of the form \X'Y', where 'X' is the trigger 150 * and 'Y' is any string. These have opaque sub-strings. 151 * The \B and \w escapes are handled in roff.c, roff_res(). 152 */ 153 case 'A': 154 case 'b': 155 case 'D': 156 case 'R': 157 case 'X': 158 case 'Z': 159 gly = ESCAPE_IGNORE; 160 /* FALLTHROUGH */ 161 case 'o': 162 if (**start == '\0') 163 return ESCAPE_ERROR; 164 if (gly == ESCAPE_ERROR) 165 gly = ESCAPE_OVERSTRIKE; 166 term = **start; 167 *start = ++*end; 168 break; 169 170 /* 171 * These escapes are of the form \X'N', where 'X' is the trigger 172 * and 'N' resolves to a numerical expression. 173 */ 174 case 'h': 175 case 'H': 176 case 'L': 177 case 'l': 178 case 'S': 179 case 'v': 180 case 'x': 181 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 182 if ('\0' != **start) 183 ++*end; 184 return ESCAPE_ERROR; 185 } 186 switch ((*start)[-1]) { 187 case 'h': 188 gly = ESCAPE_HORIZ; 189 break; 190 case 'l': 191 gly = ESCAPE_HLINE; 192 break; 193 default: 194 gly = ESCAPE_IGNORE; 195 break; 196 } 197 term = **start; 198 *start = ++*end; 199 break; 200 201 /* 202 * Special handling for the numbered character escape. 203 * XXX Do any other escapes need similar handling? 204 */ 205 case 'N': 206 if ('\0' == **start) 207 return ESCAPE_ERROR; 208 (*end)++; 209 if (isdigit((unsigned char)**start)) { 210 *sz = 1; 211 return ESCAPE_IGNORE; 212 } 213 (*start)++; 214 while (isdigit((unsigned char)**end)) 215 (*end)++; 216 *sz = *end - *start; 217 if ('\0' != **end) 218 (*end)++; 219 return ESCAPE_NUMBERED; 220 221 /* 222 * Sizes get a special category of their own. 223 */ 224 case 's': 225 gly = ESCAPE_IGNORE; 226 227 /* See +/- counts as a sign. */ 228 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 229 *start = ++*end; 230 231 switch (**end) { 232 case '(': 233 *start = ++*end; 234 *sz = 2; 235 break; 236 case '[': 237 *start = ++*end; 238 term = ']'; 239 break; 240 case '\'': 241 *start = ++*end; 242 term = '\''; 243 break; 244 case '3': 245 case '2': 246 case '1': 247 *sz = (*end)[-1] == 's' && 248 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 249 break; 250 default: 251 *sz = 1; 252 break; 253 } 254 255 break; 256 257 /* 258 * Anything else is assumed to be a glyph. 259 * In this case, pass back the character after the backslash. 260 */ 261 default: 262 gly = ESCAPE_SPECIAL; 263 *start = --*end; 264 *sz = 1; 265 break; 266 } 267 268 assert(ESCAPE_ERROR != gly); 269 270 /* 271 * Read up to the terminating character, 272 * paying attention to nested escapes. 273 */ 274 275 if ('\0' != term) { 276 while (**end != term) { 277 switch (**end) { 278 case '\0': 279 return ESCAPE_ERROR; 280 case '\\': 281 (*end)++; 282 if (ESCAPE_ERROR == 283 mandoc_escape(end, NULL, NULL)) 284 return ESCAPE_ERROR; 285 break; 286 default: 287 (*end)++; 288 break; 289 } 290 } 291 *sz = (*end)++ - *start; 292 } else { 293 assert(*sz > 0); 294 if ((size_t)*sz > strlen(*start)) 295 return ESCAPE_ERROR; 296 *end += *sz; 297 } 298 299 /* Run post-processors. */ 300 301 switch (gly) { 302 case ESCAPE_FONT: 303 if (*sz == 2) { 304 if (**start == 'C') { 305 /* 306 * Treat constant-width font modes 307 * just like regular font modes. 308 */ 309 (*start)++; 310 (*sz)--; 311 } else { 312 if ((*start)[0] == 'B' && (*start)[1] == 'I') 313 gly = ESCAPE_FONTBI; 314 break; 315 } 316 } else if (*sz != 1) { 317 if (*sz == 0) 318 gly = ESCAPE_FONTPREV; 319 break; 320 } 321 322 switch (**start) { 323 case '3': 324 case 'B': 325 gly = ESCAPE_FONTBOLD; 326 break; 327 case '2': 328 case 'I': 329 gly = ESCAPE_FONTITALIC; 330 break; 331 case 'P': 332 gly = ESCAPE_FONTPREV; 333 break; 334 case '1': 335 case 'R': 336 gly = ESCAPE_FONTROMAN; 337 break; 338 } 339 break; 340 case ESCAPE_SPECIAL: 341 if (**start == 'c') { 342 if (*sz == 1) { 343 gly = ESCAPE_NOSPACE; 344 break; 345 } 346 if (*sz < 6 || *sz > 7 || 347 strncmp(*start, "char", 4) != 0 || 348 (int)strspn(*start + 4, "0123456789") + 4 < *sz) 349 break; 350 c = 0; 351 for (i = 4; i < *sz; i++) 352 c = 10 * c + ((*start)[i] - '0'); 353 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) 354 break; 355 *start += 4; 356 *sz -= 4; 357 gly = ESCAPE_NUMBERED; 358 break; 359 } 360 361 /* 362 * Unicode escapes are defined in groff as \[u0000] 363 * to \[u10FFFF], where the contained value must be 364 * a valid Unicode codepoint. Here, however, only 365 * check the length and range. 366 */ 367 if (**start != 'u' || *sz < 5 || *sz > 7) 368 break; 369 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 370 break; 371 if (*sz == 6 && (*start)[1] == '0') 372 break; 373 if (*sz == 5 && (*start)[1] == 'D' && 374 strchr("89ABCDEF", (*start)[2]) != NULL) 375 break; 376 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 377 + 1 == *sz) 378 gly = ESCAPE_UNICODE; 379 break; 380 default: 381 break; 382 } 383 384 return gly; 385 } 386 387 /* 388 * Parse a quoted or unquoted roff-style request or macro argument. 389 * Return a pointer to the parsed argument, which is either the original 390 * pointer or advanced by one byte in case the argument is quoted. 391 * NUL-terminate the argument in place. 392 * Collapse pairs of quotes inside quoted arguments. 393 * Advance the argument pointer to the next argument, 394 * or to the NUL byte terminating the argument line. 395 */ 396 char * 397 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 398 { 399 char *start, *cp; 400 int quoted, pairs, white; 401 402 /* Quoting can only start with a new word. */ 403 start = *cpp; 404 quoted = 0; 405 if ('"' == *start) { 406 quoted = 1; 407 start++; 408 } 409 410 pairs = 0; 411 white = 0; 412 for (cp = start; '\0' != *cp; cp++) { 413 414 /* 415 * Move the following text left 416 * after quoted quotes and after "\\" and "\t". 417 */ 418 if (pairs) 419 cp[-pairs] = cp[0]; 420 421 if ('\\' == cp[0]) { 422 /* 423 * In copy mode, translate double to single 424 * backslashes and backslash-t to literal tabs. 425 */ 426 switch (cp[1]) { 427 case 't': 428 cp[0] = '\t'; 429 /* FALLTHROUGH */ 430 case '\\': 431 pairs++; 432 cp++; 433 break; 434 case ' ': 435 /* Skip escaped blanks. */ 436 if (0 == quoted) 437 cp++; 438 break; 439 default: 440 break; 441 } 442 } else if (0 == quoted) { 443 if (' ' == cp[0]) { 444 /* Unescaped blanks end unquoted args. */ 445 white = 1; 446 break; 447 } 448 } else if ('"' == cp[0]) { 449 if ('"' == cp[1]) { 450 /* Quoted quotes collapse. */ 451 pairs++; 452 cp++; 453 } else { 454 /* Unquoted quotes end quoted args. */ 455 quoted = 2; 456 break; 457 } 458 } 459 } 460 461 /* Quoted argument without a closing quote. */ 462 if (1 == quoted) 463 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL); 464 465 /* NUL-terminate this argument and move to the next one. */ 466 if (pairs) 467 cp[-pairs] = '\0'; 468 if ('\0' != *cp) { 469 *cp++ = '\0'; 470 while (' ' == *cp) 471 cp++; 472 } 473 *pos += (int)(cp - start) + (quoted ? 1 : 0); 474 *cpp = cp; 475 476 if ('\0' == *cp && (white || ' ' == cp[-1])) 477 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL); 478 479 return start; 480 } 481 482 static int 483 a2time(time_t *t, const char *fmt, const char *p) 484 { 485 struct tm tm; 486 char *pp; 487 488 memset(&tm, 0, sizeof(struct tm)); 489 490 pp = strptime(p, fmt, &tm); 491 if (NULL != pp && '\0' == *pp) { 492 *t = mktime(&tm); 493 return 1; 494 } 495 496 return 0; 497 } 498 499 static char * 500 time2a(time_t t) 501 { 502 struct tm *tm; 503 char *buf, *p; 504 size_t ssz; 505 int isz; 506 507 tm = localtime(&t); 508 if (tm == NULL) 509 return NULL; 510 511 /* 512 * Reserve space: 513 * up to 9 characters for the month (September) + blank 514 * up to 2 characters for the day + comma + blank 515 * 4 characters for the year and a terminating '\0' 516 */ 517 518 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 519 520 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 521 goto fail; 522 p += (int)ssz; 523 524 /* 525 * The output format is just "%d" here, not "%2d" or "%02d". 526 * That's also the reason why we can't just format the 527 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 528 * Besides, the present approach is less prone to buffer 529 * overflows, in case anybody should ever introduce the bug 530 * of looking at LC_TIME. 531 */ 532 533 if ((isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)) == -1) 534 goto fail; 535 p += isz; 536 537 if (strftime(p, 4 + 1, "%Y", tm) == 0) 538 goto fail; 539 return buf; 540 541 fail: 542 free(buf); 543 return NULL; 544 } 545 546 char * 547 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos) 548 { 549 char *cp; 550 time_t t; 551 552 /* No date specified: use today's date. */ 553 554 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) { 555 mandoc_msg(MANDOCERR_DATE_MISSING, man->parse, ln, pos, NULL); 556 return time2a(time(NULL)); 557 } 558 559 /* Valid mdoc(7) date format. */ 560 561 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) || 562 a2time(&t, "%b %d, %Y", in)) { 563 cp = time2a(t); 564 if (t > time(NULL) + 86400) 565 mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, 566 ln, pos, cp); 567 else if (*in != '$' && strcmp(in, cp) != 0) 568 mandoc_msg(MANDOCERR_DATE_NORM, man->parse, 569 ln, pos, cp); 570 return cp; 571 } 572 573 /* In man(7), do not warn about the legacy format. */ 574 575 if (a2time(&t, "%Y-%m-%d", in) == 0) 576 mandoc_msg(MANDOCERR_DATE_BAD, man->parse, ln, pos, in); 577 else if (t > time(NULL) + 86400) 578 mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, ln, pos, in); 579 else if (man->macroset == MACROSET_MDOC) 580 mandoc_vmsg(MANDOCERR_DATE_LEGACY, man->parse, 581 ln, pos, "Dd %s", in); 582 583 /* Use any non-mdoc(7) date verbatim. */ 584 585 return mandoc_strdup(in); 586 } 587 588 int 589 mandoc_eos(const char *p, size_t sz) 590 { 591 const char *q; 592 int enclosed, found; 593 594 if (0 == sz) 595 return 0; 596 597 /* 598 * End-of-sentence recognition must include situations where 599 * some symbols, such as `)', allow prior EOS punctuation to 600 * propagate outward. 601 */ 602 603 enclosed = found = 0; 604 for (q = p + (int)sz - 1; q >= p; q--) { 605 switch (*q) { 606 case '\"': 607 case '\'': 608 case ']': 609 case ')': 610 if (0 == found) 611 enclosed = 1; 612 break; 613 case '.': 614 case '!': 615 case '?': 616 found = 1; 617 break; 618 default: 619 return found && 620 (!enclosed || isalnum((unsigned char)*q)); 621 } 622 } 623 624 return found && !enclosed; 625 } 626 627 /* 628 * Convert a string to a long that may not be <0. 629 * If the string is invalid, or is less than 0, return -1. 630 */ 631 int 632 mandoc_strntoi(const char *p, size_t sz, int base) 633 { 634 char buf[32]; 635 char *ep; 636 long v; 637 638 if (sz > 31) 639 return -1; 640 641 memcpy(buf, p, sz); 642 buf[(int)sz] = '\0'; 643 644 errno = 0; 645 v = strtol(buf, &ep, base); 646 647 if (buf[0] == '\0' || *ep != '\0') 648 return -1; 649 650 if (v > INT_MAX) 651 v = INT_MAX; 652 if (v < INT_MIN) 653 v = INT_MIN; 654 655 return (int)v; 656 } 657