1 /* $OpenBSD: mandoc.c,v 1.76 2018/10/25 01:21:30 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc_aux.h" 30 #include "mandoc.h" 31 #include "roff.h" 32 #include "libmandoc.h" 33 34 static int a2time(time_t *, const char *, const char *); 35 static char *time2a(time_t); 36 37 38 enum mandoc_esc 39 mandoc_escape(const char **end, const char **start, int *sz) 40 { 41 const char *local_start; 42 int local_sz, c, i; 43 char term; 44 enum mandoc_esc gly; 45 46 /* 47 * When the caller doesn't provide return storage, 48 * use local storage. 49 */ 50 51 if (NULL == start) 52 start = &local_start; 53 if (NULL == sz) 54 sz = &local_sz; 55 56 /* 57 * Beyond the backslash, at least one input character 58 * is part of the escape sequence. With one exception 59 * (see below), that character won't be returned. 60 */ 61 62 gly = ESCAPE_ERROR; 63 *start = ++*end; 64 *sz = 0; 65 term = '\0'; 66 67 switch ((*start)[-1]) { 68 /* 69 * First the glyphs. There are several different forms of 70 * these, but each eventually returns a substring of the glyph 71 * name. 72 */ 73 case '(': 74 gly = ESCAPE_SPECIAL; 75 *sz = 2; 76 break; 77 case '[': 78 gly = ESCAPE_SPECIAL; 79 term = ']'; 80 break; 81 case 'C': 82 if ('\'' != **start) 83 return ESCAPE_ERROR; 84 *start = ++*end; 85 gly = ESCAPE_SPECIAL; 86 term = '\''; 87 break; 88 89 /* 90 * Escapes taking no arguments at all. 91 */ 92 case 'd': 93 case 'u': 94 case ',': 95 case '/': 96 return ESCAPE_IGNORE; 97 case 'p': 98 return ESCAPE_BREAK; 99 100 /* 101 * The \z escape is supposed to output the following 102 * character without advancing the cursor position. 103 * Since we are mostly dealing with terminal mode, 104 * let us just skip the next character. 105 */ 106 case 'z': 107 return ESCAPE_SKIPCHAR; 108 109 /* 110 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 111 * 'X' is the trigger. These have opaque sub-strings. 112 */ 113 case 'F': 114 case 'g': 115 case 'k': 116 case 'M': 117 case 'm': 118 case 'n': 119 case 'V': 120 case 'Y': 121 gly = ESCAPE_IGNORE; 122 /* FALLTHROUGH */ 123 case 'f': 124 if (ESCAPE_ERROR == gly) 125 gly = ESCAPE_FONT; 126 switch (**start) { 127 case '(': 128 *start = ++*end; 129 *sz = 2; 130 break; 131 case '[': 132 *start = ++*end; 133 term = ']'; 134 break; 135 default: 136 *sz = 1; 137 break; 138 } 139 break; 140 case '*': 141 if (strncmp(*start, "(.T", 3) != 0) 142 abort(); 143 gly = ESCAPE_DEVICE; 144 *start = ++*end; 145 *sz = 2; 146 break; 147 148 /* 149 * These escapes are of the form \X'Y', where 'X' is the trigger 150 * and 'Y' is any string. These have opaque sub-strings. 151 * The \B and \w escapes are handled in roff.c, roff_res(). 152 */ 153 case 'A': 154 case 'b': 155 case 'D': 156 case 'R': 157 case 'X': 158 case 'Z': 159 gly = ESCAPE_IGNORE; 160 /* FALLTHROUGH */ 161 case 'o': 162 if (**start == '\0') 163 return ESCAPE_ERROR; 164 if (gly == ESCAPE_ERROR) 165 gly = ESCAPE_OVERSTRIKE; 166 term = **start; 167 *start = ++*end; 168 break; 169 170 /* 171 * These escapes are of the form \X'N', where 'X' is the trigger 172 * and 'N' resolves to a numerical expression. 173 */ 174 case 'h': 175 case 'H': 176 case 'L': 177 case 'l': 178 case 'S': 179 case 'v': 180 case 'x': 181 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 182 if ('\0' != **start) 183 ++*end; 184 return ESCAPE_ERROR; 185 } 186 switch ((*start)[-1]) { 187 case 'h': 188 gly = ESCAPE_HORIZ; 189 break; 190 case 'l': 191 gly = ESCAPE_HLINE; 192 break; 193 default: 194 gly = ESCAPE_IGNORE; 195 break; 196 } 197 term = **start; 198 *start = ++*end; 199 break; 200 201 /* 202 * Special handling for the numbered character escape. 203 * XXX Do any other escapes need similar handling? 204 */ 205 case 'N': 206 if ('\0' == **start) 207 return ESCAPE_ERROR; 208 (*end)++; 209 if (isdigit((unsigned char)**start)) { 210 *sz = 1; 211 return ESCAPE_IGNORE; 212 } 213 (*start)++; 214 while (isdigit((unsigned char)**end)) 215 (*end)++; 216 *sz = *end - *start; 217 if ('\0' != **end) 218 (*end)++; 219 return ESCAPE_NUMBERED; 220 221 /* 222 * Sizes get a special category of their own. 223 */ 224 case 's': 225 gly = ESCAPE_IGNORE; 226 227 /* See +/- counts as a sign. */ 228 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 229 *start = ++*end; 230 231 switch (**end) { 232 case '(': 233 *start = ++*end; 234 *sz = 2; 235 break; 236 case '[': 237 *start = ++*end; 238 term = ']'; 239 break; 240 case '\'': 241 *start = ++*end; 242 term = '\''; 243 break; 244 case '3': 245 case '2': 246 case '1': 247 *sz = (*end)[-1] == 's' && 248 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 249 break; 250 default: 251 *sz = 1; 252 break; 253 } 254 255 break; 256 257 /* 258 * Anything else is assumed to be a glyph. 259 * In this case, pass back the character after the backslash. 260 */ 261 default: 262 gly = ESCAPE_SPECIAL; 263 *start = --*end; 264 *sz = 1; 265 break; 266 } 267 268 assert(ESCAPE_ERROR != gly); 269 270 /* 271 * Read up to the terminating character, 272 * paying attention to nested escapes. 273 */ 274 275 if ('\0' != term) { 276 while (**end != term) { 277 switch (**end) { 278 case '\0': 279 return ESCAPE_ERROR; 280 case '\\': 281 (*end)++; 282 if (ESCAPE_ERROR == 283 mandoc_escape(end, NULL, NULL)) 284 return ESCAPE_ERROR; 285 break; 286 default: 287 (*end)++; 288 break; 289 } 290 } 291 *sz = (*end)++ - *start; 292 } else { 293 assert(*sz > 0); 294 if ((size_t)*sz > strlen(*start)) 295 return ESCAPE_ERROR; 296 *end += *sz; 297 } 298 299 /* Run post-processors. */ 300 301 switch (gly) { 302 case ESCAPE_FONT: 303 if (*sz == 2) { 304 if (**start == 'C') { 305 if ((*start)[1] == 'W' || 306 (*start)[1] == 'R') { 307 gly = ESCAPE_FONTCW; 308 break; 309 } 310 /* 311 * Treat other constant-width font modes 312 * just like regular font modes. 313 */ 314 (*start)++; 315 (*sz)--; 316 } else { 317 if ((*start)[0] == 'B' && (*start)[1] == 'I') 318 gly = ESCAPE_FONTBI; 319 break; 320 } 321 } else if (*sz != 1) { 322 if (*sz == 0) 323 gly = ESCAPE_FONTPREV; 324 break; 325 } 326 327 switch (**start) { 328 case '3': 329 case 'B': 330 gly = ESCAPE_FONTBOLD; 331 break; 332 case '2': 333 case 'I': 334 gly = ESCAPE_FONTITALIC; 335 break; 336 case 'P': 337 gly = ESCAPE_FONTPREV; 338 break; 339 case '1': 340 case 'R': 341 gly = ESCAPE_FONTROMAN; 342 break; 343 } 344 break; 345 case ESCAPE_SPECIAL: 346 if (**start == 'c') { 347 if (*sz == 1) { 348 gly = ESCAPE_NOSPACE; 349 break; 350 } 351 if (*sz < 6 || *sz > 7 || 352 strncmp(*start, "char", 4) != 0 || 353 (int)strspn(*start + 4, "0123456789") + 4 < *sz) 354 break; 355 c = 0; 356 for (i = 4; i < *sz; i++) 357 c = 10 * c + ((*start)[i] - '0'); 358 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) 359 break; 360 *start += 4; 361 *sz -= 4; 362 gly = ESCAPE_NUMBERED; 363 break; 364 } 365 366 /* 367 * Unicode escapes are defined in groff as \[u0000] 368 * to \[u10FFFF], where the contained value must be 369 * a valid Unicode codepoint. Here, however, only 370 * check the length and range. 371 */ 372 if (**start != 'u' || *sz < 5 || *sz > 7) 373 break; 374 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 375 break; 376 if (*sz == 6 && (*start)[1] == '0') 377 break; 378 if (*sz == 5 && (*start)[1] == 'D' && 379 strchr("89ABCDEF", (*start)[2]) != NULL) 380 break; 381 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 382 + 1 == *sz) 383 gly = ESCAPE_UNICODE; 384 break; 385 default: 386 break; 387 } 388 389 return gly; 390 } 391 392 /* 393 * Parse a quoted or unquoted roff-style request or macro argument. 394 * Return a pointer to the parsed argument, which is either the original 395 * pointer or advanced by one byte in case the argument is quoted. 396 * NUL-terminate the argument in place. 397 * Collapse pairs of quotes inside quoted arguments. 398 * Advance the argument pointer to the next argument, 399 * or to the NUL byte terminating the argument line. 400 */ 401 char * 402 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 403 { 404 char *start, *cp; 405 int quoted, pairs, white; 406 407 /* Quoting can only start with a new word. */ 408 start = *cpp; 409 quoted = 0; 410 if ('"' == *start) { 411 quoted = 1; 412 start++; 413 } 414 415 pairs = 0; 416 white = 0; 417 for (cp = start; '\0' != *cp; cp++) { 418 419 /* 420 * Move the following text left 421 * after quoted quotes and after "\\" and "\t". 422 */ 423 if (pairs) 424 cp[-pairs] = cp[0]; 425 426 if ('\\' == cp[0]) { 427 /* 428 * In copy mode, translate double to single 429 * backslashes and backslash-t to literal tabs. 430 */ 431 switch (cp[1]) { 432 case 't': 433 cp[0] = '\t'; 434 /* FALLTHROUGH */ 435 case '\\': 436 pairs++; 437 cp++; 438 break; 439 case ' ': 440 /* Skip escaped blanks. */ 441 if (0 == quoted) 442 cp++; 443 break; 444 default: 445 break; 446 } 447 } else if (0 == quoted) { 448 if (' ' == cp[0]) { 449 /* Unescaped blanks end unquoted args. */ 450 white = 1; 451 break; 452 } 453 } else if ('"' == cp[0]) { 454 if ('"' == cp[1]) { 455 /* Quoted quotes collapse. */ 456 pairs++; 457 cp++; 458 } else { 459 /* Unquoted quotes end quoted args. */ 460 quoted = 2; 461 break; 462 } 463 } 464 } 465 466 /* Quoted argument without a closing quote. */ 467 if (1 == quoted) 468 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL); 469 470 /* NUL-terminate this argument and move to the next one. */ 471 if (pairs) 472 cp[-pairs] = '\0'; 473 if ('\0' != *cp) { 474 *cp++ = '\0'; 475 while (' ' == *cp) 476 cp++; 477 } 478 *pos += (int)(cp - start) + (quoted ? 1 : 0); 479 *cpp = cp; 480 481 if ('\0' == *cp && (white || ' ' == cp[-1])) 482 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL); 483 484 return start; 485 } 486 487 static int 488 a2time(time_t *t, const char *fmt, const char *p) 489 { 490 struct tm tm; 491 char *pp; 492 493 memset(&tm, 0, sizeof(struct tm)); 494 495 pp = strptime(p, fmt, &tm); 496 if (NULL != pp && '\0' == *pp) { 497 *t = mktime(&tm); 498 return 1; 499 } 500 501 return 0; 502 } 503 504 static char * 505 time2a(time_t t) 506 { 507 struct tm *tm; 508 char *buf, *p; 509 size_t ssz; 510 int isz; 511 512 tm = localtime(&t); 513 if (tm == NULL) 514 return NULL; 515 516 /* 517 * Reserve space: 518 * up to 9 characters for the month (September) + blank 519 * up to 2 characters for the day + comma + blank 520 * 4 characters for the year and a terminating '\0' 521 */ 522 523 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 524 525 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 526 goto fail; 527 p += (int)ssz; 528 529 /* 530 * The output format is just "%d" here, not "%2d" or "%02d". 531 * That's also the reason why we can't just format the 532 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 533 * Besides, the present approach is less prone to buffer 534 * overflows, in case anybody should ever introduce the bug 535 * of looking at LC_TIME. 536 */ 537 538 if ((isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)) == -1) 539 goto fail; 540 p += isz; 541 542 if (strftime(p, 4 + 1, "%Y", tm) == 0) 543 goto fail; 544 return buf; 545 546 fail: 547 free(buf); 548 return NULL; 549 } 550 551 char * 552 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos) 553 { 554 char *cp; 555 time_t t; 556 557 /* No date specified: use today's date. */ 558 559 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) { 560 mandoc_msg(MANDOCERR_DATE_MISSING, man->parse, ln, pos, NULL); 561 return time2a(time(NULL)); 562 } 563 564 /* Valid mdoc(7) date format. */ 565 566 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) || 567 a2time(&t, "%b %d, %Y", in)) { 568 cp = time2a(t); 569 if (t > time(NULL) + 86400) 570 mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, 571 ln, pos, cp); 572 else if (*in != '$' && strcmp(in, cp) != 0) 573 mandoc_msg(MANDOCERR_DATE_NORM, man->parse, 574 ln, pos, cp); 575 return cp; 576 } 577 578 /* In man(7), do not warn about the legacy format. */ 579 580 if (a2time(&t, "%Y-%m-%d", in) == 0) 581 mandoc_msg(MANDOCERR_DATE_BAD, man->parse, ln, pos, in); 582 else if (t > time(NULL) + 86400) 583 mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, ln, pos, in); 584 else if (man->macroset == MACROSET_MDOC) 585 mandoc_vmsg(MANDOCERR_DATE_LEGACY, man->parse, 586 ln, pos, "Dd %s", in); 587 588 /* Use any non-mdoc(7) date verbatim. */ 589 590 return mandoc_strdup(in); 591 } 592 593 int 594 mandoc_eos(const char *p, size_t sz) 595 { 596 const char *q; 597 int enclosed, found; 598 599 if (0 == sz) 600 return 0; 601 602 /* 603 * End-of-sentence recognition must include situations where 604 * some symbols, such as `)', allow prior EOS punctuation to 605 * propagate outward. 606 */ 607 608 enclosed = found = 0; 609 for (q = p + (int)sz - 1; q >= p; q--) { 610 switch (*q) { 611 case '\"': 612 case '\'': 613 case ']': 614 case ')': 615 if (0 == found) 616 enclosed = 1; 617 break; 618 case '.': 619 case '!': 620 case '?': 621 found = 1; 622 break; 623 default: 624 return found && 625 (!enclosed || isalnum((unsigned char)*q)); 626 } 627 } 628 629 return found && !enclosed; 630 } 631 632 /* 633 * Convert a string to a long that may not be <0. 634 * If the string is invalid, or is less than 0, return -1. 635 */ 636 int 637 mandoc_strntoi(const char *p, size_t sz, int base) 638 { 639 char buf[32]; 640 char *ep; 641 long v; 642 643 if (sz > 31) 644 return -1; 645 646 memcpy(buf, p, sz); 647 buf[(int)sz] = '\0'; 648 649 errno = 0; 650 v = strtol(buf, &ep, base); 651 652 if (buf[0] == '\0' || *ep != '\0') 653 return -1; 654 655 if (v > INT_MAX) 656 v = INT_MAX; 657 if (v < INT_MIN) 658 v = INT_MIN; 659 660 return (int)v; 661 } 662