1 /* $OpenBSD: mandoc.c,v 1.86 2020/10/24 22:52:34 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015, 2017-2020 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc_aux.h" 30 #include "mandoc.h" 31 #include "roff.h" 32 #include "libmandoc.h" 33 #include "roff_int.h" 34 35 static int a2time(time_t *, const char *, const char *); 36 static char *time2a(time_t); 37 38 39 enum mandoc_esc 40 mandoc_font(const char *cp, int sz) 41 { 42 switch (sz) { 43 case 0: 44 return ESCAPE_FONTPREV; 45 case 1: 46 switch (cp[0]) { 47 case 'B': 48 case '3': 49 return ESCAPE_FONTBOLD; 50 case 'I': 51 case '2': 52 return ESCAPE_FONTITALIC; 53 case 'P': 54 return ESCAPE_FONTPREV; 55 case 'R': 56 case '1': 57 return ESCAPE_FONTROMAN; 58 case '4': 59 return ESCAPE_FONTBI; 60 default: 61 return ESCAPE_ERROR; 62 } 63 case 2: 64 switch (cp[0]) { 65 case 'B': 66 switch (cp[1]) { 67 case 'I': 68 return ESCAPE_FONTBI; 69 default: 70 return ESCAPE_ERROR; 71 } 72 case 'C': 73 switch (cp[1]) { 74 case 'B': 75 return ESCAPE_FONTBOLD; 76 case 'I': 77 return ESCAPE_FONTITALIC; 78 case 'R': 79 case 'W': 80 return ESCAPE_FONTCW; 81 default: 82 return ESCAPE_ERROR; 83 } 84 default: 85 return ESCAPE_ERROR; 86 } 87 default: 88 return ESCAPE_ERROR; 89 } 90 } 91 92 enum mandoc_esc 93 mandoc_escape(const char **end, const char **start, int *sz) 94 { 95 const char *local_start; 96 int local_sz, c, i; 97 char term; 98 enum mandoc_esc gly; 99 100 /* 101 * When the caller doesn't provide return storage, 102 * use local storage. 103 */ 104 105 if (NULL == start) 106 start = &local_start; 107 if (NULL == sz) 108 sz = &local_sz; 109 110 /* 111 * Treat "\E" just like "\"; 112 * it only makes a difference in copy mode. 113 */ 114 115 if (**end == 'E') 116 ++*end; 117 118 /* 119 * Beyond the backslash, at least one input character 120 * is part of the escape sequence. With one exception 121 * (see below), that character won't be returned. 122 */ 123 124 gly = ESCAPE_ERROR; 125 *start = ++*end; 126 *sz = 0; 127 term = '\0'; 128 129 switch ((*start)[-1]) { 130 /* 131 * First the glyphs. There are several different forms of 132 * these, but each eventually returns a substring of the glyph 133 * name. 134 */ 135 case '(': 136 gly = ESCAPE_SPECIAL; 137 *sz = 2; 138 break; 139 case '[': 140 if (**start == ' ') { 141 ++*end; 142 return ESCAPE_ERROR; 143 } 144 gly = ESCAPE_SPECIAL; 145 term = ']'; 146 break; 147 case 'C': 148 if ('\'' != **start) 149 return ESCAPE_ERROR; 150 *start = ++*end; 151 gly = ESCAPE_SPECIAL; 152 term = '\''; 153 break; 154 155 /* 156 * Escapes taking no arguments at all. 157 */ 158 case '!': 159 case '?': 160 return ESCAPE_UNSUPP; 161 case '%': 162 case '&': 163 case ')': 164 case ',': 165 case '/': 166 case '^': 167 case 'a': 168 case 'd': 169 case 'r': 170 case 't': 171 case 'u': 172 case '{': 173 case '|': 174 case '}': 175 return ESCAPE_IGNORE; 176 case 'c': 177 return ESCAPE_NOSPACE; 178 case 'p': 179 return ESCAPE_BREAK; 180 181 /* 182 * The \z escape is supposed to output the following 183 * character without advancing the cursor position. 184 * Since we are mostly dealing with terminal mode, 185 * let us just skip the next character. 186 */ 187 case 'z': 188 return ESCAPE_SKIPCHAR; 189 190 /* 191 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 192 * 'X' is the trigger. These have opaque sub-strings. 193 */ 194 case 'F': 195 case 'f': 196 case 'g': 197 case 'k': 198 case 'M': 199 case 'm': 200 case 'n': 201 case 'O': 202 case 'V': 203 case 'Y': 204 case '*': 205 switch ((*start)[-1]) { 206 case 'f': 207 gly = ESCAPE_FONT; 208 break; 209 case '*': 210 gly = ESCAPE_DEVICE; 211 break; 212 default: 213 gly = ESCAPE_IGNORE; 214 break; 215 } 216 switch (**start) { 217 case '(': 218 if ((*start)[-1] == 'O') 219 gly = ESCAPE_ERROR; 220 *start = ++*end; 221 *sz = 2; 222 break; 223 case '[': 224 if ((*start)[-1] == 'O') 225 gly = (*start)[1] == '5' ? 226 ESCAPE_UNSUPP : ESCAPE_ERROR; 227 *start = ++*end; 228 term = ']'; 229 break; 230 default: 231 if ((*start)[-1] == 'O') { 232 switch (**start) { 233 case '0': 234 gly = ESCAPE_UNSUPP; 235 break; 236 case '1': 237 case '2': 238 case '3': 239 case '4': 240 break; 241 default: 242 gly = ESCAPE_ERROR; 243 break; 244 } 245 } 246 *sz = 1; 247 break; 248 } 249 break; 250 251 /* 252 * These escapes are of the form \X'Y', where 'X' is the trigger 253 * and 'Y' is any string. These have opaque sub-strings. 254 * The \B and \w escapes are handled in roff.c, roff_res(). 255 */ 256 case 'A': 257 case 'b': 258 case 'D': 259 case 'R': 260 case 'X': 261 case 'Z': 262 gly = ESCAPE_IGNORE; 263 /* FALLTHROUGH */ 264 case 'o': 265 if (**start == '\0') 266 return ESCAPE_ERROR; 267 if (gly == ESCAPE_ERROR) 268 gly = ESCAPE_OVERSTRIKE; 269 term = **start; 270 *start = ++*end; 271 break; 272 273 /* 274 * These escapes are of the form \X'N', where 'X' is the trigger 275 * and 'N' resolves to a numerical expression. 276 */ 277 case 'h': 278 case 'H': 279 case 'L': 280 case 'l': 281 case 'S': 282 case 'v': 283 case 'x': 284 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 285 if ('\0' != **start) 286 ++*end; 287 return ESCAPE_ERROR; 288 } 289 switch ((*start)[-1]) { 290 case 'h': 291 gly = ESCAPE_HORIZ; 292 break; 293 case 'l': 294 gly = ESCAPE_HLINE; 295 break; 296 default: 297 gly = ESCAPE_IGNORE; 298 break; 299 } 300 term = **start; 301 *start = ++*end; 302 break; 303 304 /* 305 * Special handling for the numbered character escape. 306 * XXX Do any other escapes need similar handling? 307 */ 308 case 'N': 309 if ('\0' == **start) 310 return ESCAPE_ERROR; 311 (*end)++; 312 if (isdigit((unsigned char)**start)) { 313 *sz = 1; 314 return ESCAPE_IGNORE; 315 } 316 (*start)++; 317 while (isdigit((unsigned char)**end)) 318 (*end)++; 319 *sz = *end - *start; 320 if ('\0' != **end) 321 (*end)++; 322 return ESCAPE_NUMBERED; 323 324 /* 325 * Sizes get a special category of their own. 326 */ 327 case 's': 328 gly = ESCAPE_IGNORE; 329 330 /* See +/- counts as a sign. */ 331 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 332 *start = ++*end; 333 334 switch (**end) { 335 case '(': 336 *start = ++*end; 337 *sz = 2; 338 break; 339 case '[': 340 *start = ++*end; 341 term = ']'; 342 break; 343 case '\'': 344 *start = ++*end; 345 term = '\''; 346 break; 347 case '3': 348 case '2': 349 case '1': 350 *sz = (*end)[-1] == 's' && 351 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 352 break; 353 default: 354 *sz = 1; 355 break; 356 } 357 358 break; 359 360 /* 361 * Several special characters can be encoded as 362 * one-byte escape sequences without using \[]. 363 */ 364 case ' ': 365 case '\'': 366 case '-': 367 case '.': 368 case '0': 369 case ':': 370 case '_': 371 case '`': 372 case 'e': 373 case '~': 374 gly = ESCAPE_SPECIAL; 375 /* FALLTHROUGH */ 376 default: 377 if (gly == ESCAPE_ERROR) 378 gly = ESCAPE_UNDEF; 379 *start = --*end; 380 *sz = 1; 381 break; 382 } 383 384 /* 385 * Read up to the terminating character, 386 * paying attention to nested escapes. 387 */ 388 389 if ('\0' != term) { 390 while (**end != term) { 391 switch (**end) { 392 case '\0': 393 return ESCAPE_ERROR; 394 case '\\': 395 (*end)++; 396 if (ESCAPE_ERROR == 397 mandoc_escape(end, NULL, NULL)) 398 return ESCAPE_ERROR; 399 break; 400 default: 401 (*end)++; 402 break; 403 } 404 } 405 *sz = (*end)++ - *start; 406 407 /* 408 * The file chars.c only provides one common list 409 * of character names, but \[-] == \- is the only 410 * one of the characters with one-byte names that 411 * allows enclosing the name in brackets. 412 */ 413 if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-') 414 return ESCAPE_ERROR; 415 } else { 416 assert(*sz > 0); 417 if ((size_t)*sz > strlen(*start)) 418 return ESCAPE_ERROR; 419 *end += *sz; 420 } 421 422 /* Run post-processors. */ 423 424 switch (gly) { 425 case ESCAPE_FONT: 426 gly = mandoc_font(*start, *sz); 427 break; 428 case ESCAPE_SPECIAL: 429 if (**start == 'c') { 430 if (*sz < 6 || *sz > 7 || 431 strncmp(*start, "char", 4) != 0 || 432 (int)strspn(*start + 4, "0123456789") + 4 < *sz) 433 break; 434 c = 0; 435 for (i = 4; i < *sz; i++) 436 c = 10 * c + ((*start)[i] - '0'); 437 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) 438 break; 439 *start += 4; 440 *sz -= 4; 441 gly = ESCAPE_NUMBERED; 442 break; 443 } 444 445 /* 446 * Unicode escapes are defined in groff as \[u0000] 447 * to \[u10FFFF], where the contained value must be 448 * a valid Unicode codepoint. Here, however, only 449 * check the length and range. 450 */ 451 if (**start != 'u' || *sz < 5 || *sz > 7) 452 break; 453 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 454 break; 455 if (*sz == 6 && (*start)[1] == '0') 456 break; 457 if (*sz == 5 && (*start)[1] == 'D' && 458 strchr("89ABCDEF", (*start)[2]) != NULL) 459 break; 460 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 461 + 1 == *sz) 462 gly = ESCAPE_UNICODE; 463 break; 464 case ESCAPE_DEVICE: 465 assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T'); 466 break; 467 default: 468 break; 469 } 470 471 return gly; 472 } 473 474 static int 475 a2time(time_t *t, const char *fmt, const char *p) 476 { 477 struct tm tm; 478 char *pp; 479 480 memset(&tm, 0, sizeof(struct tm)); 481 482 pp = strptime(p, fmt, &tm); 483 if (NULL != pp && '\0' == *pp) { 484 *t = mktime(&tm); 485 return 1; 486 } 487 488 return 0; 489 } 490 491 static char * 492 time2a(time_t t) 493 { 494 struct tm *tm; 495 char *buf, *p; 496 size_t ssz; 497 int isz; 498 499 buf = NULL; 500 tm = localtime(&t); 501 if (tm == NULL) 502 goto fail; 503 504 /* 505 * Reserve space: 506 * up to 9 characters for the month (September) + blank 507 * up to 2 characters for the day + comma + blank 508 * 4 characters for the year and a terminating '\0' 509 */ 510 511 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 512 513 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 514 goto fail; 515 p += (int)ssz; 516 517 /* 518 * The output format is just "%d" here, not "%2d" or "%02d". 519 * That's also the reason why we can't just format the 520 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 521 * Besides, the present approach is less prone to buffer 522 * overflows, in case anybody should ever introduce the bug 523 * of looking at LC_TIME. 524 */ 525 526 isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday); 527 if (isz < 0 || isz > 4) 528 goto fail; 529 p += isz; 530 531 if (strftime(p, 4 + 1, "%Y", tm) == 0) 532 goto fail; 533 return buf; 534 535 fail: 536 free(buf); 537 return mandoc_strdup(""); 538 } 539 540 char * 541 mandoc_normdate(struct roff_node *nch, struct roff_node *nbl) 542 { 543 char *cp; 544 time_t t; 545 546 /* No date specified. */ 547 548 if (nch == NULL) { 549 if (nbl == NULL) 550 mandoc_msg(MANDOCERR_DATE_MISSING, 0, 0, NULL); 551 else 552 mandoc_msg(MANDOCERR_DATE_MISSING, nbl->line, 553 nbl->pos, "%s", roff_name[nbl->tok]); 554 return mandoc_strdup(""); 555 } 556 if (*nch->string == '\0') { 557 mandoc_msg(MANDOCERR_DATE_MISSING, nch->line, 558 nch->pos, "%s", roff_name[nbl->tok]); 559 return mandoc_strdup(""); 560 } 561 if (strcmp(nch->string, "$" "Mdocdate$") == 0) 562 return time2a(time(NULL)); 563 564 /* Valid mdoc(7) date format. */ 565 566 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", nch->string) || 567 a2time(&t, "%b %d, %Y", nch->string)) { 568 cp = time2a(t); 569 if (t > time(NULL) + 86400) 570 mandoc_msg(MANDOCERR_DATE_FUTURE, nch->line, 571 nch->pos, "%s %s", roff_name[nbl->tok], cp); 572 else if (*nch->string != '$' && 573 strcmp(nch->string, cp) != 0) 574 mandoc_msg(MANDOCERR_DATE_NORM, nch->line, 575 nch->pos, "%s %s", roff_name[nbl->tok], cp); 576 return cp; 577 } 578 579 /* In man(7), do not warn about the legacy format. */ 580 581 if (a2time(&t, "%Y-%m-%d", nch->string) == 0) 582 mandoc_msg(MANDOCERR_DATE_BAD, nch->line, nch->pos, 583 "%s %s", roff_name[nbl->tok], nch->string); 584 else if (t > time(NULL) + 86400) 585 mandoc_msg(MANDOCERR_DATE_FUTURE, nch->line, nch->pos, 586 "%s %s", roff_name[nbl->tok], nch->string); 587 else if (nbl->tok == MDOC_Dd) 588 mandoc_msg(MANDOCERR_DATE_LEGACY, nch->line, nch->pos, 589 "Dd %s", nch->string); 590 591 /* Use any non-mdoc(7) date verbatim. */ 592 593 return mandoc_strdup(nch->string); 594 } 595 596 int 597 mandoc_eos(const char *p, size_t sz) 598 { 599 const char *q; 600 int enclosed, found; 601 602 if (0 == sz) 603 return 0; 604 605 /* 606 * End-of-sentence recognition must include situations where 607 * some symbols, such as `)', allow prior EOS punctuation to 608 * propagate outward. 609 */ 610 611 enclosed = found = 0; 612 for (q = p + (int)sz - 1; q >= p; q--) { 613 switch (*q) { 614 case '\"': 615 case '\'': 616 case ']': 617 case ')': 618 if (0 == found) 619 enclosed = 1; 620 break; 621 case '.': 622 case '!': 623 case '?': 624 found = 1; 625 break; 626 default: 627 return found && 628 (!enclosed || isalnum((unsigned char)*q)); 629 } 630 } 631 632 return found && !enclosed; 633 } 634 635 /* 636 * Convert a string to a long that may not be <0. 637 * If the string is invalid, or is less than 0, return -1. 638 */ 639 int 640 mandoc_strntoi(const char *p, size_t sz, int base) 641 { 642 char buf[32]; 643 char *ep; 644 long v; 645 646 if (sz > 31) 647 return -1; 648 649 memcpy(buf, p, sz); 650 buf[(int)sz] = '\0'; 651 652 errno = 0; 653 v = strtol(buf, &ep, base); 654 655 if (buf[0] == '\0' || *ep != '\0') 656 return -1; 657 658 if (v > INT_MAX) 659 v = INT_MAX; 660 if (v < INT_MIN) 661 v = INT_MIN; 662 663 return (int)v; 664 } 665