1 /* $OpenBSD: mandoc.c,v 1.83 2019/05/21 08:03:43 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc_aux.h" 30 #include "mandoc.h" 31 #include "roff.h" 32 #include "libmandoc.h" 33 #include "roff_int.h" 34 35 static int a2time(time_t *, const char *, const char *); 36 static char *time2a(time_t); 37 38 39 enum mandoc_esc 40 mandoc_font(const char *cp, int sz) 41 { 42 switch (sz) { 43 case 0: 44 return ESCAPE_FONTPREV; 45 case 1: 46 switch (cp[0]) { 47 case 'B': 48 case '3': 49 return ESCAPE_FONTBOLD; 50 case 'I': 51 case '2': 52 return ESCAPE_FONTITALIC; 53 case 'P': 54 return ESCAPE_FONTPREV; 55 case 'R': 56 case '1': 57 return ESCAPE_FONTROMAN; 58 case '4': 59 return ESCAPE_FONTBI; 60 default: 61 return ESCAPE_ERROR; 62 } 63 case 2: 64 switch (cp[0]) { 65 case 'B': 66 switch (cp[1]) { 67 case 'I': 68 return ESCAPE_FONTBI; 69 default: 70 return ESCAPE_ERROR; 71 } 72 case 'C': 73 switch (cp[1]) { 74 case 'B': 75 return ESCAPE_FONTBOLD; 76 case 'I': 77 return ESCAPE_FONTITALIC; 78 case 'R': 79 case 'W': 80 return ESCAPE_FONTCW; 81 default: 82 return ESCAPE_ERROR; 83 } 84 default: 85 return ESCAPE_ERROR; 86 } 87 default: 88 return ESCAPE_ERROR; 89 } 90 } 91 92 enum mandoc_esc 93 mandoc_escape(const char **end, const char **start, int *sz) 94 { 95 const char *local_start; 96 int local_sz, c, i; 97 char term; 98 enum mandoc_esc gly; 99 100 /* 101 * When the caller doesn't provide return storage, 102 * use local storage. 103 */ 104 105 if (NULL == start) 106 start = &local_start; 107 if (NULL == sz) 108 sz = &local_sz; 109 110 /* 111 * Treat "\E" just like "\"; 112 * it only makes a difference in copy mode. 113 */ 114 115 if (**end == 'E') 116 ++*end; 117 118 /* 119 * Beyond the backslash, at least one input character 120 * is part of the escape sequence. With one exception 121 * (see below), that character won't be returned. 122 */ 123 124 gly = ESCAPE_ERROR; 125 *start = ++*end; 126 *sz = 0; 127 term = '\0'; 128 129 switch ((*start)[-1]) { 130 /* 131 * First the glyphs. There are several different forms of 132 * these, but each eventually returns a substring of the glyph 133 * name. 134 */ 135 case '(': 136 gly = ESCAPE_SPECIAL; 137 *sz = 2; 138 break; 139 case '[': 140 if (**start == ' ') { 141 ++*end; 142 return ESCAPE_ERROR; 143 } 144 gly = ESCAPE_SPECIAL; 145 term = ']'; 146 break; 147 case 'C': 148 if ('\'' != **start) 149 return ESCAPE_ERROR; 150 *start = ++*end; 151 gly = ESCAPE_SPECIAL; 152 term = '\''; 153 break; 154 155 /* 156 * Escapes taking no arguments at all. 157 */ 158 case '!': 159 case '?': 160 return ESCAPE_UNSUPP; 161 case '%': 162 case '&': 163 case ')': 164 case ',': 165 case '/': 166 case '^': 167 case 'a': 168 case 'd': 169 case 'r': 170 case 't': 171 case 'u': 172 case '{': 173 case '|': 174 case '}': 175 return ESCAPE_IGNORE; 176 case 'c': 177 return ESCAPE_NOSPACE; 178 case 'p': 179 return ESCAPE_BREAK; 180 181 /* 182 * The \z escape is supposed to output the following 183 * character without advancing the cursor position. 184 * Since we are mostly dealing with terminal mode, 185 * let us just skip the next character. 186 */ 187 case 'z': 188 return ESCAPE_SKIPCHAR; 189 190 /* 191 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 192 * 'X' is the trigger. These have opaque sub-strings. 193 */ 194 case 'F': 195 case 'f': 196 case 'g': 197 case 'k': 198 case 'M': 199 case 'm': 200 case 'n': 201 case 'O': 202 case 'V': 203 case 'Y': 204 gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE; 205 switch (**start) { 206 case '(': 207 if ((*start)[-1] == 'O') 208 gly = ESCAPE_ERROR; 209 *start = ++*end; 210 *sz = 2; 211 break; 212 case '[': 213 if ((*start)[-1] == 'O') 214 gly = (*start)[1] == '5' ? 215 ESCAPE_UNSUPP : ESCAPE_ERROR; 216 *start = ++*end; 217 term = ']'; 218 break; 219 default: 220 if ((*start)[-1] == 'O') { 221 switch (**start) { 222 case '0': 223 gly = ESCAPE_UNSUPP; 224 break; 225 case '1': 226 case '2': 227 case '3': 228 case '4': 229 break; 230 default: 231 gly = ESCAPE_ERROR; 232 break; 233 } 234 } 235 *sz = 1; 236 break; 237 } 238 break; 239 case '*': 240 if (strncmp(*start, "(.T", 3) != 0) 241 abort(); 242 gly = ESCAPE_DEVICE; 243 *start = ++*end; 244 *sz = 2; 245 break; 246 247 /* 248 * These escapes are of the form \X'Y', where 'X' is the trigger 249 * and 'Y' is any string. These have opaque sub-strings. 250 * The \B and \w escapes are handled in roff.c, roff_res(). 251 */ 252 case 'A': 253 case 'b': 254 case 'D': 255 case 'R': 256 case 'X': 257 case 'Z': 258 gly = ESCAPE_IGNORE; 259 /* FALLTHROUGH */ 260 case 'o': 261 if (**start == '\0') 262 return ESCAPE_ERROR; 263 if (gly == ESCAPE_ERROR) 264 gly = ESCAPE_OVERSTRIKE; 265 term = **start; 266 *start = ++*end; 267 break; 268 269 /* 270 * These escapes are of the form \X'N', where 'X' is the trigger 271 * and 'N' resolves to a numerical expression. 272 */ 273 case 'h': 274 case 'H': 275 case 'L': 276 case 'l': 277 case 'S': 278 case 'v': 279 case 'x': 280 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 281 if ('\0' != **start) 282 ++*end; 283 return ESCAPE_ERROR; 284 } 285 switch ((*start)[-1]) { 286 case 'h': 287 gly = ESCAPE_HORIZ; 288 break; 289 case 'l': 290 gly = ESCAPE_HLINE; 291 break; 292 default: 293 gly = ESCAPE_IGNORE; 294 break; 295 } 296 term = **start; 297 *start = ++*end; 298 break; 299 300 /* 301 * Special handling for the numbered character escape. 302 * XXX Do any other escapes need similar handling? 303 */ 304 case 'N': 305 if ('\0' == **start) 306 return ESCAPE_ERROR; 307 (*end)++; 308 if (isdigit((unsigned char)**start)) { 309 *sz = 1; 310 return ESCAPE_IGNORE; 311 } 312 (*start)++; 313 while (isdigit((unsigned char)**end)) 314 (*end)++; 315 *sz = *end - *start; 316 if ('\0' != **end) 317 (*end)++; 318 return ESCAPE_NUMBERED; 319 320 /* 321 * Sizes get a special category of their own. 322 */ 323 case 's': 324 gly = ESCAPE_IGNORE; 325 326 /* See +/- counts as a sign. */ 327 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 328 *start = ++*end; 329 330 switch (**end) { 331 case '(': 332 *start = ++*end; 333 *sz = 2; 334 break; 335 case '[': 336 *start = ++*end; 337 term = ']'; 338 break; 339 case '\'': 340 *start = ++*end; 341 term = '\''; 342 break; 343 case '3': 344 case '2': 345 case '1': 346 *sz = (*end)[-1] == 's' && 347 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 348 break; 349 default: 350 *sz = 1; 351 break; 352 } 353 354 break; 355 356 /* 357 * Several special characters can be encoded as 358 * one-byte escape sequences without using \[]. 359 */ 360 case ' ': 361 case '\'': 362 case '-': 363 case '.': 364 case '0': 365 case ':': 366 case '_': 367 case '`': 368 case 'e': 369 case '~': 370 gly = ESCAPE_SPECIAL; 371 /* FALLTHROUGH */ 372 default: 373 if (gly == ESCAPE_ERROR) 374 gly = ESCAPE_UNDEF; 375 *start = --*end; 376 *sz = 1; 377 break; 378 } 379 380 /* 381 * Read up to the terminating character, 382 * paying attention to nested escapes. 383 */ 384 385 if ('\0' != term) { 386 while (**end != term) { 387 switch (**end) { 388 case '\0': 389 return ESCAPE_ERROR; 390 case '\\': 391 (*end)++; 392 if (ESCAPE_ERROR == 393 mandoc_escape(end, NULL, NULL)) 394 return ESCAPE_ERROR; 395 break; 396 default: 397 (*end)++; 398 break; 399 } 400 } 401 *sz = (*end)++ - *start; 402 403 /* 404 * The file chars.c only provides one common list 405 * of character names, but \[-] == \- is the only 406 * one of the characters with one-byte names that 407 * allows enclosing the name in brackets. 408 */ 409 if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-') 410 return ESCAPE_ERROR; 411 } else { 412 assert(*sz > 0); 413 if ((size_t)*sz > strlen(*start)) 414 return ESCAPE_ERROR; 415 *end += *sz; 416 } 417 418 /* Run post-processors. */ 419 420 switch (gly) { 421 case ESCAPE_FONT: 422 gly = mandoc_font(*start, *sz); 423 break; 424 case ESCAPE_SPECIAL: 425 if (**start == 'c') { 426 if (*sz < 6 || *sz > 7 || 427 strncmp(*start, "char", 4) != 0 || 428 (int)strspn(*start + 4, "0123456789") + 4 < *sz) 429 break; 430 c = 0; 431 for (i = 4; i < *sz; i++) 432 c = 10 * c + ((*start)[i] - '0'); 433 if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) 434 break; 435 *start += 4; 436 *sz -= 4; 437 gly = ESCAPE_NUMBERED; 438 break; 439 } 440 441 /* 442 * Unicode escapes are defined in groff as \[u0000] 443 * to \[u10FFFF], where the contained value must be 444 * a valid Unicode codepoint. Here, however, only 445 * check the length and range. 446 */ 447 if (**start != 'u' || *sz < 5 || *sz > 7) 448 break; 449 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 450 break; 451 if (*sz == 6 && (*start)[1] == '0') 452 break; 453 if (*sz == 5 && (*start)[1] == 'D' && 454 strchr("89ABCDEF", (*start)[2]) != NULL) 455 break; 456 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 457 + 1 == *sz) 458 gly = ESCAPE_UNICODE; 459 break; 460 default: 461 break; 462 } 463 464 return gly; 465 } 466 467 static int 468 a2time(time_t *t, const char *fmt, const char *p) 469 { 470 struct tm tm; 471 char *pp; 472 473 memset(&tm, 0, sizeof(struct tm)); 474 475 pp = strptime(p, fmt, &tm); 476 if (NULL != pp && '\0' == *pp) { 477 *t = mktime(&tm); 478 return 1; 479 } 480 481 return 0; 482 } 483 484 static char * 485 time2a(time_t t) 486 { 487 struct tm *tm; 488 char *buf, *p; 489 size_t ssz; 490 int isz; 491 492 tm = localtime(&t); 493 if (tm == NULL) 494 return NULL; 495 496 /* 497 * Reserve space: 498 * up to 9 characters for the month (September) + blank 499 * up to 2 characters for the day + comma + blank 500 * 4 characters for the year and a terminating '\0' 501 */ 502 503 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 504 505 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 506 goto fail; 507 p += (int)ssz; 508 509 /* 510 * The output format is just "%d" here, not "%2d" or "%02d". 511 * That's also the reason why we can't just format the 512 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 513 * Besides, the present approach is less prone to buffer 514 * overflows, in case anybody should ever introduce the bug 515 * of looking at LC_TIME. 516 */ 517 518 if ((isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)) == -1) 519 goto fail; 520 p += isz; 521 522 if (strftime(p, 4 + 1, "%Y", tm) == 0) 523 goto fail; 524 return buf; 525 526 fail: 527 free(buf); 528 return NULL; 529 } 530 531 char * 532 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos) 533 { 534 char *cp; 535 time_t t; 536 537 /* No date specified: use today's date. */ 538 539 if (in == NULL || *in == '\0') 540 mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL); 541 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) 542 return time2a(time(NULL)); 543 544 /* Valid mdoc(7) date format. */ 545 546 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) || 547 a2time(&t, "%b %d, %Y", in)) { 548 cp = time2a(t); 549 if (t > time(NULL) + 86400) 550 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp); 551 else if (*in != '$' && strcmp(in, cp) != 0) 552 mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp); 553 return cp; 554 } 555 556 /* In man(7), do not warn about the legacy format. */ 557 558 if (a2time(&t, "%Y-%m-%d", in) == 0) 559 mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in); 560 else if (t > time(NULL) + 86400) 561 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in); 562 else if (man->meta.macroset == MACROSET_MDOC) 563 mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in); 564 565 /* Use any non-mdoc(7) date verbatim. */ 566 567 return mandoc_strdup(in); 568 } 569 570 int 571 mandoc_eos(const char *p, size_t sz) 572 { 573 const char *q; 574 int enclosed, found; 575 576 if (0 == sz) 577 return 0; 578 579 /* 580 * End-of-sentence recognition must include situations where 581 * some symbols, such as `)', allow prior EOS punctuation to 582 * propagate outward. 583 */ 584 585 enclosed = found = 0; 586 for (q = p + (int)sz - 1; q >= p; q--) { 587 switch (*q) { 588 case '\"': 589 case '\'': 590 case ']': 591 case ')': 592 if (0 == found) 593 enclosed = 1; 594 break; 595 case '.': 596 case '!': 597 case '?': 598 found = 1; 599 break; 600 default: 601 return found && 602 (!enclosed || isalnum((unsigned char)*q)); 603 } 604 } 605 606 return found && !enclosed; 607 } 608 609 /* 610 * Convert a string to a long that may not be <0. 611 * If the string is invalid, or is less than 0, return -1. 612 */ 613 int 614 mandoc_strntoi(const char *p, size_t sz, int base) 615 { 616 char buf[32]; 617 char *ep; 618 long v; 619 620 if (sz > 31) 621 return -1; 622 623 memcpy(buf, p, sz); 624 buf[(int)sz] = '\0'; 625 626 errno = 0; 627 v = strtol(buf, &ep, base); 628 629 if (buf[0] == '\0' || *ep != '\0') 630 return -1; 631 632 if (v > INT_MAX) 633 v = INT_MAX; 634 if (v < INT_MIN) 635 v = INT_MIN; 636 637 return (int)v; 638 } 639