1 /* $OpenBSD: mandoc.c,v 1.71 2017/07/03 13:40:00 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015, 2017 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc_aux.h" 30 #include "mandoc.h" 31 #include "roff.h" 32 #include "libmandoc.h" 33 34 static int a2time(time_t *, const char *, const char *); 35 static char *time2a(time_t); 36 37 38 enum mandoc_esc 39 mandoc_escape(const char **end, const char **start, int *sz) 40 { 41 const char *local_start; 42 int local_sz; 43 char term; 44 enum mandoc_esc gly; 45 46 /* 47 * When the caller doesn't provide return storage, 48 * use local storage. 49 */ 50 51 if (NULL == start) 52 start = &local_start; 53 if (NULL == sz) 54 sz = &local_sz; 55 56 /* 57 * Beyond the backslash, at least one input character 58 * is part of the escape sequence. With one exception 59 * (see below), that character won't be returned. 60 */ 61 62 gly = ESCAPE_ERROR; 63 *start = ++*end; 64 *sz = 0; 65 term = '\0'; 66 67 switch ((*start)[-1]) { 68 /* 69 * First the glyphs. There are several different forms of 70 * these, but each eventually returns a substring of the glyph 71 * name. 72 */ 73 case '(': 74 gly = ESCAPE_SPECIAL; 75 *sz = 2; 76 break; 77 case '[': 78 gly = ESCAPE_SPECIAL; 79 term = ']'; 80 break; 81 case 'C': 82 if ('\'' != **start) 83 return ESCAPE_ERROR; 84 *start = ++*end; 85 gly = ESCAPE_SPECIAL; 86 term = '\''; 87 break; 88 89 /* 90 * Escapes taking no arguments at all. 91 */ 92 case 'd': 93 case 'u': 94 case ',': 95 case '/': 96 return ESCAPE_IGNORE; 97 case 'p': 98 return ESCAPE_BREAK; 99 100 /* 101 * The \z escape is supposed to output the following 102 * character without advancing the cursor position. 103 * Since we are mostly dealing with terminal mode, 104 * let us just skip the next character. 105 */ 106 case 'z': 107 return ESCAPE_SKIPCHAR; 108 109 /* 110 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 111 * 'X' is the trigger. These have opaque sub-strings. 112 */ 113 case 'F': 114 case 'g': 115 case 'k': 116 case 'M': 117 case 'm': 118 case 'n': 119 case 'V': 120 case 'Y': 121 gly = ESCAPE_IGNORE; 122 /* FALLTHROUGH */ 123 case 'f': 124 if (ESCAPE_ERROR == gly) 125 gly = ESCAPE_FONT; 126 switch (**start) { 127 case '(': 128 *start = ++*end; 129 *sz = 2; 130 break; 131 case '[': 132 *start = ++*end; 133 term = ']'; 134 break; 135 default: 136 *sz = 1; 137 break; 138 } 139 break; 140 141 /* 142 * These escapes are of the form \X'Y', where 'X' is the trigger 143 * and 'Y' is any string. These have opaque sub-strings. 144 * The \B and \w escapes are handled in roff.c, roff_res(). 145 */ 146 case 'A': 147 case 'b': 148 case 'D': 149 case 'R': 150 case 'X': 151 case 'Z': 152 gly = ESCAPE_IGNORE; 153 /* FALLTHROUGH */ 154 case 'o': 155 if (**start == '\0') 156 return ESCAPE_ERROR; 157 if (gly == ESCAPE_ERROR) 158 gly = ESCAPE_OVERSTRIKE; 159 term = **start; 160 *start = ++*end; 161 break; 162 163 /* 164 * These escapes are of the form \X'N', where 'X' is the trigger 165 * and 'N' resolves to a numerical expression. 166 */ 167 case 'h': 168 case 'H': 169 case 'L': 170 case 'l': 171 case 'S': 172 case 'v': 173 case 'x': 174 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 175 if ('\0' != **start) 176 ++*end; 177 return ESCAPE_ERROR; 178 } 179 switch ((*start)[-1]) { 180 case 'h': 181 gly = ESCAPE_HORIZ; 182 break; 183 case 'l': 184 gly = ESCAPE_HLINE; 185 break; 186 default: 187 gly = ESCAPE_IGNORE; 188 break; 189 } 190 term = **start; 191 *start = ++*end; 192 break; 193 194 /* 195 * Special handling for the numbered character escape. 196 * XXX Do any other escapes need similar handling? 197 */ 198 case 'N': 199 if ('\0' == **start) 200 return ESCAPE_ERROR; 201 (*end)++; 202 if (isdigit((unsigned char)**start)) { 203 *sz = 1; 204 return ESCAPE_IGNORE; 205 } 206 (*start)++; 207 while (isdigit((unsigned char)**end)) 208 (*end)++; 209 *sz = *end - *start; 210 if ('\0' != **end) 211 (*end)++; 212 return ESCAPE_NUMBERED; 213 214 /* 215 * Sizes get a special category of their own. 216 */ 217 case 's': 218 gly = ESCAPE_IGNORE; 219 220 /* See +/- counts as a sign. */ 221 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 222 *start = ++*end; 223 224 switch (**end) { 225 case '(': 226 *start = ++*end; 227 *sz = 2; 228 break; 229 case '[': 230 *start = ++*end; 231 term = ']'; 232 break; 233 case '\'': 234 *start = ++*end; 235 term = '\''; 236 break; 237 case '3': 238 case '2': 239 case '1': 240 *sz = (*end)[-1] == 's' && 241 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 242 break; 243 default: 244 *sz = 1; 245 break; 246 } 247 248 break; 249 250 /* 251 * Anything else is assumed to be a glyph. 252 * In this case, pass back the character after the backslash. 253 */ 254 default: 255 gly = ESCAPE_SPECIAL; 256 *start = --*end; 257 *sz = 1; 258 break; 259 } 260 261 assert(ESCAPE_ERROR != gly); 262 263 /* 264 * Read up to the terminating character, 265 * paying attention to nested escapes. 266 */ 267 268 if ('\0' != term) { 269 while (**end != term) { 270 switch (**end) { 271 case '\0': 272 return ESCAPE_ERROR; 273 case '\\': 274 (*end)++; 275 if (ESCAPE_ERROR == 276 mandoc_escape(end, NULL, NULL)) 277 return ESCAPE_ERROR; 278 break; 279 default: 280 (*end)++; 281 break; 282 } 283 } 284 *sz = (*end)++ - *start; 285 } else { 286 assert(*sz > 0); 287 if ((size_t)*sz > strlen(*start)) 288 return ESCAPE_ERROR; 289 *end += *sz; 290 } 291 292 /* Run post-processors. */ 293 294 switch (gly) { 295 case ESCAPE_FONT: 296 if (2 == *sz) { 297 if ('C' == **start) { 298 /* 299 * Treat constant-width font modes 300 * just like regular font modes. 301 */ 302 (*start)++; 303 (*sz)--; 304 } else { 305 if ('B' == (*start)[0] && 'I' == (*start)[1]) 306 gly = ESCAPE_FONTBI; 307 break; 308 } 309 } else if (1 != *sz) 310 break; 311 312 switch (**start) { 313 case '3': 314 case 'B': 315 gly = ESCAPE_FONTBOLD; 316 break; 317 case '2': 318 case 'I': 319 gly = ESCAPE_FONTITALIC; 320 break; 321 case 'P': 322 gly = ESCAPE_FONTPREV; 323 break; 324 case '1': 325 case 'R': 326 gly = ESCAPE_FONTROMAN; 327 break; 328 } 329 break; 330 case ESCAPE_SPECIAL: 331 if (1 == *sz && 'c' == **start) 332 gly = ESCAPE_NOSPACE; 333 /* 334 * Unicode escapes are defined in groff as \[u0000] 335 * to \[u10FFFF], where the contained value must be 336 * a valid Unicode codepoint. Here, however, only 337 * check the length and range. 338 */ 339 if (**start != 'u' || *sz < 5 || *sz > 7) 340 break; 341 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 342 break; 343 if (*sz == 6 && (*start)[1] == '0') 344 break; 345 if (*sz == 5 && (*start)[1] == 'D' && 346 strchr("89ABCDEF", (*start)[2]) != NULL) 347 break; 348 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 349 + 1 == *sz) 350 gly = ESCAPE_UNICODE; 351 break; 352 default: 353 break; 354 } 355 356 return gly; 357 } 358 359 /* 360 * Parse a quoted or unquoted roff-style request or macro argument. 361 * Return a pointer to the parsed argument, which is either the original 362 * pointer or advanced by one byte in case the argument is quoted. 363 * NUL-terminate the argument in place. 364 * Collapse pairs of quotes inside quoted arguments. 365 * Advance the argument pointer to the next argument, 366 * or to the NUL byte terminating the argument line. 367 */ 368 char * 369 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 370 { 371 char *start, *cp; 372 int quoted, pairs, white; 373 374 /* Quoting can only start with a new word. */ 375 start = *cpp; 376 quoted = 0; 377 if ('"' == *start) { 378 quoted = 1; 379 start++; 380 } 381 382 pairs = 0; 383 white = 0; 384 for (cp = start; '\0' != *cp; cp++) { 385 386 /* 387 * Move the following text left 388 * after quoted quotes and after "\\" and "\t". 389 */ 390 if (pairs) 391 cp[-pairs] = cp[0]; 392 393 if ('\\' == cp[0]) { 394 /* 395 * In copy mode, translate double to single 396 * backslashes and backslash-t to literal tabs. 397 */ 398 switch (cp[1]) { 399 case 't': 400 cp[0] = '\t'; 401 /* FALLTHROUGH */ 402 case '\\': 403 pairs++; 404 cp++; 405 break; 406 case ' ': 407 /* Skip escaped blanks. */ 408 if (0 == quoted) 409 cp++; 410 break; 411 default: 412 break; 413 } 414 } else if (0 == quoted) { 415 if (' ' == cp[0]) { 416 /* Unescaped blanks end unquoted args. */ 417 white = 1; 418 break; 419 } 420 } else if ('"' == cp[0]) { 421 if ('"' == cp[1]) { 422 /* Quoted quotes collapse. */ 423 pairs++; 424 cp++; 425 } else { 426 /* Unquoted quotes end quoted args. */ 427 quoted = 2; 428 break; 429 } 430 } 431 } 432 433 /* Quoted argument without a closing quote. */ 434 if (1 == quoted) 435 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL); 436 437 /* NUL-terminate this argument and move to the next one. */ 438 if (pairs) 439 cp[-pairs] = '\0'; 440 if ('\0' != *cp) { 441 *cp++ = '\0'; 442 while (' ' == *cp) 443 cp++; 444 } 445 *pos += (int)(cp - start) + (quoted ? 1 : 0); 446 *cpp = cp; 447 448 if ('\0' == *cp && (white || ' ' == cp[-1])) 449 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL); 450 451 return start; 452 } 453 454 static int 455 a2time(time_t *t, const char *fmt, const char *p) 456 { 457 struct tm tm; 458 char *pp; 459 460 memset(&tm, 0, sizeof(struct tm)); 461 462 pp = strptime(p, fmt, &tm); 463 if (NULL != pp && '\0' == *pp) { 464 *t = mktime(&tm); 465 return 1; 466 } 467 468 return 0; 469 } 470 471 static char * 472 time2a(time_t t) 473 { 474 struct tm *tm; 475 char *buf, *p; 476 size_t ssz; 477 int isz; 478 479 tm = localtime(&t); 480 if (tm == NULL) 481 return NULL; 482 483 /* 484 * Reserve space: 485 * up to 9 characters for the month (September) + blank 486 * up to 2 characters for the day + comma + blank 487 * 4 characters for the year and a terminating '\0' 488 */ 489 490 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 491 492 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 493 goto fail; 494 p += (int)ssz; 495 496 /* 497 * The output format is just "%d" here, not "%2d" or "%02d". 498 * That's also the reason why we can't just format the 499 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 500 * Besides, the present approach is less prone to buffer 501 * overflows, in case anybody should ever introduce the bug 502 * of looking at LC_TIME. 503 */ 504 505 if ((isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)) == -1) 506 goto fail; 507 p += isz; 508 509 if (strftime(p, 4 + 1, "%Y", tm) == 0) 510 goto fail; 511 return buf; 512 513 fail: 514 free(buf); 515 return NULL; 516 } 517 518 char * 519 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos) 520 { 521 char *cp; 522 time_t t; 523 524 /* No date specified: use today's date. */ 525 526 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) { 527 mandoc_msg(MANDOCERR_DATE_MISSING, man->parse, ln, pos, NULL); 528 return time2a(time(NULL)); 529 } 530 531 /* Valid mdoc(7) date format. */ 532 533 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) || 534 a2time(&t, "%b %d, %Y", in)) { 535 cp = time2a(t); 536 if (t > time(NULL) + 86400) 537 mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, 538 ln, pos, cp); 539 return cp; 540 } 541 542 /* In man(7), do not warn about the legacy format. */ 543 544 if (a2time(&t, "%Y-%m-%d", in) == 0) 545 mandoc_msg(MANDOCERR_DATE_BAD, man->parse, ln, pos, in); 546 else if (t > time(NULL) + 86400) 547 mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, ln, pos, in); 548 else if (man->macroset == MACROSET_MDOC) 549 mandoc_vmsg(MANDOCERR_DATE_LEGACY, man->parse, 550 ln, pos, "Dd %s", in); 551 552 /* Use any non-mdoc(7) date verbatim. */ 553 554 return mandoc_strdup(in); 555 } 556 557 int 558 mandoc_eos(const char *p, size_t sz) 559 { 560 const char *q; 561 int enclosed, found; 562 563 if (0 == sz) 564 return 0; 565 566 /* 567 * End-of-sentence recognition must include situations where 568 * some symbols, such as `)', allow prior EOS punctuation to 569 * propagate outward. 570 */ 571 572 enclosed = found = 0; 573 for (q = p + (int)sz - 1; q >= p; q--) { 574 switch (*q) { 575 case '\"': 576 case '\'': 577 case ']': 578 case ')': 579 if (0 == found) 580 enclosed = 1; 581 break; 582 case '.': 583 case '!': 584 case '?': 585 found = 1; 586 break; 587 default: 588 return found && 589 (!enclosed || isalnum((unsigned char)*q)); 590 } 591 } 592 593 return found && !enclosed; 594 } 595 596 /* 597 * Convert a string to a long that may not be <0. 598 * If the string is invalid, or is less than 0, return -1. 599 */ 600 int 601 mandoc_strntoi(const char *p, size_t sz, int base) 602 { 603 char buf[32]; 604 char *ep; 605 long v; 606 607 if (sz > 31) 608 return -1; 609 610 memcpy(buf, p, sz); 611 buf[(int)sz] = '\0'; 612 613 errno = 0; 614 v = strtol(buf, &ep, base); 615 616 if (buf[0] == '\0' || *ep != '\0') 617 return -1; 618 619 if (v > INT_MAX) 620 v = INT_MAX; 621 if (v < INT_MIN) 622 v = INT_MIN; 623 624 return (int)v; 625 } 626