1 /* $OpenBSD: mandoc.c,v 1.66 2015/11/12 22:43:30 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011-2015 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc.h" 30 #include "mandoc_aux.h" 31 #include "libmandoc.h" 32 33 static int a2time(time_t *, const char *, const char *); 34 static char *time2a(time_t); 35 36 37 enum mandoc_esc 38 mandoc_escape(const char **end, const char **start, int *sz) 39 { 40 const char *local_start; 41 int local_sz; 42 char term; 43 enum mandoc_esc gly; 44 45 /* 46 * When the caller doesn't provide return storage, 47 * use local storage. 48 */ 49 50 if (NULL == start) 51 start = &local_start; 52 if (NULL == sz) 53 sz = &local_sz; 54 55 /* 56 * Beyond the backslash, at least one input character 57 * is part of the escape sequence. With one exception 58 * (see below), that character won't be returned. 59 */ 60 61 gly = ESCAPE_ERROR; 62 *start = ++*end; 63 *sz = 0; 64 term = '\0'; 65 66 switch ((*start)[-1]) { 67 /* 68 * First the glyphs. There are several different forms of 69 * these, but each eventually returns a substring of the glyph 70 * name. 71 */ 72 case '(': 73 gly = ESCAPE_SPECIAL; 74 *sz = 2; 75 break; 76 case '[': 77 gly = ESCAPE_SPECIAL; 78 term = ']'; 79 break; 80 case 'C': 81 if ('\'' != **start) 82 return ESCAPE_ERROR; 83 *start = ++*end; 84 gly = ESCAPE_SPECIAL; 85 term = '\''; 86 break; 87 88 /* 89 * Escapes taking no arguments at all. 90 */ 91 case 'd': 92 case 'u': 93 case ',': 94 case '/': 95 return ESCAPE_IGNORE; 96 97 /* 98 * The \z escape is supposed to output the following 99 * character without advancing the cursor position. 100 * Since we are mostly dealing with terminal mode, 101 * let us just skip the next character. 102 */ 103 case 'z': 104 return ESCAPE_SKIPCHAR; 105 106 /* 107 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 108 * 'X' is the trigger. These have opaque sub-strings. 109 */ 110 case 'F': 111 case 'g': 112 case 'k': 113 case 'M': 114 case 'm': 115 case 'n': 116 case 'V': 117 case 'Y': 118 gly = ESCAPE_IGNORE; 119 /* FALLTHROUGH */ 120 case 'f': 121 if (ESCAPE_ERROR == gly) 122 gly = ESCAPE_FONT; 123 switch (**start) { 124 case '(': 125 *start = ++*end; 126 *sz = 2; 127 break; 128 case '[': 129 *start = ++*end; 130 term = ']'; 131 break; 132 default: 133 *sz = 1; 134 break; 135 } 136 break; 137 138 /* 139 * These escapes are of the form \X'Y', where 'X' is the trigger 140 * and 'Y' is any string. These have opaque sub-strings. 141 * The \B and \w escapes are handled in roff.c, roff_res(). 142 */ 143 case 'A': 144 case 'b': 145 case 'D': 146 case 'R': 147 case 'X': 148 case 'Z': 149 gly = ESCAPE_IGNORE; 150 /* FALLTHROUGH */ 151 case 'o': 152 if (**start == '\0') 153 return ESCAPE_ERROR; 154 if (gly == ESCAPE_ERROR) 155 gly = ESCAPE_OVERSTRIKE; 156 term = **start; 157 *start = ++*end; 158 break; 159 160 /* 161 * These escapes are of the form \X'N', where 'X' is the trigger 162 * and 'N' resolves to a numerical expression. 163 */ 164 case 'h': 165 case 'H': 166 case 'L': 167 case 'l': 168 case 'S': 169 case 'v': 170 case 'x': 171 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 172 if ('\0' != **start) 173 ++*end; 174 return ESCAPE_ERROR; 175 } 176 gly = ESCAPE_IGNORE; 177 term = **start; 178 *start = ++*end; 179 break; 180 181 /* 182 * Special handling for the numbered character escape. 183 * XXX Do any other escapes need similar handling? 184 */ 185 case 'N': 186 if ('\0' == **start) 187 return ESCAPE_ERROR; 188 (*end)++; 189 if (isdigit((unsigned char)**start)) { 190 *sz = 1; 191 return ESCAPE_IGNORE; 192 } 193 (*start)++; 194 while (isdigit((unsigned char)**end)) 195 (*end)++; 196 *sz = *end - *start; 197 if ('\0' != **end) 198 (*end)++; 199 return ESCAPE_NUMBERED; 200 201 /* 202 * Sizes get a special category of their own. 203 */ 204 case 's': 205 gly = ESCAPE_IGNORE; 206 207 /* See +/- counts as a sign. */ 208 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 209 *start = ++*end; 210 211 switch (**end) { 212 case '(': 213 *start = ++*end; 214 *sz = 2; 215 break; 216 case '[': 217 *start = ++*end; 218 term = ']'; 219 break; 220 case '\'': 221 *start = ++*end; 222 term = '\''; 223 break; 224 case '3': 225 case '2': 226 case '1': 227 *sz = (*end)[-1] == 's' && 228 isdigit((unsigned char)(*end)[1]) ? 2 : 1; 229 break; 230 default: 231 *sz = 1; 232 break; 233 } 234 235 break; 236 237 /* 238 * Anything else is assumed to be a glyph. 239 * In this case, pass back the character after the backslash. 240 */ 241 default: 242 gly = ESCAPE_SPECIAL; 243 *start = --*end; 244 *sz = 1; 245 break; 246 } 247 248 assert(ESCAPE_ERROR != gly); 249 250 /* 251 * Read up to the terminating character, 252 * paying attention to nested escapes. 253 */ 254 255 if ('\0' != term) { 256 while (**end != term) { 257 switch (**end) { 258 case '\0': 259 return ESCAPE_ERROR; 260 case '\\': 261 (*end)++; 262 if (ESCAPE_ERROR == 263 mandoc_escape(end, NULL, NULL)) 264 return ESCAPE_ERROR; 265 break; 266 default: 267 (*end)++; 268 break; 269 } 270 } 271 *sz = (*end)++ - *start; 272 } else { 273 assert(*sz > 0); 274 if ((size_t)*sz > strlen(*start)) 275 return ESCAPE_ERROR; 276 *end += *sz; 277 } 278 279 /* Run post-processors. */ 280 281 switch (gly) { 282 case ESCAPE_FONT: 283 if (2 == *sz) { 284 if ('C' == **start) { 285 /* 286 * Treat constant-width font modes 287 * just like regular font modes. 288 */ 289 (*start)++; 290 (*sz)--; 291 } else { 292 if ('B' == (*start)[0] && 'I' == (*start)[1]) 293 gly = ESCAPE_FONTBI; 294 break; 295 } 296 } else if (1 != *sz) 297 break; 298 299 switch (**start) { 300 case '3': 301 case 'B': 302 gly = ESCAPE_FONTBOLD; 303 break; 304 case '2': 305 case 'I': 306 gly = ESCAPE_FONTITALIC; 307 break; 308 case 'P': 309 gly = ESCAPE_FONTPREV; 310 break; 311 case '1': 312 case 'R': 313 gly = ESCAPE_FONTROMAN; 314 break; 315 } 316 break; 317 case ESCAPE_SPECIAL: 318 if (1 == *sz && 'c' == **start) 319 gly = ESCAPE_NOSPACE; 320 /* 321 * Unicode escapes are defined in groff as \[u0000] 322 * to \[u10FFFF], where the contained value must be 323 * a valid Unicode codepoint. Here, however, only 324 * check the length and range. 325 */ 326 if (**start != 'u' || *sz < 5 || *sz > 7) 327 break; 328 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 329 break; 330 if (*sz == 6 && (*start)[1] == '0') 331 break; 332 if (*sz == 5 && (*start)[1] == 'D' && 333 strchr("89ABCDEF", (*start)[2]) != NULL) 334 break; 335 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 336 + 1 == *sz) 337 gly = ESCAPE_UNICODE; 338 break; 339 default: 340 break; 341 } 342 343 return gly; 344 } 345 346 /* 347 * Parse a quoted or unquoted roff-style request or macro argument. 348 * Return a pointer to the parsed argument, which is either the original 349 * pointer or advanced by one byte in case the argument is quoted. 350 * NUL-terminate the argument in place. 351 * Collapse pairs of quotes inside quoted arguments. 352 * Advance the argument pointer to the next argument, 353 * or to the NUL byte terminating the argument line. 354 */ 355 char * 356 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 357 { 358 char *start, *cp; 359 int quoted, pairs, white; 360 361 /* Quoting can only start with a new word. */ 362 start = *cpp; 363 quoted = 0; 364 if ('"' == *start) { 365 quoted = 1; 366 start++; 367 } 368 369 pairs = 0; 370 white = 0; 371 for (cp = start; '\0' != *cp; cp++) { 372 373 /* 374 * Move the following text left 375 * after quoted quotes and after "\\" and "\t". 376 */ 377 if (pairs) 378 cp[-pairs] = cp[0]; 379 380 if ('\\' == cp[0]) { 381 /* 382 * In copy mode, translate double to single 383 * backslashes and backslash-t to literal tabs. 384 */ 385 switch (cp[1]) { 386 case 't': 387 cp[0] = '\t'; 388 /* FALLTHROUGH */ 389 case '\\': 390 pairs++; 391 cp++; 392 break; 393 case ' ': 394 /* Skip escaped blanks. */ 395 if (0 == quoted) 396 cp++; 397 break; 398 default: 399 break; 400 } 401 } else if (0 == quoted) { 402 if (' ' == cp[0]) { 403 /* Unescaped blanks end unquoted args. */ 404 white = 1; 405 break; 406 } 407 } else if ('"' == cp[0]) { 408 if ('"' == cp[1]) { 409 /* Quoted quotes collapse. */ 410 pairs++; 411 cp++; 412 } else { 413 /* Unquoted quotes end quoted args. */ 414 quoted = 2; 415 break; 416 } 417 } 418 } 419 420 /* Quoted argument without a closing quote. */ 421 if (1 == quoted) 422 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL); 423 424 /* NUL-terminate this argument and move to the next one. */ 425 if (pairs) 426 cp[-pairs] = '\0'; 427 if ('\0' != *cp) { 428 *cp++ = '\0'; 429 while (' ' == *cp) 430 cp++; 431 } 432 *pos += (int)(cp - start) + (quoted ? 1 : 0); 433 *cpp = cp; 434 435 if ('\0' == *cp && (white || ' ' == cp[-1])) 436 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL); 437 438 return start; 439 } 440 441 static int 442 a2time(time_t *t, const char *fmt, const char *p) 443 { 444 struct tm tm; 445 char *pp; 446 447 memset(&tm, 0, sizeof(struct tm)); 448 449 pp = strptime(p, fmt, &tm); 450 if (NULL != pp && '\0' == *pp) { 451 *t = mktime(&tm); 452 return 1; 453 } 454 455 return 0; 456 } 457 458 static char * 459 time2a(time_t t) 460 { 461 struct tm *tm; 462 char *buf, *p; 463 size_t ssz; 464 int isz; 465 466 tm = localtime(&t); 467 if (tm == NULL) 468 return NULL; 469 470 /* 471 * Reserve space: 472 * up to 9 characters for the month (September) + blank 473 * up to 2 characters for the day + comma + blank 474 * 4 characters for the year and a terminating '\0' 475 */ 476 477 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 478 479 if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0) 480 goto fail; 481 p += (int)ssz; 482 483 /* 484 * The output format is just "%d" here, not "%2d" or "%02d". 485 * That's also the reason why we can't just format the 486 * date as a whole with "%B %e, %Y" or "%B %d, %Y". 487 * Besides, the present approach is less prone to buffer 488 * overflows, in case anybody should ever introduce the bug 489 * of looking at LC_TIME. 490 */ 491 492 if ((isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)) == -1) 493 goto fail; 494 p += isz; 495 496 if (strftime(p, 4 + 1, "%Y", tm) == 0) 497 goto fail; 498 return buf; 499 500 fail: 501 free(buf); 502 return NULL; 503 } 504 505 char * 506 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 507 { 508 time_t t; 509 510 /* No date specified: use today's date. */ 511 512 if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) { 513 mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL); 514 return time2a(time(NULL)); 515 } 516 517 /* Valid mdoc(7) date format. */ 518 519 if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) || 520 a2time(&t, "%b %d, %Y", in)) 521 return time2a(t); 522 523 /* Do not warn about the legacy man(7) format. */ 524 525 if ( ! a2time(&t, "%Y-%m-%d", in)) 526 mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in); 527 528 /* Use any non-mdoc(7) date verbatim. */ 529 530 return mandoc_strdup(in); 531 } 532 533 int 534 mandoc_eos(const char *p, size_t sz) 535 { 536 const char *q; 537 int enclosed, found; 538 539 if (0 == sz) 540 return 0; 541 542 /* 543 * End-of-sentence recognition must include situations where 544 * some symbols, such as `)', allow prior EOS punctuation to 545 * propagate outward. 546 */ 547 548 enclosed = found = 0; 549 for (q = p + (int)sz - 1; q >= p; q--) { 550 switch (*q) { 551 case '\"': 552 case '\'': 553 case ']': 554 case ')': 555 if (0 == found) 556 enclosed = 1; 557 break; 558 case '.': 559 case '!': 560 case '?': 561 found = 1; 562 break; 563 default: 564 return found && 565 (!enclosed || isalnum((unsigned char)*q)); 566 } 567 } 568 569 return found && !enclosed; 570 } 571 572 /* 573 * Convert a string to a long that may not be <0. 574 * If the string is invalid, or is less than 0, return -1. 575 */ 576 int 577 mandoc_strntoi(const char *p, size_t sz, int base) 578 { 579 char buf[32]; 580 char *ep; 581 long v; 582 583 if (sz > 31) 584 return -1; 585 586 memcpy(buf, p, sz); 587 buf[(int)sz] = '\0'; 588 589 errno = 0; 590 v = strtol(buf, &ep, base); 591 592 if (buf[0] == '\0' || *ep != '\0') 593 return -1; 594 595 if (v > INT_MAX) 596 v = INT_MAX; 597 if (v < INT_MIN) 598 v = INT_MIN; 599 600 return (int)v; 601 } 602