1 /* $OpenBSD: mandoc.c,v 1.56 2014/11/28 19:25:03 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc.h" 30 #include "mandoc_aux.h" 31 #include "libmandoc.h" 32 33 #define DATESIZE 32 34 35 static int a2time(time_t *, const char *, const char *); 36 static char *time2a(time_t); 37 38 39 enum mandoc_esc 40 mandoc_escape(const char **end, const char **start, int *sz) 41 { 42 const char *local_start; 43 int local_sz; 44 char term; 45 enum mandoc_esc gly; 46 47 /* 48 * When the caller doesn't provide return storage, 49 * use local storage. 50 */ 51 52 if (NULL == start) 53 start = &local_start; 54 if (NULL == sz) 55 sz = &local_sz; 56 57 /* 58 * Beyond the backslash, at least one input character 59 * is part of the escape sequence. With one exception 60 * (see below), that character won't be returned. 61 */ 62 63 gly = ESCAPE_ERROR; 64 *start = ++*end; 65 *sz = 0; 66 term = '\0'; 67 68 switch ((*start)[-1]) { 69 /* 70 * First the glyphs. There are several different forms of 71 * these, but each eventually returns a substring of the glyph 72 * name. 73 */ 74 case '(': 75 gly = ESCAPE_SPECIAL; 76 *sz = 2; 77 break; 78 case '[': 79 gly = ESCAPE_SPECIAL; 80 term = ']'; 81 break; 82 case 'C': 83 if ('\'' != **start) 84 return(ESCAPE_ERROR); 85 *start = ++*end; 86 gly = ESCAPE_SPECIAL; 87 term = '\''; 88 break; 89 90 /* 91 * Escapes taking no arguments at all. 92 */ 93 case 'd': 94 /* FALLTHROUGH */ 95 case 'u': 96 return(ESCAPE_IGNORE); 97 98 /* 99 * The \z escape is supposed to output the following 100 * character without advancing the cursor position. 101 * Since we are mostly dealing with terminal mode, 102 * let us just skip the next character. 103 */ 104 case 'z': 105 return(ESCAPE_SKIPCHAR); 106 107 /* 108 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 109 * 'X' is the trigger. These have opaque sub-strings. 110 */ 111 case 'F': 112 /* FALLTHROUGH */ 113 case 'g': 114 /* FALLTHROUGH */ 115 case 'k': 116 /* FALLTHROUGH */ 117 case 'M': 118 /* FALLTHROUGH */ 119 case 'm': 120 /* FALLTHROUGH */ 121 case 'n': 122 /* FALLTHROUGH */ 123 case 'V': 124 /* FALLTHROUGH */ 125 case 'Y': 126 gly = ESCAPE_IGNORE; 127 /* FALLTHROUGH */ 128 case 'f': 129 if (ESCAPE_ERROR == gly) 130 gly = ESCAPE_FONT; 131 switch (**start) { 132 case '(': 133 *start = ++*end; 134 *sz = 2; 135 break; 136 case '[': 137 *start = ++*end; 138 term = ']'; 139 break; 140 default: 141 *sz = 1; 142 break; 143 } 144 break; 145 146 /* 147 * These escapes are of the form \X'Y', where 'X' is the trigger 148 * and 'Y' is any string. These have opaque sub-strings. 149 * The \B and \w escapes are handled in roff.c, roff_res(). 150 */ 151 case 'A': 152 /* FALLTHROUGH */ 153 case 'b': 154 /* FALLTHROUGH */ 155 case 'D': 156 /* FALLTHROUGH */ 157 case 'o': 158 /* FALLTHROUGH */ 159 case 'R': 160 /* FALLTHROUGH */ 161 case 'X': 162 /* FALLTHROUGH */ 163 case 'Z': 164 if ('\0' == **start) 165 return(ESCAPE_ERROR); 166 gly = ESCAPE_IGNORE; 167 term = **start; 168 *start = ++*end; 169 break; 170 171 /* 172 * These escapes are of the form \X'N', where 'X' is the trigger 173 * and 'N' resolves to a numerical expression. 174 */ 175 case 'h': 176 /* FALLTHROUGH */ 177 case 'H': 178 /* FALLTHROUGH */ 179 case 'L': 180 /* FALLTHROUGH */ 181 case 'l': 182 /* FALLTHROUGH */ 183 case 'S': 184 /* FALLTHROUGH */ 185 case 'v': 186 /* FALLTHROUGH */ 187 case 'x': 188 if (strchr(" %&()*+-./0123456789:<=>", **start)) { 189 if ('\0' != **start) 190 ++*end; 191 return(ESCAPE_ERROR); 192 } 193 gly = ESCAPE_IGNORE; 194 term = **start; 195 *start = ++*end; 196 break; 197 198 /* 199 * Special handling for the numbered character escape. 200 * XXX Do any other escapes need similar handling? 201 */ 202 case 'N': 203 if ('\0' == **start) 204 return(ESCAPE_ERROR); 205 (*end)++; 206 if (isdigit((unsigned char)**start)) { 207 *sz = 1; 208 return(ESCAPE_IGNORE); 209 } 210 (*start)++; 211 while (isdigit((unsigned char)**end)) 212 (*end)++; 213 *sz = *end - *start; 214 if ('\0' != **end) 215 (*end)++; 216 return(ESCAPE_NUMBERED); 217 218 /* 219 * Sizes get a special category of their own. 220 */ 221 case 's': 222 gly = ESCAPE_IGNORE; 223 224 /* See +/- counts as a sign. */ 225 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 226 (*end)++; 227 228 switch (**end) { 229 case '(': 230 *start = ++*end; 231 *sz = 2; 232 break; 233 case '[': 234 *start = ++*end; 235 term = ']'; 236 break; 237 case '\'': 238 *start = ++*end; 239 term = '\''; 240 break; 241 default: 242 *sz = 1; 243 break; 244 } 245 246 break; 247 248 /* 249 * Anything else is assumed to be a glyph. 250 * In this case, pass back the character after the backslash. 251 */ 252 default: 253 gly = ESCAPE_SPECIAL; 254 *start = --*end; 255 *sz = 1; 256 break; 257 } 258 259 assert(ESCAPE_ERROR != gly); 260 261 /* 262 * Read up to the terminating character, 263 * paying attention to nested escapes. 264 */ 265 266 if ('\0' != term) { 267 while (**end != term) { 268 switch (**end) { 269 case '\0': 270 return(ESCAPE_ERROR); 271 case '\\': 272 (*end)++; 273 if (ESCAPE_ERROR == 274 mandoc_escape(end, NULL, NULL)) 275 return(ESCAPE_ERROR); 276 break; 277 default: 278 (*end)++; 279 break; 280 } 281 } 282 *sz = (*end)++ - *start; 283 } else { 284 assert(*sz > 0); 285 if ((size_t)*sz > strlen(*start)) 286 return(ESCAPE_ERROR); 287 *end += *sz; 288 } 289 290 /* Run post-processors. */ 291 292 switch (gly) { 293 case ESCAPE_FONT: 294 if (2 == *sz) { 295 if ('C' == **start) { 296 /* 297 * Treat constant-width font modes 298 * just like regular font modes. 299 */ 300 (*start)++; 301 (*sz)--; 302 } else { 303 if ('B' == (*start)[0] && 'I' == (*start)[1]) 304 gly = ESCAPE_FONTBI; 305 break; 306 } 307 } else if (1 != *sz) 308 break; 309 310 switch (**start) { 311 case '3': 312 /* FALLTHROUGH */ 313 case 'B': 314 gly = ESCAPE_FONTBOLD; 315 break; 316 case '2': 317 /* FALLTHROUGH */ 318 case 'I': 319 gly = ESCAPE_FONTITALIC; 320 break; 321 case 'P': 322 gly = ESCAPE_FONTPREV; 323 break; 324 case '1': 325 /* FALLTHROUGH */ 326 case 'R': 327 gly = ESCAPE_FONTROMAN; 328 break; 329 } 330 break; 331 case ESCAPE_SPECIAL: 332 if (1 == *sz && 'c' == **start) 333 gly = ESCAPE_NOSPACE; 334 /* 335 * Unicode escapes are defined in groff as \[u0000] 336 * to \[u10FFFF], where the contained value must be 337 * a valid Unicode codepoint. Here, however, only 338 * check the length and range. 339 */ 340 if (**start != 'u' || *sz < 5 || *sz > 7) 341 break; 342 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0')) 343 break; 344 if (*sz == 6 && (*start)[1] == '0') 345 break; 346 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef") 347 + 1 == *sz) 348 gly = ESCAPE_UNICODE; 349 break; 350 default: 351 break; 352 } 353 354 return(gly); 355 } 356 357 /* 358 * Parse a quoted or unquoted roff-style request or macro argument. 359 * Return a pointer to the parsed argument, which is either the original 360 * pointer or advanced by one byte in case the argument is quoted. 361 * NUL-terminate the argument in place. 362 * Collapse pairs of quotes inside quoted arguments. 363 * Advance the argument pointer to the next argument, 364 * or to the NUL byte terminating the argument line. 365 */ 366 char * 367 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 368 { 369 char *start, *cp; 370 int quoted, pairs, white; 371 372 /* Quoting can only start with a new word. */ 373 start = *cpp; 374 quoted = 0; 375 if ('"' == *start) { 376 quoted = 1; 377 start++; 378 } 379 380 pairs = 0; 381 white = 0; 382 for (cp = start; '\0' != *cp; cp++) { 383 384 /* 385 * Move the following text left 386 * after quoted quotes and after "\\" and "\t". 387 */ 388 if (pairs) 389 cp[-pairs] = cp[0]; 390 391 if ('\\' == cp[0]) { 392 /* 393 * In copy mode, translate double to single 394 * backslashes and backslash-t to literal tabs. 395 */ 396 switch (cp[1]) { 397 case 't': 398 cp[0] = '\t'; 399 /* FALLTHROUGH */ 400 case '\\': 401 pairs++; 402 cp++; 403 break; 404 case ' ': 405 /* Skip escaped blanks. */ 406 if (0 == quoted) 407 cp++; 408 break; 409 default: 410 break; 411 } 412 } else if (0 == quoted) { 413 if (' ' == cp[0]) { 414 /* Unescaped blanks end unquoted args. */ 415 white = 1; 416 break; 417 } 418 } else if ('"' == cp[0]) { 419 if ('"' == cp[1]) { 420 /* Quoted quotes collapse. */ 421 pairs++; 422 cp++; 423 } else { 424 /* Unquoted quotes end quoted args. */ 425 quoted = 2; 426 break; 427 } 428 } 429 } 430 431 /* Quoted argument without a closing quote. */ 432 if (1 == quoted) 433 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL); 434 435 /* NUL-terminate this argument and move to the next one. */ 436 if (pairs) 437 cp[-pairs] = '\0'; 438 if ('\0' != *cp) { 439 *cp++ = '\0'; 440 while (' ' == *cp) 441 cp++; 442 } 443 *pos += (int)(cp - start) + (quoted ? 1 : 0); 444 *cpp = cp; 445 446 if ('\0' == *cp && (white || ' ' == cp[-1])) 447 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL); 448 449 return(start); 450 } 451 452 static int 453 a2time(time_t *t, const char *fmt, const char *p) 454 { 455 struct tm tm; 456 char *pp; 457 458 memset(&tm, 0, sizeof(struct tm)); 459 460 pp = strptime(p, fmt, &tm); 461 if (NULL != pp && '\0' == *pp) { 462 *t = mktime(&tm); 463 return(1); 464 } 465 466 return(0); 467 } 468 469 static char * 470 time2a(time_t t) 471 { 472 struct tm *tm; 473 char *buf, *p; 474 size_t ssz; 475 int isz; 476 477 tm = localtime(&t); 478 479 /* 480 * Reserve space: 481 * up to 9 characters for the month (September) + blank 482 * up to 2 characters for the day + comma + blank 483 * 4 characters for the year and a terminating '\0' 484 */ 485 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 486 487 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 488 goto fail; 489 p += (int)ssz; 490 491 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 492 goto fail; 493 p += isz; 494 495 if (0 == strftime(p, 4 + 1, "%Y", tm)) 496 goto fail; 497 return(buf); 498 499 fail: 500 free(buf); 501 return(NULL); 502 } 503 504 char * 505 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 506 { 507 char *out; 508 time_t t; 509 510 if (NULL == in || '\0' == *in || 511 0 == strcmp(in, "$" "Mdocdate$")) { 512 mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL); 513 time(&t); 514 } 515 else if (a2time(&t, "%Y-%m-%d", in)) 516 t = 0; 517 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 518 !a2time(&t, "%b %d, %Y", in)) { 519 mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in); 520 t = 0; 521 } 522 out = t ? time2a(t) : NULL; 523 return(out ? out : mandoc_strdup(in)); 524 } 525 526 int 527 mandoc_eos(const char *p, size_t sz) 528 { 529 const char *q; 530 int enclosed, found; 531 532 if (0 == sz) 533 return(0); 534 535 /* 536 * End-of-sentence recognition must include situations where 537 * some symbols, such as `)', allow prior EOS punctuation to 538 * propagate outward. 539 */ 540 541 enclosed = found = 0; 542 for (q = p + (int)sz - 1; q >= p; q--) { 543 switch (*q) { 544 case '\"': 545 /* FALLTHROUGH */ 546 case '\'': 547 /* FALLTHROUGH */ 548 case ']': 549 /* FALLTHROUGH */ 550 case ')': 551 if (0 == found) 552 enclosed = 1; 553 break; 554 case '.': 555 /* FALLTHROUGH */ 556 case '!': 557 /* FALLTHROUGH */ 558 case '?': 559 found = 1; 560 break; 561 default: 562 return(found && (!enclosed || isalnum((unsigned char)*q))); 563 } 564 } 565 566 return(found && !enclosed); 567 } 568 569 /* 570 * Convert a string to a long that may not be <0. 571 * If the string is invalid, or is less than 0, return -1. 572 */ 573 int 574 mandoc_strntoi(const char *p, size_t sz, int base) 575 { 576 char buf[32]; 577 char *ep; 578 long v; 579 580 if (sz > 31) 581 return(-1); 582 583 memcpy(buf, p, sz); 584 buf[(int)sz] = '\0'; 585 586 errno = 0; 587 v = strtol(buf, &ep, base); 588 589 if (buf[0] == '\0' || *ep != '\0') 590 return(-1); 591 592 if (v > INT_MAX) 593 v = INT_MAX; 594 if (v < INT_MIN) 595 v = INT_MIN; 596 597 return((int)v); 598 } 599