1 /* $Id: mandoc.c,v 1.45 2014/03/21 22:17:01 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc.h" 30 #include "mandoc_aux.h" 31 #include "libmandoc.h" 32 33 #define DATESIZE 32 34 35 static int a2time(time_t *, const char *, const char *); 36 static char *time2a(time_t); 37 38 39 enum mandoc_esc 40 mandoc_escape(const char **end, const char **start, int *sz) 41 { 42 const char *local_start; 43 int local_sz; 44 char term; 45 enum mandoc_esc gly; 46 47 /* 48 * When the caller doesn't provide return storage, 49 * use local storage. 50 */ 51 52 if (NULL == start) 53 start = &local_start; 54 if (NULL == sz) 55 sz = &local_sz; 56 57 /* 58 * Beyond the backslash, at least one input character 59 * is part of the escape sequence. With one exception 60 * (see below), that character won't be returned. 61 */ 62 63 gly = ESCAPE_ERROR; 64 *start = ++*end; 65 *sz = 0; 66 term = '\0'; 67 68 switch ((*start)[-1]) { 69 /* 70 * First the glyphs. There are several different forms of 71 * these, but each eventually returns a substring of the glyph 72 * name. 73 */ 74 case ('('): 75 gly = ESCAPE_SPECIAL; 76 *sz = 2; 77 break; 78 case ('['): 79 gly = ESCAPE_SPECIAL; 80 /* 81 * Unicode escapes are defined in groff as \[uXXXX] to 82 * \[u10FFFF], where the contained value must be a valid 83 * Unicode codepoint. Here, however, only check whether 84 * it's not a zero-width escape. 85 */ 86 if ('u' == (*start)[0] && ']' != (*start)[1]) 87 gly = ESCAPE_UNICODE; 88 term = ']'; 89 break; 90 case ('C'): 91 if ('\'' != **start) 92 return(ESCAPE_ERROR); 93 *start = ++*end; 94 if ('u' == (*start)[0] && '\'' != (*start)[1]) 95 gly = ESCAPE_UNICODE; 96 else 97 gly = ESCAPE_SPECIAL; 98 term = '\''; 99 break; 100 101 /* 102 * Escapes taking no arguments at all. 103 */ 104 case ('d'): 105 /* FALLTHROUGH */ 106 case ('u'): 107 return(ESCAPE_IGNORE); 108 109 /* 110 * The \z escape is supposed to output the following 111 * character without advancing the cursor position. 112 * Since we are mostly dealing with terminal mode, 113 * let us just skip the next character. 114 */ 115 case ('z'): 116 return(ESCAPE_SKIPCHAR); 117 118 /* 119 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 120 * 'X' is the trigger. These have opaque sub-strings. 121 */ 122 case ('F'): 123 /* FALLTHROUGH */ 124 case ('g'): 125 /* FALLTHROUGH */ 126 case ('k'): 127 /* FALLTHROUGH */ 128 case ('M'): 129 /* FALLTHROUGH */ 130 case ('m'): 131 /* FALLTHROUGH */ 132 case ('n'): 133 /* FALLTHROUGH */ 134 case ('V'): 135 /* FALLTHROUGH */ 136 case ('Y'): 137 gly = ESCAPE_IGNORE; 138 /* FALLTHROUGH */ 139 case ('f'): 140 if (ESCAPE_ERROR == gly) 141 gly = ESCAPE_FONT; 142 switch (**start) { 143 case ('('): 144 *start = ++*end; 145 *sz = 2; 146 break; 147 case ('['): 148 *start = ++*end; 149 term = ']'; 150 break; 151 default: 152 *sz = 1; 153 break; 154 } 155 break; 156 157 /* 158 * These escapes are of the form \X'Y', where 'X' is the trigger 159 * and 'Y' is any string. These have opaque sub-strings. 160 */ 161 case ('A'): 162 /* FALLTHROUGH */ 163 case ('b'): 164 /* FALLTHROUGH */ 165 case ('B'): 166 /* FALLTHROUGH */ 167 case ('D'): 168 /* FALLTHROUGH */ 169 case ('o'): 170 /* FALLTHROUGH */ 171 case ('R'): 172 /* FALLTHROUGH */ 173 case ('w'): 174 /* FALLTHROUGH */ 175 case ('X'): 176 /* FALLTHROUGH */ 177 case ('Z'): 178 if ('\'' != **start) 179 return(ESCAPE_ERROR); 180 gly = ESCAPE_IGNORE; 181 *start = ++*end; 182 term = '\''; 183 break; 184 185 /* 186 * These escapes are of the form \X'N', where 'X' is the trigger 187 * and 'N' resolves to a numerical expression. 188 */ 189 case ('h'): 190 /* FALLTHROUGH */ 191 case ('H'): 192 /* FALLTHROUGH */ 193 case ('L'): 194 /* FALLTHROUGH */ 195 case ('l'): 196 /* FALLTHROUGH */ 197 case ('S'): 198 /* FALLTHROUGH */ 199 case ('v'): 200 /* FALLTHROUGH */ 201 case ('x'): 202 if ('\'' != **start) 203 return(ESCAPE_ERROR); 204 gly = ESCAPE_IGNORE; 205 *start = ++*end; 206 term = '\''; 207 break; 208 209 /* 210 * Special handling for the numbered character escape. 211 * XXX Do any other escapes need similar handling? 212 */ 213 case ('N'): 214 if ('\0' == **start) 215 return(ESCAPE_ERROR); 216 (*end)++; 217 if (isdigit((unsigned char)**start)) { 218 *sz = 1; 219 return(ESCAPE_IGNORE); 220 } 221 (*start)++; 222 while (isdigit((unsigned char)**end)) 223 (*end)++; 224 *sz = *end - *start; 225 if ('\0' != **end) 226 (*end)++; 227 return(ESCAPE_NUMBERED); 228 229 /* 230 * Sizes get a special category of their own. 231 */ 232 case ('s'): 233 gly = ESCAPE_IGNORE; 234 235 /* See +/- counts as a sign. */ 236 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 237 (*end)++; 238 239 switch (**end) { 240 case ('('): 241 *start = ++*end; 242 *sz = 2; 243 break; 244 case ('['): 245 *start = ++*end; 246 term = ']'; 247 break; 248 case ('\''): 249 *start = ++*end; 250 term = '\''; 251 break; 252 default: 253 *sz = 1; 254 break; 255 } 256 257 break; 258 259 /* 260 * Anything else is assumed to be a glyph. 261 * In this case, pass back the character after the backslash. 262 */ 263 default: 264 gly = ESCAPE_SPECIAL; 265 *start = --*end; 266 *sz = 1; 267 break; 268 } 269 270 assert(ESCAPE_ERROR != gly); 271 272 /* 273 * Read up to the terminating character, 274 * paying attention to nested escapes. 275 */ 276 277 if ('\0' != term) { 278 while (**end != term) { 279 switch (**end) { 280 case ('\0'): 281 return(ESCAPE_ERROR); 282 case ('\\'): 283 (*end)++; 284 if (ESCAPE_ERROR == 285 mandoc_escape(end, NULL, NULL)) 286 return(ESCAPE_ERROR); 287 break; 288 default: 289 (*end)++; 290 break; 291 } 292 } 293 *sz = (*end)++ - *start; 294 } else { 295 assert(*sz > 0); 296 if ((size_t)*sz > strlen(*start)) 297 return(ESCAPE_ERROR); 298 *end += *sz; 299 } 300 301 /* Run post-processors. */ 302 303 switch (gly) { 304 case (ESCAPE_FONT): 305 if (2 == *sz) { 306 if ('C' == **start) { 307 /* 308 * Treat constant-width font modes 309 * just like regular font modes. 310 */ 311 (*start)++; 312 (*sz)--; 313 } else { 314 if ('B' == (*start)[0] && 'I' == (*start)[1]) 315 gly = ESCAPE_FONTBI; 316 break; 317 } 318 } else if (1 != *sz) 319 break; 320 321 switch (**start) { 322 case ('3'): 323 /* FALLTHROUGH */ 324 case ('B'): 325 gly = ESCAPE_FONTBOLD; 326 break; 327 case ('2'): 328 /* FALLTHROUGH */ 329 case ('I'): 330 gly = ESCAPE_FONTITALIC; 331 break; 332 case ('P'): 333 gly = ESCAPE_FONTPREV; 334 break; 335 case ('1'): 336 /* FALLTHROUGH */ 337 case ('R'): 338 gly = ESCAPE_FONTROMAN; 339 break; 340 } 341 break; 342 case (ESCAPE_SPECIAL): 343 if (1 == *sz && 'c' == **start) 344 gly = ESCAPE_NOSPACE; 345 break; 346 default: 347 break; 348 } 349 350 return(gly); 351 } 352 353 /* 354 * Parse a quoted or unquoted roff-style request or macro argument. 355 * Return a pointer to the parsed argument, which is either the original 356 * pointer or advanced by one byte in case the argument is quoted. 357 * NUL-terminate the argument in place. 358 * Collapse pairs of quotes inside quoted arguments. 359 * Advance the argument pointer to the next argument, 360 * or to the NUL byte terminating the argument line. 361 */ 362 char * 363 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 364 { 365 char *start, *cp; 366 int quoted, pairs, white; 367 368 /* Quoting can only start with a new word. */ 369 start = *cpp; 370 quoted = 0; 371 if ('"' == *start) { 372 quoted = 1; 373 start++; 374 } 375 376 pairs = 0; 377 white = 0; 378 for (cp = start; '\0' != *cp; cp++) { 379 380 /* 381 * Move the following text left 382 * after quoted quotes and after "\\" and "\t". 383 */ 384 if (pairs) 385 cp[-pairs] = cp[0]; 386 387 if ('\\' == cp[0]) { 388 /* 389 * In copy mode, translate double to single 390 * backslashes and backslash-t to literal tabs. 391 */ 392 switch (cp[1]) { 393 case ('t'): 394 cp[0] = '\t'; 395 /* FALLTHROUGH */ 396 case ('\\'): 397 pairs++; 398 cp++; 399 break; 400 case (' '): 401 /* Skip escaped blanks. */ 402 if (0 == quoted) 403 cp++; 404 break; 405 default: 406 break; 407 } 408 } else if (0 == quoted) { 409 if (' ' == cp[0]) { 410 /* Unescaped blanks end unquoted args. */ 411 white = 1; 412 break; 413 } 414 } else if ('"' == cp[0]) { 415 if ('"' == cp[1]) { 416 /* Quoted quotes collapse. */ 417 pairs++; 418 cp++; 419 } else { 420 /* Unquoted quotes end quoted args. */ 421 quoted = 2; 422 break; 423 } 424 } 425 } 426 427 /* Quoted argument without a closing quote. */ 428 if (1 == quoted) 429 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); 430 431 /* NUL-terminate this argument and move to the next one. */ 432 if (pairs) 433 cp[-pairs] = '\0'; 434 if ('\0' != *cp) { 435 *cp++ = '\0'; 436 while (' ' == *cp) 437 cp++; 438 } 439 *pos += (int)(cp - start) + (quoted ? 1 : 0); 440 *cpp = cp; 441 442 if ('\0' == *cp && (white || ' ' == cp[-1])) 443 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL); 444 445 return(start); 446 } 447 448 static int 449 a2time(time_t *t, const char *fmt, const char *p) 450 { 451 struct tm tm; 452 char *pp; 453 454 memset(&tm, 0, sizeof(struct tm)); 455 456 pp = strptime(p, fmt, &tm); 457 if (NULL != pp && '\0' == *pp) { 458 *t = mktime(&tm); 459 return(1); 460 } 461 462 return(0); 463 } 464 465 static char * 466 time2a(time_t t) 467 { 468 struct tm *tm; 469 char *buf, *p; 470 size_t ssz; 471 int isz; 472 473 tm = localtime(&t); 474 475 /* 476 * Reserve space: 477 * up to 9 characters for the month (September) + blank 478 * up to 2 characters for the day + comma + blank 479 * 4 characters for the year and a terminating '\0' 480 */ 481 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 482 483 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 484 goto fail; 485 p += (int)ssz; 486 487 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 488 goto fail; 489 p += isz; 490 491 if (0 == strftime(p, 4 + 1, "%Y", tm)) 492 goto fail; 493 return(buf); 494 495 fail: 496 free(buf); 497 return(NULL); 498 } 499 500 char * 501 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 502 { 503 char *out; 504 time_t t; 505 506 if (NULL == in || '\0' == *in || 507 0 == strcmp(in, "$" "Mdocdate$")) { 508 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL); 509 time(&t); 510 } 511 else if (a2time(&t, "%Y-%m-%d", in)) 512 t = 0; 513 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 514 !a2time(&t, "%b %d, %Y", in)) { 515 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL); 516 t = 0; 517 } 518 out = t ? time2a(t) : NULL; 519 return(out ? out : mandoc_strdup(in)); 520 } 521 522 int 523 mandoc_eos(const char *p, size_t sz) 524 { 525 const char *q; 526 int enclosed, found; 527 528 if (0 == sz) 529 return(0); 530 531 /* 532 * End-of-sentence recognition must include situations where 533 * some symbols, such as `)', allow prior EOS punctuation to 534 * propagate outward. 535 */ 536 537 enclosed = found = 0; 538 for (q = p + (int)sz - 1; q >= p; q--) { 539 switch (*q) { 540 case ('\"'): 541 /* FALLTHROUGH */ 542 case ('\''): 543 /* FALLTHROUGH */ 544 case (']'): 545 /* FALLTHROUGH */ 546 case (')'): 547 if (0 == found) 548 enclosed = 1; 549 break; 550 case ('.'): 551 /* FALLTHROUGH */ 552 case ('!'): 553 /* FALLTHROUGH */ 554 case ('?'): 555 found = 1; 556 break; 557 default: 558 return(found && (!enclosed || isalnum((unsigned char)*q))); 559 } 560 } 561 562 return(found && !enclosed); 563 } 564 565 /* 566 * Convert a string to a long that may not be <0. 567 * If the string is invalid, or is less than 0, return -1. 568 */ 569 int 570 mandoc_strntoi(const char *p, size_t sz, int base) 571 { 572 char buf[32]; 573 char *ep; 574 long v; 575 576 if (sz > 31) 577 return(-1); 578 579 memcpy(buf, p, sz); 580 buf[(int)sz] = '\0'; 581 582 errno = 0; 583 v = strtol(buf, &ep, base); 584 585 if (buf[0] == '\0' || *ep != '\0') 586 return(-1); 587 588 if (v > INT_MAX) 589 v = INT_MAX; 590 if (v < INT_MIN) 591 v = INT_MIN; 592 593 return((int)v); 594 } 595