1 /* $Id: mandoc.c,v 1.48 2014/04/20 16:44:44 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <ctype.h> 22 #include <errno.h> 23 #include <limits.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 29 #include "mandoc.h" 30 #include "mandoc_aux.h" 31 #include "libmandoc.h" 32 33 #define DATESIZE 32 34 35 static int a2time(time_t *, const char *, const char *); 36 static char *time2a(time_t); 37 38 39 enum mandoc_esc 40 mandoc_escape(const char **end, const char **start, int *sz) 41 { 42 const char *local_start; 43 int local_sz; 44 char term; 45 enum mandoc_esc gly; 46 47 /* 48 * When the caller doesn't provide return storage, 49 * use local storage. 50 */ 51 52 if (NULL == start) 53 start = &local_start; 54 if (NULL == sz) 55 sz = &local_sz; 56 57 /* 58 * Beyond the backslash, at least one input character 59 * is part of the escape sequence. With one exception 60 * (see below), that character won't be returned. 61 */ 62 63 gly = ESCAPE_ERROR; 64 *start = ++*end; 65 *sz = 0; 66 term = '\0'; 67 68 switch ((*start)[-1]) { 69 /* 70 * First the glyphs. There are several different forms of 71 * these, but each eventually returns a substring of the glyph 72 * name. 73 */ 74 case '(': 75 gly = ESCAPE_SPECIAL; 76 *sz = 2; 77 break; 78 case '[': 79 gly = ESCAPE_SPECIAL; 80 /* 81 * Unicode escapes are defined in groff as \[uXXXX] to 82 * \[u10FFFF], where the contained value must be a valid 83 * Unicode codepoint. Here, however, only check whether 84 * it's not a zero-width escape. 85 */ 86 if ('u' == (*start)[0] && ']' != (*start)[1]) 87 gly = ESCAPE_UNICODE; 88 term = ']'; 89 break; 90 case 'C': 91 if ('\'' != **start) 92 return(ESCAPE_ERROR); 93 *start = ++*end; 94 if ('u' == (*start)[0] && '\'' != (*start)[1]) 95 gly = ESCAPE_UNICODE; 96 else 97 gly = ESCAPE_SPECIAL; 98 term = '\''; 99 break; 100 101 /* 102 * Escapes taking no arguments at all. 103 */ 104 case 'd': 105 /* FALLTHROUGH */ 106 case 'u': 107 return(ESCAPE_IGNORE); 108 109 /* 110 * The \z escape is supposed to output the following 111 * character without advancing the cursor position. 112 * Since we are mostly dealing with terminal mode, 113 * let us just skip the next character. 114 */ 115 case 'z': 116 return(ESCAPE_SKIPCHAR); 117 118 /* 119 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 120 * 'X' is the trigger. These have opaque sub-strings. 121 */ 122 case 'F': 123 /* FALLTHROUGH */ 124 case 'g': 125 /* FALLTHROUGH */ 126 case 'k': 127 /* FALLTHROUGH */ 128 case 'M': 129 /* FALLTHROUGH */ 130 case 'm': 131 /* FALLTHROUGH */ 132 case 'n': 133 /* FALLTHROUGH */ 134 case 'V': 135 /* FALLTHROUGH */ 136 case 'Y': 137 gly = ESCAPE_IGNORE; 138 /* FALLTHROUGH */ 139 case 'f': 140 if (ESCAPE_ERROR == gly) 141 gly = ESCAPE_FONT; 142 switch (**start) { 143 case '(': 144 *start = ++*end; 145 *sz = 2; 146 break; 147 case '[': 148 *start = ++*end; 149 term = ']'; 150 break; 151 default: 152 *sz = 1; 153 break; 154 } 155 break; 156 157 /* 158 * These escapes are of the form \X'Y', where 'X' is the trigger 159 * and 'Y' is any string. These have opaque sub-strings. 160 * The \B and \w escapes are handled in roff.c, roff_res(). 161 */ 162 case 'A': 163 /* FALLTHROUGH */ 164 case 'b': 165 /* FALLTHROUGH */ 166 case 'D': 167 /* FALLTHROUGH */ 168 case 'o': 169 /* FALLTHROUGH */ 170 case 'R': 171 /* FALLTHROUGH */ 172 case 'X': 173 /* FALLTHROUGH */ 174 case 'Z': 175 if ('\0' == **start) 176 return(ESCAPE_ERROR); 177 gly = ESCAPE_IGNORE; 178 term = **start; 179 *start = ++*end; 180 break; 181 182 /* 183 * These escapes are of the form \X'N', where 'X' is the trigger 184 * and 'N' resolves to a numerical expression. 185 */ 186 case 'h': 187 /* FALLTHROUGH */ 188 case 'H': 189 /* FALLTHROUGH */ 190 case 'L': 191 /* FALLTHROUGH */ 192 case 'l': 193 /* FALLTHROUGH */ 194 case 'S': 195 /* FALLTHROUGH */ 196 case 'v': 197 /* FALLTHROUGH */ 198 case 'x': 199 if (strchr("\0 %&()*+-./0123456789:<=>", **start)) 200 return(ESCAPE_ERROR); 201 gly = ESCAPE_IGNORE; 202 term = **start; 203 *start = ++*end; 204 break; 205 206 /* 207 * Special handling for the numbered character escape. 208 * XXX Do any other escapes need similar handling? 209 */ 210 case 'N': 211 if ('\0' == **start) 212 return(ESCAPE_ERROR); 213 (*end)++; 214 if (isdigit((unsigned char)**start)) { 215 *sz = 1; 216 return(ESCAPE_IGNORE); 217 } 218 (*start)++; 219 while (isdigit((unsigned char)**end)) 220 (*end)++; 221 *sz = *end - *start; 222 if ('\0' != **end) 223 (*end)++; 224 return(ESCAPE_NUMBERED); 225 226 /* 227 * Sizes get a special category of their own. 228 */ 229 case 's': 230 gly = ESCAPE_IGNORE; 231 232 /* See +/- counts as a sign. */ 233 if ('+' == **end || '-' == **end || ASCII_HYPH == **end) 234 (*end)++; 235 236 switch (**end) { 237 case '(': 238 *start = ++*end; 239 *sz = 2; 240 break; 241 case '[': 242 *start = ++*end; 243 term = ']'; 244 break; 245 case '\'': 246 *start = ++*end; 247 term = '\''; 248 break; 249 default: 250 *sz = 1; 251 break; 252 } 253 254 break; 255 256 /* 257 * Anything else is assumed to be a glyph. 258 * In this case, pass back the character after the backslash. 259 */ 260 default: 261 gly = ESCAPE_SPECIAL; 262 *start = --*end; 263 *sz = 1; 264 break; 265 } 266 267 assert(ESCAPE_ERROR != gly); 268 269 /* 270 * Read up to the terminating character, 271 * paying attention to nested escapes. 272 */ 273 274 if ('\0' != term) { 275 while (**end != term) { 276 switch (**end) { 277 case '\0': 278 return(ESCAPE_ERROR); 279 case '\\': 280 (*end)++; 281 if (ESCAPE_ERROR == 282 mandoc_escape(end, NULL, NULL)) 283 return(ESCAPE_ERROR); 284 break; 285 default: 286 (*end)++; 287 break; 288 } 289 } 290 *sz = (*end)++ - *start; 291 } else { 292 assert(*sz > 0); 293 if ((size_t)*sz > strlen(*start)) 294 return(ESCAPE_ERROR); 295 *end += *sz; 296 } 297 298 /* Run post-processors. */ 299 300 switch (gly) { 301 case ESCAPE_FONT: 302 if (2 == *sz) { 303 if ('C' == **start) { 304 /* 305 * Treat constant-width font modes 306 * just like regular font modes. 307 */ 308 (*start)++; 309 (*sz)--; 310 } else { 311 if ('B' == (*start)[0] && 'I' == (*start)[1]) 312 gly = ESCAPE_FONTBI; 313 break; 314 } 315 } else if (1 != *sz) 316 break; 317 318 switch (**start) { 319 case '3': 320 /* FALLTHROUGH */ 321 case 'B': 322 gly = ESCAPE_FONTBOLD; 323 break; 324 case '2': 325 /* FALLTHROUGH */ 326 case 'I': 327 gly = ESCAPE_FONTITALIC; 328 break; 329 case 'P': 330 gly = ESCAPE_FONTPREV; 331 break; 332 case '1': 333 /* FALLTHROUGH */ 334 case 'R': 335 gly = ESCAPE_FONTROMAN; 336 break; 337 } 338 break; 339 case ESCAPE_SPECIAL: 340 if (1 == *sz && 'c' == **start) 341 gly = ESCAPE_NOSPACE; 342 break; 343 default: 344 break; 345 } 346 347 return(gly); 348 } 349 350 /* 351 * Parse a quoted or unquoted roff-style request or macro argument. 352 * Return a pointer to the parsed argument, which is either the original 353 * pointer or advanced by one byte in case the argument is quoted. 354 * NUL-terminate the argument in place. 355 * Collapse pairs of quotes inside quoted arguments. 356 * Advance the argument pointer to the next argument, 357 * or to the NUL byte terminating the argument line. 358 */ 359 char * 360 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 361 { 362 char *start, *cp; 363 int quoted, pairs, white; 364 365 /* Quoting can only start with a new word. */ 366 start = *cpp; 367 quoted = 0; 368 if ('"' == *start) { 369 quoted = 1; 370 start++; 371 } 372 373 pairs = 0; 374 white = 0; 375 for (cp = start; '\0' != *cp; cp++) { 376 377 /* 378 * Move the following text left 379 * after quoted quotes and after "\\" and "\t". 380 */ 381 if (pairs) 382 cp[-pairs] = cp[0]; 383 384 if ('\\' == cp[0]) { 385 /* 386 * In copy mode, translate double to single 387 * backslashes and backslash-t to literal tabs. 388 */ 389 switch (cp[1]) { 390 case 't': 391 cp[0] = '\t'; 392 /* FALLTHROUGH */ 393 case '\\': 394 pairs++; 395 cp++; 396 break; 397 case ' ': 398 /* Skip escaped blanks. */ 399 if (0 == quoted) 400 cp++; 401 break; 402 default: 403 break; 404 } 405 } else if (0 == quoted) { 406 if (' ' == cp[0]) { 407 /* Unescaped blanks end unquoted args. */ 408 white = 1; 409 break; 410 } 411 } else if ('"' == cp[0]) { 412 if ('"' == cp[1]) { 413 /* Quoted quotes collapse. */ 414 pairs++; 415 cp++; 416 } else { 417 /* Unquoted quotes end quoted args. */ 418 quoted = 2; 419 break; 420 } 421 } 422 } 423 424 /* Quoted argument without a closing quote. */ 425 if (1 == quoted) 426 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); 427 428 /* NUL-terminate this argument and move to the next one. */ 429 if (pairs) 430 cp[-pairs] = '\0'; 431 if ('\0' != *cp) { 432 *cp++ = '\0'; 433 while (' ' == *cp) 434 cp++; 435 } 436 *pos += (int)(cp - start) + (quoted ? 1 : 0); 437 *cpp = cp; 438 439 if ('\0' == *cp && (white || ' ' == cp[-1])) 440 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL); 441 442 return(start); 443 } 444 445 static int 446 a2time(time_t *t, const char *fmt, const char *p) 447 { 448 struct tm tm; 449 char *pp; 450 451 memset(&tm, 0, sizeof(struct tm)); 452 453 pp = strptime(p, fmt, &tm); 454 if (NULL != pp && '\0' == *pp) { 455 *t = mktime(&tm); 456 return(1); 457 } 458 459 return(0); 460 } 461 462 static char * 463 time2a(time_t t) 464 { 465 struct tm *tm; 466 char *buf, *p; 467 size_t ssz; 468 int isz; 469 470 tm = localtime(&t); 471 472 /* 473 * Reserve space: 474 * up to 9 characters for the month (September) + blank 475 * up to 2 characters for the day + comma + blank 476 * 4 characters for the year and a terminating '\0' 477 */ 478 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 479 480 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 481 goto fail; 482 p += (int)ssz; 483 484 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 485 goto fail; 486 p += isz; 487 488 if (0 == strftime(p, 4 + 1, "%Y", tm)) 489 goto fail; 490 return(buf); 491 492 fail: 493 free(buf); 494 return(NULL); 495 } 496 497 char * 498 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 499 { 500 char *out; 501 time_t t; 502 503 if (NULL == in || '\0' == *in || 504 0 == strcmp(in, "$" "Mdocdate$")) { 505 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL); 506 time(&t); 507 } 508 else if (a2time(&t, "%Y-%m-%d", in)) 509 t = 0; 510 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 511 !a2time(&t, "%b %d, %Y", in)) { 512 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL); 513 t = 0; 514 } 515 out = t ? time2a(t) : NULL; 516 return(out ? out : mandoc_strdup(in)); 517 } 518 519 int 520 mandoc_eos(const char *p, size_t sz) 521 { 522 const char *q; 523 int enclosed, found; 524 525 if (0 == sz) 526 return(0); 527 528 /* 529 * End-of-sentence recognition must include situations where 530 * some symbols, such as `)', allow prior EOS punctuation to 531 * propagate outward. 532 */ 533 534 enclosed = found = 0; 535 for (q = p + (int)sz - 1; q >= p; q--) { 536 switch (*q) { 537 case '\"': 538 /* FALLTHROUGH */ 539 case '\'': 540 /* FALLTHROUGH */ 541 case ']': 542 /* FALLTHROUGH */ 543 case ')': 544 if (0 == found) 545 enclosed = 1; 546 break; 547 case '.': 548 /* FALLTHROUGH */ 549 case '!': 550 /* FALLTHROUGH */ 551 case '?': 552 found = 1; 553 break; 554 default: 555 return(found && (!enclosed || isalnum((unsigned char)*q))); 556 } 557 } 558 559 return(found && !enclosed); 560 } 561 562 /* 563 * Convert a string to a long that may not be <0. 564 * If the string is invalid, or is less than 0, return -1. 565 */ 566 int 567 mandoc_strntoi(const char *p, size_t sz, int base) 568 { 569 char buf[32]; 570 char *ep; 571 long v; 572 573 if (sz > 31) 574 return(-1); 575 576 memcpy(buf, p, sz); 577 buf[(int)sz] = '\0'; 578 579 errno = 0; 580 v = strtol(buf, &ep, base); 581 582 if (buf[0] == '\0' || *ep != '\0') 583 return(-1); 584 585 if (v > INT_MAX) 586 v = INT_MAX; 587 if (v < INT_MIN) 588 v = INT_MIN; 589 590 return((int)v); 591 } 592