1 /* __gmp_doscan -- formatted input internals. 2 3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST 4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN 5 FUTURE GNU MP RELEASES. 6 7 Copyright 2001-2003 Free Software Foundation, Inc. 8 9 This file is part of the GNU MP Library. 10 11 The GNU MP Library is free software; you can redistribute it and/or modify 12 it under the terms of either: 13 14 * the GNU Lesser General Public License as published by the Free 15 Software Foundation; either version 3 of the License, or (at your 16 option) any later version. 17 18 or 19 20 * the GNU General Public License as published by the Free Software 21 Foundation; either version 2 of the License, or (at your option) any 22 later version. 23 24 or both in parallel, as here. 25 26 The GNU MP Library is distributed in the hope that it will be useful, but 27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 28 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 29 for more details. 30 31 You should have received copies of the GNU General Public License and the 32 GNU Lesser General Public License along with the GNU MP Library. If not, 33 see https://www.gnu.org/licenses/. 
*/ 34 35 #define _GNU_SOURCE /* for DECIMAL_POINT in langinfo.h */ 36 37 #include "config.h" /* needed for the HAVE_, could also move gmp incls */ 38 39 #include <stdarg.h> 40 #include <ctype.h> 41 #include <stddef.h> /* for ptrdiff_t */ 42 #include <stdio.h> 43 #include <stdlib.h> /* for strtol */ 44 #include <string.h> 45 46 #if HAVE_LANGINFO_H 47 #include <langinfo.h> /* for nl_langinfo */ 48 #endif 49 50 #if HAVE_LOCALE_H 51 #include <locale.h> /* for localeconv */ 52 #endif 53 54 #if HAVE_INTTYPES_H 55 # include <inttypes.h> /* for intmax_t */ 56 #else 57 # if HAVE_STDINT_H 58 # include <stdint.h> 59 # endif 60 #endif 61 62 #if HAVE_SYS_TYPES_H 63 #include <sys/types.h> /* for quad_t */ 64 #endif 65 66 #include "gmp-impl.h" 67 68 69 /* Change this to "#define TRACE(x) x" for some traces. */ 70 #define TRACE(x) 71 72 73 /* General: 74 75 It's necessary to parse up the format string to recognise the GMP 76 extra types F, Q and Z. Other types and conversions are passed 77 across to the standard sscanf or fscanf via funs->scan, for ease of 78 implementation. This is essential in the case of something like glibc 79 %p where the pointer format isn't actually documented. 80 81 Because funs->scan doesn't get the whole input it can't put the right 82 values in for %n, so that's handled in __gmp_doscan. Neither sscanf 83 nor fscanf directly indicate how many characters were read, so an 84 extra %n is appended to each run for that. For fscanf this merely 85 supports our %n output, but for sscanf it lets funs->step move us 86 along the input string. 87 88 Whitespace and literal matches in the format string, including %%, 89 are handled directly within __gmp_doscan. This is reasonably 90 efficient, and avoids some suspicious behaviour observed in various 91 system libc's. 
GLIBC 2.2.4 for instance returns 0 on

       sscanf(" ", " x")
   or
       sscanf(" ", " x%d",&n)

   whereas we think they should return EOF, since end-of-string is
   reached when a match of "x" is required.

   For standard % conversions, funs->scan is called once for each
   conversion.  If we had vfscanf and vsscanf and could rely on their
   fixed text matching behaviour then we could call them with multiple
   consecutive standard conversions.  But plain fscanf and sscanf work
   fine, and parsing one field at a time shouldn't be too much of a
   slowdown.

   gmpscan:

   gmpscan reads a gmp type.  It's only used from one place, but is a
   separate subroutine to avoid a big chunk of complicated code in the
   middle of __gmp_doscan.  Within gmpscan a couple of loopbacks make it
   possible to share code for parsing integers, rationals and floats.

   In gmpscan normally one char of lookahead is maintained, but when width
   is reached that stops, on the principle that an fgetc/ungetc of a char
   past where we're told to stop would be undesirable.  "chars" is how many
   characters have been read so far, including the current c.  When
   chars==width and another character is desired then a jump is done to the
   "convert" stage.  c is invalid and mustn't be unget'ed in this case;
   chars is set to width+1 to indicate that.

   gmpscan normally returns the number of characters read.  -1 means an
   invalid field, -2 means EOF reached before any matching characters
   were read.

   For hex floats, the mantissa part is passed to mpf_set_str, then the
   exponent is applied with mpf_mul_2exp or mpf_div_2exp.  This is easier
   than teaching mpf_set_str about an exponent factor (ie. 2) differing
   from the mantissa radix point factor (ie. 16).  mpf_mul_2exp and
   mpf_div_2exp will preserve the application requested precision, so
   nothing in that respect is lost by making this a two-step process.

   Matching and errors:

   C99 7.19.6.2 paras 9 and 10 say an input item is read as the longest
   string which is a match for the appropriate type, or a prefix of a
   match.  With that done, if it's only a prefix then the result is a
   matching failure, ie. invalid input.

   This rule seems fairly clear, but doesn't seem to be universally
   applied in system C libraries.  Even GLIBC doesn't seem to get it
   right, insofar as it seems to accept some apparently invalid forms.
   Eg. glibc 2.3.1 accepts "0x" for a "%i", where a reading of the
   standard would suggest a non-empty sequence of digits should be
   required after an "0x".

   A footnote to 7.19.6.2 para 17 notes how this input item reading can
   mean inputs acceptable to strtol are not acceptable to fscanf.  We
   think this confirms our reading of "0x" as invalid.

   Clearly gmp_sscanf could backtrack to a longest input which was a
   valid match for a given item, but this is not done, since C99 says
   sscanf is identical to fscanf, so we make gmp_sscanf identical to
   gmp_fscanf.

   Types:

   C99 says "ll" is for long long, and "L" is for long double floats.
   Unfortunately in GMP 4.1.1 we documented the two as equivalent.  This
   doesn't affect us directly, since both are passed through to plain
   scanf.  It seems wisest not to try to enforce the C99 rule.  This is
   consistent with what we said before, though whether it actually
   worked was always up to the C library.

   Alternatives:

   Consideration was given to using separate code for gmp_fscanf and
   gmp_sscanf.  The sscanf case could zip across a string doing literal
   matches or recognising digits in gmpscan, rather than making a
   function call funs->get per character.  The fscanf could use getc
   rather than fgetc too, which might help those systems where getc is a
   macro or otherwise inlined.
But none of this scanning and converting 173 will be particularly fast, so the two are done together to keep it a 174 little simpler for now. 175 176 Various multibyte string issues are not addressed, for a start C99 177 scanf says the format string is multibyte. Since we pass %c, %s and 178 %[ to the system scanf, they might do multibyte reads already, but 179 it's another matter whether or not that can be used, since our digit 180 and whitespace parsing is only unibyte. The plan is to quietly 181 ignore multibyte locales for now. This is not as bad as it sounds, 182 since GMP is presumably used mostly on numbers, which can be 183 perfectly adequately treated in plain ASCII. 184 185 */ 186 187 188 struct gmp_doscan_params_t { 189 int base; 190 int ignore; 191 char type; 192 int width; 193 }; 194 195 196 #define GET(c) \ 197 do { \ 198 ASSERT (chars <= width); \ 199 chars++; \ 200 if (chars > width) \ 201 goto convert; \ 202 (c) = (*funs->get) (data); \ 203 } while (0) 204 205 /* store into "s", extending if necessary */ 206 #define STORE(c) \ 207 do { \ 208 ASSERT (s_upto <= s_alloc); \ 209 if (s_upto >= s_alloc) \ 210 { \ 211 size_t s_alloc_new = s_alloc + S_ALLOC_STEP; \ 212 s = __GMP_REALLOCATE_FUNC_TYPE (s, s_alloc, s_alloc_new, char); \ 213 s_alloc = s_alloc_new; \ 214 } \ 215 s[s_upto++] = c; \ 216 } while (0) 217 218 #define S_ALLOC_STEP 512 219 220 static int 221 gmpscan (const struct gmp_doscan_funs_t *funs, void *data, 222 const struct gmp_doscan_params_t *p, void *dst) 223 { 224 int chars, c, base, first, width, seen_point, seen_digit, hexfloat; 225 size_t s_upto, s_alloc, hexexp; 226 char *s; 227 int invalid = 0; 228 229 TRACE (printf ("gmpscan\n")); 230 231 ASSERT (p->type == 'F' || p->type == 'Q' || p->type == 'Z'); 232 233 c = (*funs->get) (data); 234 if (c == EOF) 235 return -2; 236 237 chars = 1; 238 first = 1; 239 seen_point = 0; 240 width = (p->width == 0 ? 
INT_MAX-1 : p->width); 241 base = p->base; 242 s_alloc = S_ALLOC_STEP; 243 s = __GMP_ALLOCATE_FUNC_TYPE (s_alloc, char); 244 s_upto = 0; 245 hexfloat = 0; 246 hexexp = 0; 247 248 another: 249 seen_digit = 0; 250 if (c == '-') 251 { 252 STORE (c); 253 goto get_for_sign; 254 } 255 else if (c == '+') 256 { 257 /* don't store '+', it's not accepted by mpz_set_str etc */ 258 get_for_sign: 259 GET (c); 260 } 261 262 if (base == 0) 263 { 264 base = 10; /* decimal if no base indicator */ 265 if (c == '0') 266 { 267 seen_digit = 1; /* 0 alone is a valid number */ 268 if (p->type != 'F') 269 base = 8; /* leading 0 is octal, for non-floats */ 270 STORE (c); 271 GET (c); 272 if (c == 'x' || c == 'X') 273 { 274 base = 16; 275 seen_digit = 0; /* must have digits after an 0x */ 276 if (p->type == 'F') /* don't pass 'x' to mpf_set_str_point */ 277 hexfloat = 1; 278 else 279 STORE (c); 280 GET (c); 281 } 282 } 283 } 284 285 digits: 286 for (;;) 287 { 288 if (base == 16) 289 { 290 if (! isxdigit (c)) 291 break; 292 } 293 else 294 { 295 if (! isdigit (c)) 296 break; 297 if (base == 8 && (c == '8' || c == '9')) 298 break; 299 } 300 301 seen_digit = 1; 302 STORE (c); 303 GET (c); 304 } 305 306 if (first) 307 { 308 /* decimal point */ 309 if (p->type == 'F' && ! seen_point) 310 { 311 /* For a multi-character decimal point, if the first character is 312 present then all of it must be, otherwise the input is 313 considered invalid. */ 314 const char *point = GMP_DECIMAL_POINT; 315 int pc = (unsigned char) *point++; 316 if (c == pc) 317 { 318 for (;;) 319 { 320 STORE (c); 321 GET (c); 322 pc = (unsigned char) *point++; 323 if (pc == '\0') 324 break; 325 if (c != pc) 326 goto set_invalid; 327 } 328 seen_point = 1; 329 goto digits; 330 } 331 } 332 333 /* exponent */ 334 if (p->type == 'F') 335 { 336 if (hexfloat && (c == 'p' || c == 'P')) 337 { 338 hexexp = s_upto; /* exponent location */ 339 base = 10; /* exponent in decimal */ 340 goto exponent; 341 } 342 else if (! 
hexfloat && (c == 'e' || c == 'E')) 343 { 344 exponent: 345 /* must have at least one digit in the mantissa, just an exponent 346 is not good enough */ 347 if (! seen_digit) 348 goto set_invalid; 349 350 do_second: 351 first = 0; 352 STORE (c); 353 GET (c); 354 goto another; 355 } 356 } 357 358 /* denominator */ 359 if (p->type == 'Q' && c == '/') 360 { 361 /* must have at least one digit in the numerator */ 362 if (! seen_digit) 363 goto set_invalid; 364 365 /* now look for at least one digit in the denominator */ 366 seen_digit = 0; 367 368 /* allow the base to be redetermined for "%i" */ 369 base = p->base; 370 goto do_second; 371 } 372 } 373 374 convert: 375 if (! seen_digit) 376 { 377 set_invalid: 378 invalid = 1; 379 goto done; 380 } 381 382 if (! p->ignore) 383 { 384 STORE ('\0'); 385 TRACE (printf (" convert \"%s\"\n", s)); 386 387 /* We ought to have parsed out a valid string above, so just test 388 mpz_set_str etc with an ASSERT. */ 389 switch (p->type) { 390 case 'F': 391 { 392 mpf_ptr f = (mpf_ptr) dst; 393 if (hexexp != 0) 394 s[hexexp] = '\0'; 395 ASSERT_NOCARRY (mpf_set_str (f, s, hexfloat ? 
16 : 10)); 396 if (hexexp != 0) 397 { 398 char *dummy; 399 long exp; 400 exp = strtol (s + hexexp + 1, &dummy, 10); 401 if (exp >= 0) 402 mpf_mul_2exp (f, f, (unsigned long) exp); 403 else 404 mpf_div_2exp (f, f, NEG_CAST (unsigned long, exp)); 405 } 406 } 407 break; 408 case 'Q': 409 ASSERT_NOCARRY (mpq_set_str ((mpq_ptr) dst, s, p->base)); 410 break; 411 case 'Z': 412 ASSERT_NOCARRY (mpz_set_str ((mpz_ptr) dst, s, p->base)); 413 break; 414 default: 415 ASSERT (0); 416 /*FALLTHRU*/ 417 break; 418 } 419 } 420 421 done: 422 ASSERT (chars <= width+1); 423 if (chars != width+1) 424 { 425 (*funs->unget) (c, data); 426 TRACE (printf (" ungetc %d, to give %d chars\n", c, chars-1)); 427 } 428 chars--; 429 430 (*__gmp_free_func) (s, s_alloc); 431 432 if (invalid) 433 { 434 TRACE (printf (" invalid\n")); 435 return -1; 436 } 437 438 TRACE (printf (" return %d chars (cf width %d)\n", chars, width)); 439 return chars; 440 } 441 442 443 /* Read and discard whitespace, if any. Return number of chars skipped. 
444 Whitespace skipping never provokes the EOF return from __gmp_doscan, so 445 it's not necessary to watch for EOF from funs->get, */ 446 static int 447 skip_white (const struct gmp_doscan_funs_t *funs, void *data) 448 { 449 int c; 450 int ret = 0; 451 452 do 453 { 454 c = (funs->get) (data); 455 ret++; 456 } 457 while (isspace (c)); 458 459 (funs->unget) (c, data); 460 ret--; 461 462 TRACE (printf (" skip white %d\n", ret)); 463 return ret; 464 } 465 466 467 int 468 __gmp_doscan (const struct gmp_doscan_funs_t *funs, void *data, 469 const char *orig_fmt, va_list orig_ap) 470 { 471 struct gmp_doscan_params_t param; 472 va_list ap; 473 char *alloc_fmt; 474 const char *fmt, *this_fmt, *end_fmt; 475 size_t orig_fmt_len, alloc_fmt_size, len; 476 int new_fields, new_chars; 477 char fchar; 478 int fields = 0; 479 int chars = 0; 480 481 TRACE (printf ("__gmp_doscan \"%s\"\n", orig_fmt); 482 if (funs->scan == (gmp_doscan_scan_t) sscanf) 483 printf (" s=\"%s\"\n", * (const char **) data)); 484 485 /* Don't modify orig_ap, if va_list is actually an array and hence call by 486 reference. It could be argued that it'd be more efficient to leave 487 callers to make a copy if they care, but doing so here is going to be a 488 very small part of the total work, and we may as well keep applications 489 out of trouble. */ 490 va_copy (ap, orig_ap); 491 492 /* Parts of the format string are going to be copied so that a " %n" can 493 be appended. alloc_fmt is some space for that. orig_fmt_len+4 will be 494 needed if fmt consists of a single "%" specifier, but otherwise is an 495 overestimate. We're not going to be very fast here, so use 496 __gmp_allocate_func rather than TMP_ALLOC. 
*/ 497 orig_fmt_len = strlen (orig_fmt); 498 alloc_fmt_size = orig_fmt_len + 4; 499 alloc_fmt = __GMP_ALLOCATE_FUNC_TYPE (alloc_fmt_size, char); 500 501 fmt = orig_fmt; 502 end_fmt = orig_fmt + orig_fmt_len; 503 504 for (;;) 505 { 506 next: 507 fchar = *fmt++; 508 509 if (fchar == '\0') 510 break; 511 512 if (isspace (fchar)) 513 { 514 chars += skip_white (funs, data); 515 continue; 516 } 517 518 if (fchar != '%') 519 { 520 int c; 521 literal: 522 c = (funs->get) (data); 523 if (c != fchar) 524 { 525 (funs->unget) (c, data); 526 if (c == EOF) 527 { 528 eof_no_match: 529 if (fields == 0) 530 fields = EOF; 531 } 532 goto done; 533 } 534 chars++; 535 continue; 536 } 537 538 param.type = '\0'; 539 param.base = 0; /* for e,f,g,i */ 540 param.ignore = 0; 541 param.width = 0; 542 543 this_fmt = fmt-1; 544 TRACE (printf (" this_fmt \"%s\"\n", this_fmt)); 545 546 for (;;) 547 { 548 ASSERT (fmt <= end_fmt); 549 550 fchar = *fmt++; 551 switch (fchar) { 552 553 case '\0': /* unterminated % sequence */ 554 ASSERT (0); 555 goto done; 556 557 case '%': /* literal % */ 558 goto literal; 559 560 case '[': /* character range */ 561 fchar = *fmt++; 562 if (fchar == '^') 563 fchar = *fmt++; 564 /* ']' allowed as the first char (possibly after '^') */ 565 if (fchar == ']') 566 fchar = *fmt++; 567 for (;;) 568 { 569 ASSERT (fmt <= end_fmt); 570 if (fchar == '\0') 571 { 572 /* unterminated % sequence */ 573 ASSERT (0); 574 goto done; 575 } 576 if (fchar == ']') 577 break; 578 fchar = *fmt++; 579 } 580 /*FALLTHRU*/ 581 case 'c': /* characters */ 582 case 's': /* string of non-whitespace */ 583 case 'p': /* pointer */ 584 libc_type: 585 len = fmt - this_fmt; 586 memcpy (alloc_fmt, this_fmt, len); 587 alloc_fmt[len++] = '%'; 588 alloc_fmt[len++] = 'n'; 589 alloc_fmt[len] = '\0'; 590 591 TRACE (printf (" scan \"%s\"\n", alloc_fmt); 592 if (funs->scan == (gmp_doscan_scan_t) sscanf) 593 printf (" s=\"%s\"\n", * (const char **) data)); 594 595 new_chars = -1; 596 if (param.ignore) 597 { 598 
new_fields = (*funs->scan) (data, alloc_fmt, &new_chars, NULL); 599 ASSERT (new_fields == 0 || new_fields == EOF); 600 } 601 else 602 { 603 void *arg = va_arg (ap, void *); 604 new_fields = (*funs->scan) (data, alloc_fmt, arg, &new_chars); 605 ASSERT (new_fields==0 || new_fields==1 || new_fields==EOF); 606 607 if (new_fields == 0) 608 goto done; /* invalid input */ 609 610 if (new_fields == 1) 611 ASSERT (new_chars != -1); 612 } 613 TRACE (printf (" new_fields %d new_chars %d\n", 614 new_fields, new_chars)); 615 616 if (new_fields == -1) 617 goto eof_no_match; /* EOF before anything matched */ 618 619 /* Under param.ignore, when new_fields==0 we don't know if 620 it's a successful match or an invalid field. new_chars 621 won't have been assigned if it was an invalid field. */ 622 if (new_chars == -1) 623 goto done; /* invalid input */ 624 625 chars += new_chars; 626 (*funs->step) (data, new_chars); 627 628 increment_fields: 629 if (! param.ignore) 630 fields++; 631 goto next; 632 633 case 'd': /* decimal */ 634 case 'u': /* decimal */ 635 param.base = 10; 636 goto numeric; 637 638 case 'e': /* float */ 639 case 'E': /* float */ 640 case 'f': /* float */ 641 case 'g': /* float */ 642 case 'G': /* float */ 643 case 'i': /* integer with base marker */ 644 numeric: 645 if (param.type != 'F' && param.type != 'Q' && param.type != 'Z') 646 goto libc_type; 647 648 chars += skip_white (funs, data); 649 650 new_chars = gmpscan (funs, data, ¶m, 651 param.ignore ? 
NULL : va_arg (ap, void*)); 652 if (new_chars == -2) 653 goto eof_no_match; 654 if (new_chars == -1) 655 goto done; 656 657 ASSERT (new_chars >= 0); 658 chars += new_chars; 659 goto increment_fields; 660 661 case 'a': /* glibc allocate string */ 662 case '\'': /* glibc digit groupings */ 663 break; 664 665 case 'F': /* mpf_t */ 666 case 'j': /* intmax_t */ 667 case 'L': /* long long */ 668 case 'q': /* quad_t */ 669 case 'Q': /* mpq_t */ 670 case 't': /* ptrdiff_t */ 671 case 'z': /* size_t */ 672 case 'Z': /* mpz_t */ 673 set_type: 674 param.type = fchar; 675 break; 676 677 case 'h': /* short or char */ 678 if (param.type != 'h') 679 goto set_type; 680 param.type = 'H'; /* internal code for "hh" */ 681 break; 682 683 goto numeric; 684 685 case 'l': /* long, long long, double or long double */ 686 if (param.type != 'l') 687 goto set_type; 688 param.type = 'L'; /* "ll" means "L" */ 689 break; 690 691 case 'n': 692 if (! param.ignore) 693 { 694 void *p; 695 p = va_arg (ap, void *); 696 TRACE (printf (" store %%n to %p\n", p)); 697 switch (param.type) { 698 case '\0': * (int *) p = chars; break; 699 case 'F': mpf_set_si ((mpf_ptr) p, (long) chars); break; 700 case 'H': * (char *) p = chars; break; 701 case 'h': * (short *) p = chars; break; 702 #if HAVE_INTMAX_T 703 case 'j': * (intmax_t *) p = chars; break; 704 #else 705 case 'j': ASSERT_FAIL (intmax_t not available); break; 706 #endif 707 case 'l': * (long *) p = chars; break; 708 #if HAVE_QUAD_T && HAVE_LONG_LONG 709 case 'q': 710 ASSERT_ALWAYS (sizeof (quad_t) == sizeof (long long)); 711 /*FALLTHRU*/ 712 #else 713 case 'q': ASSERT_FAIL (quad_t not available); break; 714 #endif 715 #if HAVE_LONG_LONG 716 case 'L': * (long long *) p = chars; break; 717 #else 718 case 'L': ASSERT_FAIL (long long not available); break; 719 #endif 720 case 'Q': mpq_set_si ((mpq_ptr) p, (long) chars, 1L); break; 721 #if HAVE_PTRDIFF_T 722 case 't': * (ptrdiff_t *) p = chars; break; 723 #else 724 case 't': ASSERT_FAIL (ptrdiff_t not 
available); break; 725 #endif 726 case 'z': * (size_t *) p = chars; break; 727 case 'Z': mpz_set_si ((mpz_ptr) p, (long) chars); break; 728 default: ASSERT (0); break; 729 } 730 } 731 goto next; 732 733 case 'o': 734 param.base = 8; 735 goto numeric; 736 737 case 'x': 738 case 'X': 739 param.base = 16; 740 goto numeric; 741 742 case '0': case '1': case '2': case '3': case '4': 743 case '5': case '6': case '7': case '8': case '9': 744 param.width = 0; 745 do { 746 param.width = param.width * 10 + (fchar-'0'); 747 fchar = *fmt++; 748 } while (isdigit (fchar)); 749 fmt--; /* unget the non-digit */ 750 break; 751 752 case '*': 753 param.ignore = 1; 754 break; 755 756 default: 757 /* something invalid in a % sequence */ 758 ASSERT (0); 759 goto next; 760 } 761 } 762 } 763 764 done: 765 (*__gmp_free_func) (alloc_fmt, alloc_fmt_size); 766 return fields; 767 } 768