1 // Copyright 2018 Ulf Adams 2 // 3 // The contents of this file may be used under the terms of the Apache License, 4 // Version 2.0. 5 // 6 // (See accompanying file LICENSE-Apache or copy at 7 // http://www.apache.org/licenses/LICENSE-2.0) 8 // 9 // Alternatively, the contents of this file may be used under the terms of 10 // the Boost Software License, Version 1.0. 11 // (See accompanying file LICENSE-Boost or copy at 12 // https://www.boost.org/LICENSE_1_0.txt) 13 // 14 // Unless required by applicable law or agreed to in writing, this software 15 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 // KIND, either express or implied. 17 18 // Runtime compiler options: 19 // -DRYU_DEBUG Generate verbose debugging output to stdout. 20 // 21 // -DRYU_ONLY_64_BIT_OPS Avoid using uint128_t or 64-bit intrinsics. Slower, 22 // depending on your compiler. 23 // 24 // -DRYU_AVOID_UINT128 Avoid using uint128_t. Slower, depending on your compiler. 25 26 27 28 #ifdef RYU_DEBUG 29 #endif 30 31 32 #define DOUBLE_MANTISSA_BITS 52 33 #define DOUBLE_EXPONENT_BITS 11 34 #define DOUBLE_BIAS 1023 35 36 #define POW10_ADDITIONAL_BITS 120 37 38 #if defined(HAS_UINT128) 39 static inline uint128_t umul256(const uint128_t a, const uint64_t bHi, const uint64_t bLo, uint128_t* const productHi) { 40 const uint64_t aLo = (uint64_t)a; 41 const uint64_t aHi = (uint64_t)(a >> 64); 42 43 const uint128_t b00 = (uint128_t)aLo * bLo; 44 const uint128_t b01 = (uint128_t)aLo * bHi; 45 const uint128_t b10 = (uint128_t)aHi * bLo; 46 const uint128_t b11 = (uint128_t)aHi * bHi; 47 48 const uint64_t b00Lo = (uint64_t)b00; 49 const uint64_t b00Hi = (uint64_t)(b00 >> 64); 50 51 const uint128_t mid1 = b10 + b00Hi; 52 const uint64_t mid1Lo = (uint64_t)(mid1); 53 const uint64_t mid1Hi = (uint64_t)(mid1 >> 64); 54 55 const uint128_t mid2 = b01 + mid1Lo; 56 const uint64_t mid2Lo = (uint64_t)(mid2); 57 const uint64_t mid2Hi = (uint64_t)(mid2 >> 64); 58 59 const uint128_t pHi = b11 + mid1Hi + mid2Hi; 60 const uint128_t pLo = ((uint128_t)mid2Lo << 64) | b00Lo; 61 62 *productHi = pHi; 63 return pLo; 64 } 65 66 // Returns the high 128 bits of the 256-bit product of a and b. 67 static inline uint128_t umul256_hi(const uint128_t a, const uint64_t bHi, const uint64_t bLo) { 68 // Reuse the umul256 implementation. 69 // Optimizers will likely eliminate the instructions used to compute the 70 // low part of the product. 71 uint128_t hi; 72 umul256(a, bHi, bLo, &hi); 73 return hi; 74 } 75 76 // Unfortunately, gcc/clang do not automatically turn a 128-bit integer division 77 // into a multiplication, so we have to do it manually. 78 static inline uint32_t uint128_mod1e9(const uint128_t v) { 79 // After multiplying, we're going to shift right by 29, then truncate to uint32_t. 80 // This means that we need only 29 + 32 = 61 bits, so we can truncate to uint64_t before shifting. 81 const uint64_t multiplied = (uint64_t) umul256_hi(v, 0x89705F4136B4A597u, 0x31680A88F8953031u); 82 83 // For uint32_t truncation, see the mod1e9() comment in d2s_intrinsics.h. 84 const uint32_t shifted = (uint32_t) (multiplied >> 29); 85 86 return ((uint32_t) v) - 1000000000 * shifted; 87 } 88 89 // Best case: use 128-bit type. 90 static inline uint32_t mulShift_mod1e9(const uint64_t m, const uint64_t* const mul, const int32_t j) { 91 const uint128_t b0 = ((uint128_t) m) * mul[0]; // 0 92 const uint128_t b1 = ((uint128_t) m) * mul[1]; // 64 93 const uint128_t b2 = ((uint128_t) m) * mul[2]; // 128 94 #ifdef RYU_DEBUG 95 if (j < 128 || j > 180) { 96 printf("%d\n", j); 97 } 98 #endif 99 assert(j >= 128); 100 assert(j <= 180); 101 // j: [128, 256) 102 const uint128_t mid = b1 + (uint64_t) (b0 >> 64); // 64 103 const uint128_t s1 = b2 + (uint64_t) (mid >> 64); // 128 104 return uint128_mod1e9(s1 >> (j - 128)); 105 } 106 107 #else // HAS_UINT128 108 109 #if defined(HAS_64_BIT_INTRINSICS) 110 // Returns the low 64 bits of the high 128 bits of the 256-bit product of a and b. 111 static inline uint64_t umul256_hi128_lo64( 112 const uint64_t aHi, const uint64_t aLo, const uint64_t bHi, const uint64_t bLo) { 113 uint64_t b00Hi; 114 const uint64_t b00Lo = umul128(aLo, bLo, &b00Hi); 115 uint64_t b01Hi; 116 const uint64_t b01Lo = umul128(aLo, bHi, &b01Hi); 117 uint64_t b10Hi; 118 const uint64_t b10Lo = umul128(aHi, bLo, &b10Hi); 119 uint64_t b11Hi; 120 const uint64_t b11Lo = umul128(aHi, bHi, &b11Hi); 121 (void) b00Lo; // unused 122 (void) b11Hi; // unused 123 const uint64_t temp1Lo = b10Lo + b00Hi; 124 const uint64_t temp1Hi = b10Hi + (temp1Lo < b10Lo); 125 const uint64_t temp2Lo = b01Lo + temp1Lo; 126 const uint64_t temp2Hi = b01Hi + (temp2Lo < b01Lo); 127 return b11Lo + temp1Hi + temp2Hi; 128 } 129 130 static inline uint32_t uint128_mod1e9(const uint64_t vHi, const uint64_t vLo) { 131 // After multiplying, we're going to shift right by 29, then truncate to uint32_t. 132 // This means that we need only 29 + 32 = 61 bits, so we can truncate to uint64_t before shifting. 133 const uint64_t multiplied = umul256_hi128_lo64(vHi, vLo, 0x89705F4136B4A597u, 0x31680A88F8953031u); 134 135 // For uint32_t truncation, see the mod1e9() comment in d2s_intrinsics.h. 136 const uint32_t shifted = (uint32_t) (multiplied >> 29); 137 138 return ((uint32_t) vLo) - 1000000000 * shifted; 139 } 140 #endif // HAS_64_BIT_INTRINSICS 141 142 static inline uint32_t mulShift_mod1e9(const uint64_t m, const uint64_t* const mul, const int32_t j) { 143 uint64_t high0; // 64 144 const uint64_t low0 = umul128(m, mul[0], &high0); // 0 145 uint64_t high1; // 128 146 const uint64_t low1 = umul128(m, mul[1], &high1); // 64 147 uint64_t high2; // 192 148 const uint64_t low2 = umul128(m, mul[2], &high2); // 128 149 const uint64_t s0low = low0; // 0 150 (void) s0low; // unused 151 const uint64_t s0high = low1 + high0; // 64 152 const uint32_t c1 = s0high < low1; 153 const uint64_t s1low = low2 + high1 + c1; // 128 154 const uint32_t c2 = s1low < low2; // high1 + c1 can't overflow, so compare against low2 155 const uint64_t s1high = high2 + c2; // 192 156 #ifdef RYU_DEBUG 157 if (j < 128 || j > 180) { 158 printf("%d\n", j); 159 } 160 #endif 161 assert(j >= 128); 162 assert(j <= 180); 163 #if defined(HAS_64_BIT_INTRINSICS) 164 const uint32_t dist = (uint32_t) (j - 128); // dist: [0, 52] 165 const uint64_t shiftedhigh = s1high >> dist; 166 const uint64_t shiftedlow = shiftright128(s1low, s1high, dist); 167 return uint128_mod1e9(shiftedhigh, shiftedlow); 168 #else // HAS_64_BIT_INTRINSICS 169 if (j < 160) { // j: [128, 160) 170 const uint64_t r0 = mod1e9(s1high); 171 const uint64_t r1 = mod1e9((r0 << 32) | (s1low >> 32)); 172 const uint64_t r2 = ((r1 << 32) | (s1low & 0xffffffff)); 173 return mod1e9(r2 >> (j - 128)); 174 } else { // j: [160, 192) 175 const uint64_t r0 = mod1e9(s1high); 176 const uint64_t r1 = ((r0 << 32) | (s1low >> 32)); 177 return mod1e9(r1 >> (j - 160)); 178 } 179 #endif // HAS_64_BIT_INTRINSICS 180 } 181 #endif // HAS_UINT128 182 183 // Convert `digits` to a sequence of decimal digits. Append the digits to the result. 184 // The caller has to guarantee that: 185 // 10^(olength-1) <= digits < 10^olength 186 // e.g., by passing `olength` as `decimalLength9(digits)`. 187 static inline void append_n_digits(const uint32_t olength, uint32_t digits, char* const result) { 188 #ifdef RYU_DEBUG 189 printf("DIGITS=%u\n", digits); 190 #endif 191 192 uint32_t i = 0; 193 while (digits >= 10000) { 194 #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217 195 const uint32_t c = digits - 10000 * (digits / 10000); 196 #else 197 const uint32_t c = digits % 10000; 198 #endif 199 digits /= 10000; 200 const uint32_t c0 = (c % 100) << 1; 201 const uint32_t c1 = (c / 100) << 1; 202 memcpy(result + olength - i - 2, DIGIT_TABLE + c0, 2); 203 memcpy(result + olength - i - 4, DIGIT_TABLE + c1, 2); 204 i += 4; 205 } 206 if (digits >= 100) { 207 const uint32_t c = (digits % 100) << 1; 208 digits /= 100; 209 memcpy(result + olength - i - 2, DIGIT_TABLE + c, 2); 210 i += 2; 211 } 212 if (digits >= 10) { 213 const uint32_t c = digits << 1; 214 memcpy(result + olength - i - 2, DIGIT_TABLE + c, 2); 215 } else { 216 result[0] = (char) ('0' + digits); 217 } 218 } 219 220 // Convert `digits` to a sequence of decimal digits. Print the first digit, followed by a decimal 221 // dot '.' followed by the remaining digits. The caller has to guarantee that: 222 // 10^(olength-1) <= digits < 10^olength 223 // e.g., by passing `olength` as `decimalLength9(digits)`. 224 static inline void append_d_digits(const uint32_t olength, uint32_t digits, char* const result) { 225 #ifdef RYU_DEBUG 226 printf("DIGITS=%u\n", digits); 227 #endif 228 229 uint32_t i = 0; 230 while (digits >= 10000) { 231 #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217 232 const uint32_t c = digits - 10000 * (digits / 10000); 233 #else 234 const uint32_t c = digits % 10000; 235 #endif 236 digits /= 10000; 237 const uint32_t c0 = (c % 100) << 1; 238 const uint32_t c1 = (c / 100) << 1; 239 memcpy(result + olength + 1 - i - 2, DIGIT_TABLE + c0, 2); 240 memcpy(result + olength + 1 - i - 4, DIGIT_TABLE + c1, 2); 241 i += 4; 242 } 243 if (digits >= 100) { 244 const uint32_t c = (digits % 100) << 1; 245 digits /= 100; 246 memcpy(result + olength + 1 - i - 2, DIGIT_TABLE + c, 2); 247 i += 2; 248 } 249 if (digits >= 10) { 250 const uint32_t c = digits << 1; 251 result[2] = DIGIT_TABLE[c + 1]; 252 result[1] = '.'; 253 result[0] = DIGIT_TABLE[c]; 254 } else { 255 result[1] = '.'; 256 result[0] = (char) ('0' + digits); 257 } 258 } 259 260 // Convert `digits` to decimal and write the last `count` decimal digits to result. 261 // If `digits` contains additional digits, then those are silently ignored. 262 static inline void append_c_digits(const uint32_t count, uint32_t digits, char* const result) { 263 #ifdef RYU_DEBUG 264 printf("DIGITS=%u\n", digits); 265 #endif 266 // Copy pairs of digits from DIGIT_TABLE. 267 uint32_t i = 0; 268 for (; i < count - 1; i += 2) { 269 const uint32_t c = (digits % 100) << 1; 270 digits /= 100; 271 memcpy(result + count - i - 2, DIGIT_TABLE + c, 2); 272 } 273 // Generate the last digit if count is odd. 274 if (i < count) { 275 const char c = (char) ('0' + (digits % 10)); 276 result[count - i - 1] = c; 277 } 278 } 279 280 // Convert `digits` to decimal and write the last 9 decimal digits to result. 281 // If `digits` contains additional digits, then those are silently ignored. 282 static inline void append_nine_digits(uint32_t digits, char* const result) { 283 #ifdef RYU_DEBUG 284 printf("DIGITS=%u\n", digits); 285 #endif 286 if (digits == 0) { 287 memset(result, '0', 9); 288 return; 289 } 290 291 for (uint32_t i = 0; i < 5; i += 4) { 292 #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217 293 const uint32_t c = digits - 10000 * (digits / 10000); 294 #else 295 const uint32_t c = digits % 10000; 296 #endif 297 digits /= 10000; 298 const uint32_t c0 = (c % 100) << 1; 299 const uint32_t c1 = (c / 100) << 1; 300 memcpy(result + 7 - i, DIGIT_TABLE + c0, 2); 301 memcpy(result + 5 - i, DIGIT_TABLE + c1, 2); 302 } 303 result[0] = (char) ('0' + digits); 304 } 305 306 static inline uint32_t indexForExponent(const uint32_t e) { 307 return (e + 15) / 16; 308 } 309 310 static inline uint32_t pow10BitsForIndex(const uint32_t idx) { 311 return 16 * idx + POW10_ADDITIONAL_BITS; 312 } 313 314 static inline uint32_t lengthForIndex(const uint32_t idx) { 315 // +1 for ceil, +16 for mantissa, +8 to round up when dividing by 9 316 return (log10Pow2(16 * (int32_t) idx) + 1 + 16 + 8) / 9; 317 } 318 319 int d2fixed_buffered_n(double d, uint32_t precision, char* result) { 320 const uint64_t bits = double_to_bits(d); 321 #ifdef RYU_DEBUG 322 printf("IN="); 323 for (int32_t bit = 63; bit >= 0; --bit) { 324 printf("%d", (int) ((bits >> bit) & 1)); 325 } 326 printf("\n"); 327 #endif 328 329 // Decode bits into sign, mantissa, and exponent. 330 const bool ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; 331 const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); 332 const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); 333 334 // Case distinction; exit early for the easy cases. 335 if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u)) { 336 __builtin_abort(); 337 } 338 if (ieeeExponent == 0 && ieeeMantissa == 0) { 339 __builtin_abort(); 340 } 341 342 int32_t e2; 343 uint64_t m2; 344 if (ieeeExponent == 0) { 345 e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; 346 m2 = ieeeMantissa; 347 } else { 348 e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; 349 m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; 350 } 351 352 #ifdef RYU_DEBUG 353 printf("-> %" PRIu64 " * 2^%d\n", m2, e2); 354 #endif 355 356 int index = 0; 357 bool nonzero = false; 358 if (ieeeSign) { 359 result[index++] = '-'; 360 } 361 if (e2 >= -52) { 362 const uint32_t idx = e2 < 0 ? 0 : indexForExponent((uint32_t) e2); 363 const uint32_t p10bits = pow10BitsForIndex(idx); 364 const int32_t len = (int32_t) lengthForIndex(idx); 365 #ifdef RYU_DEBUG 366 printf("idx=%u\n", idx); 367 printf("len=%d\n", len); 368 #endif 369 for (int32_t i = len - 1; i >= 0; --i) { 370 const uint32_t j = p10bits - e2; 371 // Temporary: j is usually around 128, and by shifting a bit, we push it to 128 or above, which is 372 // a slightly faster code path in mulShift_mod1e9. Instead, we can just increase the multipliers. 373 const uint32_t digits = mulShift_mod1e9(m2 << 8, POW10_SPLIT[POW10_OFFSET[idx] + i], (int32_t) (j + 8)); 374 if (nonzero) { 375 append_nine_digits(digits, result + index); 376 index += 9; 377 } else if (digits != 0) { 378 const uint32_t olength = decimalLength9(digits); 379 append_n_digits(olength, digits, result + index); 380 index += olength; 381 nonzero = true; 382 } 383 } 384 } 385 if (!nonzero) { 386 result[index++] = '0'; 387 } 388 if (precision > 0) { 389 result[index++] = '.'; 390 } 391 #ifdef RYU_DEBUG 392 printf("e2=%d\n", e2); 393 #endif 394 if (e2 < 0) { 395 const int32_t idx = -e2 / 16; 396 #ifdef RYU_DEBUG 397 printf("idx=%d\n", idx); 398 #endif 399 const uint32_t blocks = precision / 9 + 1; 400 // 0 = don't round up; 1 = round up unconditionally; 2 = round up if odd. 401 int roundUp = 0; 402 uint32_t i = 0; 403 if (blocks <= MIN_BLOCK_2[idx]) { 404 i = blocks; 405 memset(result + index, '0', precision); 406 index += precision; 407 } else if (i < MIN_BLOCK_2[idx]) { 408 i = MIN_BLOCK_2[idx]; 409 memset(result + index, '0', 9 * i); 410 index += 9 * i; 411 } 412 for (; i < blocks; ++i) { 413 const int32_t j = ADDITIONAL_BITS_2 + (-e2 - 16 * idx); 414 const uint32_t p = POW10_OFFSET_2[idx] + i - MIN_BLOCK_2[idx]; 415 if (p >= POW10_OFFSET_2[idx + 1]) { 416 // If the remaining digits are all 0, then we might as well use memset. 417 // No rounding required in this case. 418 const uint32_t fill = precision - 9 * i; 419 memset(result + index, '0', fill); 420 index += fill; 421 break; 422 } 423 // Temporary: j is usually around 128, and by shifting a bit, we push it to 128 or above, which is 424 // a slightly faster code path in mulShift_mod1e9. Instead, we can just increase the multipliers. 425 uint32_t digits = mulShift_mod1e9(m2 << 8, POW10_SPLIT_2[p], j + 8); 426 #ifdef RYU_DEBUG 427 printf("digits=%u\n", digits); 428 #endif 429 if (i < blocks - 1) { 430 append_nine_digits(digits, result + index); 431 index += 9; 432 } else { 433 const uint32_t maximum = precision - 9 * i; 434 uint32_t lastDigit = 0; 435 for (uint32_t k = 0; k < 9 - maximum; ++k) { 436 lastDigit = digits % 10; 437 digits /= 10; 438 } 439 #ifdef RYU_DEBUG 440 printf("lastDigit=%u\n", lastDigit); 441 #endif 442 if (lastDigit != 5) { 443 roundUp = lastDigit > 5; 444 } else { 445 // Is m * 10^(additionalDigits + 1) / 2^(-e2) integer? 446 const int32_t requiredTwos = -e2 - (int32_t) precision - 1; 447 const bool trailingZeros = requiredTwos <= 0 448 || (requiredTwos < 60 && multipleOfPowerOf2(m2, (uint32_t) requiredTwos)); 449 roundUp = trailingZeros ? 2 : 1; 450 #ifdef RYU_DEBUG 451 printf("requiredTwos=%d\n", requiredTwos); 452 printf("trailingZeros=%s\n", trailingZeros ? "true" : "false"); 453 #endif 454 } 455 if (maximum > 0) { 456 append_c_digits(maximum, digits, result + index); 457 index += maximum; 458 } 459 break; 460 } 461 } 462 #ifdef RYU_DEBUG 463 printf("roundUp=%d\n", roundUp); 464 #endif 465 if (roundUp != 0) { 466 int roundIndex = index; 467 int dotIndex = 0; // '.' can't be located at index 0 468 while (true) { 469 --roundIndex; 470 char c; 471 if (roundIndex == -1 || (c = result[roundIndex], c == '-')) { 472 result[roundIndex + 1] = '1'; 473 if (dotIndex > 0) { 474 result[dotIndex] = '0'; 475 result[dotIndex + 1] = '.'; 476 } 477 result[index++] = '0'; 478 break; 479 } 480 if (c == '.') { 481 dotIndex = roundIndex; 482 continue; 483 } else if (c == '9') { 484 result[roundIndex] = '0'; 485 roundUp = 1; 486 continue; 487 } else { 488 if (roundUp == 2 && c % 2 == 0) { 489 break; 490 } 491 result[roundIndex] = c + 1; 492 break; 493 } 494 } 495 } 496 } else { 497 memset(result + index, '0', precision); 498 index += precision; 499 } 500 return index; 501 } 502 503 504 505 int d2exp_buffered_n(double d, uint32_t precision, char* result, int* exp_out) { 506 const uint64_t bits = double_to_bits(d); 507 #ifdef RYU_DEBUG 508 printf("IN="); 509 for (int32_t bit = 63; bit >= 0; --bit) { 510 printf("%d", (int) ((bits >> bit) & 1)); 511 } 512 printf("\n"); 513 #endif 514 515 // Decode bits into sign, mantissa, and exponent. 516 const bool ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; 517 const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); 518 const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); 519 520 // Case distinction; exit early for the easy cases. 521 if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u)) { 522 __builtin_abort(); 523 } 524 if (ieeeExponent == 0 && ieeeMantissa == 0) { 525 __builtin_abort(); 526 } 527 528 int32_t e2; 529 uint64_t m2; 530 if (ieeeExponent == 0) { 531 e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; 532 m2 = ieeeMantissa; 533 } else { 534 e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; 535 m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; 536 } 537 538 #ifdef RYU_DEBUG 539 printf("-> %" PRIu64 " * 2^%d\n", m2, e2); 540 #endif 541 542 const bool printDecimalPoint = precision > 0; 543 ++precision; 544 int index = 0; 545 if (ieeeSign) { 546 result[index++] = '-'; 547 } 548 uint32_t digits = 0; 549 uint32_t printedDigits = 0; 550 uint32_t availableDigits = 0; 551 int32_t exp = 0; 552 if (e2 >= -52) { 553 const uint32_t idx = e2 < 0 ? 0 : indexForExponent((uint32_t) e2); 554 const uint32_t p10bits = pow10BitsForIndex(idx); 555 const int32_t len = (int32_t) lengthForIndex(idx); 556 #ifdef RYU_DEBUG 557 printf("idx=%u\n", idx); 558 printf("len=%d\n", len); 559 #endif 560 for (int32_t i = len - 1; i >= 0; --i) { 561 const uint32_t j = p10bits - e2; 562 // Temporary: j is usually around 128, and by shifting a bit, we push it to 128 or above, which is 563 // a slightly faster code path in mulShift_mod1e9. Instead, we can just increase the multipliers. 564 digits = mulShift_mod1e9(m2 << 8, POW10_SPLIT[POW10_OFFSET[idx] + i], (int32_t) (j + 8)); 565 if (printedDigits != 0) { 566 if (printedDigits + 9 > precision) { 567 availableDigits = 9; 568 break; 569 } 570 append_nine_digits(digits, result + index); 571 index += 9; 572 printedDigits += 9; 573 } else if (digits != 0) { 574 availableDigits = decimalLength9(digits); 575 exp = i * 9 + (int32_t) availableDigits - 1; 576 if (availableDigits > precision) { 577 break; 578 } 579 if (printDecimalPoint) { 580 append_d_digits(availableDigits, digits, result + index); 581 index += availableDigits + 1; // +1 for decimal point 582 } else { 583 result[index++] = (char) ('0' + digits); 584 } 585 printedDigits = availableDigits; 586 availableDigits = 0; 587 } 588 } 589 } 590 591 if (e2 < 0 && availableDigits == 0) { 592 const int32_t idx = -e2 / 16; 593 #ifdef RYU_DEBUG 594 printf("idx=%d, e2=%d, min=%d\n", idx, e2, MIN_BLOCK_2[idx]); 595 #endif 596 for (int32_t i = MIN_BLOCK_2[idx]; i < 200; ++i) { 597 const int32_t j = ADDITIONAL_BITS_2 + (-e2 - 16 * idx); 598 const uint32_t p = POW10_OFFSET_2[idx] + (uint32_t) i - MIN_BLOCK_2[idx]; 599 // Temporary: j is usually around 128, and by shifting a bit, we push it to 128 or above, which is 600 // a slightly faster code path in mulShift_mod1e9. Instead, we can just increase the multipliers. 601 digits = (p >= POW10_OFFSET_2[idx + 1]) ? 0 : mulShift_mod1e9(m2 << 8, POW10_SPLIT_2[p], j + 8); 602 #ifdef RYU_DEBUG 603 printf("exact=%" PRIu64 " * (%" PRIu64 " + %" PRIu64 " << 64) >> %d\n", m2, POW10_SPLIT_2[p][0], POW10_SPLIT_2[p][1], j); 604 printf("digits=%u\n", digits); 605 #endif 606 if (printedDigits != 0) { 607 if (printedDigits + 9 > precision) { 608 availableDigits = 9; 609 break; 610 } 611 append_nine_digits(digits, result + index); 612 index += 9; 613 printedDigits += 9; 614 } else if (digits != 0) { 615 availableDigits = decimalLength9(digits); 616 exp = -(i + 1) * 9 + (int32_t) availableDigits - 1; 617 if (availableDigits > precision) { 618 break; 619 } 620 if (printDecimalPoint) { 621 append_d_digits(availableDigits, digits, result + index); 622 index += availableDigits + 1; // +1 for decimal point 623 } else { 624 result[index++] = (char) ('0' + digits); 625 } 626 printedDigits = availableDigits; 627 availableDigits = 0; 628 } 629 } 630 } 631 632 const uint32_t maximum = precision - printedDigits; 633 #ifdef RYU_DEBUG 634 printf("availableDigits=%u\n", availableDigits); 635 printf("digits=%u\n", digits); 636 printf("maximum=%u\n", maximum); 637 #endif 638 if (availableDigits == 0) { 639 digits = 0; 640 } 641 uint32_t lastDigit = 0; 642 if (availableDigits > maximum) { 643 for (uint32_t k = 0; k < availableDigits - maximum; ++k) { 644 lastDigit = digits % 10; 645 digits /= 10; 646 } 647 } 648 #ifdef RYU_DEBUG 649 printf("lastDigit=%u\n", lastDigit); 650 #endif 651 // 0 = don't round up; 1 = round up unconditionally; 2 = round up if odd. 652 int roundUp = 0; 653 if (lastDigit != 5) { 654 roundUp = lastDigit > 5; 655 } else { 656 // Is m * 2^e2 * 10^(precision + 1 - exp) integer? 657 // precision was already increased by 1, so we don't need to write + 1 here. 658 const int32_t rexp = (int32_t) precision - exp; 659 const int32_t requiredTwos = -e2 - rexp; 660 bool trailingZeros = requiredTwos <= 0 661 || (requiredTwos < 60 && multipleOfPowerOf2(m2, (uint32_t) requiredTwos)); 662 if (rexp < 0) { 663 const int32_t requiredFives = -rexp; 664 trailingZeros = trailingZeros && multipleOfPowerOf5(m2, (uint32_t) requiredFives); 665 } 666 roundUp = trailingZeros ? 2 : 1; 667 #ifdef RYU_DEBUG 668 printf("requiredTwos=%d\n", requiredTwos); 669 printf("trailingZeros=%s\n", trailingZeros ? "true" : "false"); 670 #endif 671 } 672 if (printedDigits != 0) { 673 if (digits == 0) { 674 memset(result + index, '0', maximum); 675 } else { 676 append_c_digits(maximum, digits, result + index); 677 } 678 index += maximum; 679 } else { 680 if (printDecimalPoint) { 681 append_d_digits(maximum, digits, result + index); 682 index += maximum + 1; // +1 for decimal point 683 } else { 684 result[index++] = (char) ('0' + digits); 685 } 686 } 687 #ifdef RYU_DEBUG 688 printf("roundUp=%d\n", roundUp); 689 #endif 690 if (roundUp != 0) { 691 int roundIndex = index; 692 while (true) { 693 --roundIndex; 694 char c; 695 if (roundIndex == -1 || (c = result[roundIndex], c == '-')) { 696 result[roundIndex + 1] = '1'; 697 ++exp; 698 break; 699 } 700 if (c == '.') { 701 continue; 702 } else if (c == '9') { 703 result[roundIndex] = '0'; 704 roundUp = 1; 705 continue; 706 } else { 707 if (roundUp == 2 && c % 2 == 0) { 708 break; 709 } 710 result[roundIndex] = c + 1; 711 break; 712 } 713 } 714 } 715 if (exp_out) { 716 *exp_out = exp; 717 } 718 result[index++] = 'e'; 719 if (exp < 0) { 720 result[index++] = '-'; 721 exp = -exp; 722 } else { 723 result[index++] = '+'; 724 } 725 726 if (exp >= 100) { 727 const int32_t c = exp % 10; 728 memcpy(result + index, DIGIT_TABLE + 2 * (exp / 10), 2); 729 result[index + 2] = (char) ('0' + c); 730 index += 3; 731 } else { 732 memcpy(result + index, DIGIT_TABLE + 2 * exp, 2); 733 index += 2; 734 } 735 736 return index; 737 } 738