1 // Copyright 2018 Ulf Adams
2 //
3 // The contents of this file may be used under the terms of the Apache License,
4 // Version 2.0.
5 //
6 // (See accompanying file LICENSE-Apache or copy at
7 // http://www.apache.org/licenses/LICENSE-2.0)
8 //
9 // Alternatively, the contents of this file may be used under the terms of
10 // the Boost Software License, Version 1.0.
11 // (See accompanying file LICENSE-Boost or copy at
12 // https://www.boost.org/LICENSE_1_0.txt)
13 //
14 // Unless required by applicable law or agreed to in writing, this software
15 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, either express or implied.
17
18 // Runtime compiler options:
19 // -DRYU_DEBUG Generate verbose debugging output to stdout.
20 //
21 // -DRYU_ONLY_64_BIT_OPS Avoid using uint128_t or 64-bit intrinsics. Slower,
22 // depending on your compiler.
23 //
24 // -DRYU_AVOID_UINT128 Avoid using uint128_t. Slower, depending on your compiler.
25
26
27
28 #ifdef RYU_DEBUG
29 #endif
30
31
32 #define DOUBLE_MANTISSA_BITS 52
33 #define DOUBLE_EXPONENT_BITS 11
34 #define DOUBLE_BIAS 1023
35
36 #define POW10_ADDITIONAL_BITS 120
37
38 #if defined(HAS_UINT128)
umul256(const uint128_t a,const uint64_t bHi,const uint64_t bLo,uint128_t * const productHi)39 static inline uint128_t umul256(const uint128_t a, const uint64_t bHi, const uint64_t bLo, uint128_t* const productHi) {
40 const uint64_t aLo = (uint64_t)a;
41 const uint64_t aHi = (uint64_t)(a >> 64);
42
43 const uint128_t b00 = (uint128_t)aLo * bLo;
44 const uint128_t b01 = (uint128_t)aLo * bHi;
45 const uint128_t b10 = (uint128_t)aHi * bLo;
46 const uint128_t b11 = (uint128_t)aHi * bHi;
47
48 const uint64_t b00Lo = (uint64_t)b00;
49 const uint64_t b00Hi = (uint64_t)(b00 >> 64);
50
51 const uint128_t mid1 = b10 + b00Hi;
52 const uint64_t mid1Lo = (uint64_t)(mid1);
53 const uint64_t mid1Hi = (uint64_t)(mid1 >> 64);
54
55 const uint128_t mid2 = b01 + mid1Lo;
56 const uint64_t mid2Lo = (uint64_t)(mid2);
57 const uint64_t mid2Hi = (uint64_t)(mid2 >> 64);
58
59 const uint128_t pHi = b11 + mid1Hi + mid2Hi;
60 const uint128_t pLo = ((uint128_t)mid2Lo << 64) | b00Lo;
61
62 *productHi = pHi;
63 return pLo;
64 }
65
66 // Returns the high 128 bits of the 256-bit product of a and b.
umul256_hi(const uint128_t a,const uint64_t bHi,const uint64_t bLo)67 static inline uint128_t umul256_hi(const uint128_t a, const uint64_t bHi, const uint64_t bLo) {
68 // Reuse the umul256 implementation.
69 // Optimizers will likely eliminate the instructions used to compute the
70 // low part of the product.
71 uint128_t hi;
72 umul256(a, bHi, bLo, &hi);
73 return hi;
74 }
75
76 // Unfortunately, gcc/clang do not automatically turn a 128-bit integer division
77 // into a multiplication, so we have to do it manually.
uint128_mod1e9(const uint128_t v)78 static inline uint32_t uint128_mod1e9(const uint128_t v) {
79 // After multiplying, we're going to shift right by 29, then truncate to uint32_t.
80 // This means that we need only 29 + 32 = 61 bits, so we can truncate to uint64_t before shifting.
81 const uint64_t multiplied = (uint64_t) umul256_hi(v, 0x89705F4136B4A597u, 0x31680A88F8953031u);
82
83 // For uint32_t truncation, see the mod1e9() comment in d2s_intrinsics.h.
84 const uint32_t shifted = (uint32_t) (multiplied >> 29);
85
86 return ((uint32_t) v) - 1000000000 * shifted;
87 }
88
89 // Best case: use 128-bit type.
mulShift_mod1e9(const uint64_t m,const uint64_t * const mul,const int32_t j)90 static inline uint32_t mulShift_mod1e9(const uint64_t m, const uint64_t* const mul, const int32_t j) {
91 const uint128_t b0 = ((uint128_t) m) * mul[0]; // 0
92 const uint128_t b1 = ((uint128_t) m) * mul[1]; // 64
93 const uint128_t b2 = ((uint128_t) m) * mul[2]; // 128
94 #ifdef RYU_DEBUG
95 if (j < 128 || j > 180) {
96 printf("%d\n", j);
97 }
98 #endif
99 assert(j >= 128);
100 assert(j <= 180);
101 // j: [128, 256)
102 const uint128_t mid = b1 + (uint64_t) (b0 >> 64); // 64
103 const uint128_t s1 = b2 + (uint64_t) (mid >> 64); // 128
104 return uint128_mod1e9(s1 >> (j - 128));
105 }
106
107 #else // HAS_UINT128
108
109 #if defined(HAS_64_BIT_INTRINSICS)
110 // Returns the low 64 bits of the high 128 bits of the 256-bit product of a and b.
umul256_hi128_lo64(const uint64_t aHi,const uint64_t aLo,const uint64_t bHi,const uint64_t bLo)111 static inline uint64_t umul256_hi128_lo64(
112 const uint64_t aHi, const uint64_t aLo, const uint64_t bHi, const uint64_t bLo) {
113 uint64_t b00Hi;
114 const uint64_t b00Lo = umul128(aLo, bLo, &b00Hi);
115 uint64_t b01Hi;
116 const uint64_t b01Lo = umul128(aLo, bHi, &b01Hi);
117 uint64_t b10Hi;
118 const uint64_t b10Lo = umul128(aHi, bLo, &b10Hi);
119 uint64_t b11Hi;
120 const uint64_t b11Lo = umul128(aHi, bHi, &b11Hi);
121 (void) b00Lo; // unused
122 (void) b11Hi; // unused
123 const uint64_t temp1Lo = b10Lo + b00Hi;
124 const uint64_t temp1Hi = b10Hi + (temp1Lo < b10Lo);
125 const uint64_t temp2Lo = b01Lo + temp1Lo;
126 const uint64_t temp2Hi = b01Hi + (temp2Lo < b01Lo);
127 return b11Lo + temp1Hi + temp2Hi;
128 }
129
uint128_mod1e9(const uint64_t vHi,const uint64_t vLo)130 static inline uint32_t uint128_mod1e9(const uint64_t vHi, const uint64_t vLo) {
131 // After multiplying, we're going to shift right by 29, then truncate to uint32_t.
132 // This means that we need only 29 + 32 = 61 bits, so we can truncate to uint64_t before shifting.
133 const uint64_t multiplied = umul256_hi128_lo64(vHi, vLo, 0x89705F4136B4A597u, 0x31680A88F8953031u);
134
135 // For uint32_t truncation, see the mod1e9() comment in d2s_intrinsics.h.
136 const uint32_t shifted = (uint32_t) (multiplied >> 29);
137
138 return ((uint32_t) vLo) - 1000000000 * shifted;
139 }
140 #endif // HAS_64_BIT_INTRINSICS
141
mulShift_mod1e9(const uint64_t m,const uint64_t * const mul,const int32_t j)142 static inline uint32_t mulShift_mod1e9(const uint64_t m, const uint64_t* const mul, const int32_t j) {
143 uint64_t high0; // 64
144 const uint64_t low0 = umul128(m, mul[0], &high0); // 0
145 uint64_t high1; // 128
146 const uint64_t low1 = umul128(m, mul[1], &high1); // 64
147 uint64_t high2; // 192
148 const uint64_t low2 = umul128(m, mul[2], &high2); // 128
149 const uint64_t s0low = low0; // 0
150 (void) s0low; // unused
151 const uint64_t s0high = low1 + high0; // 64
152 const uint32_t c1 = s0high < low1;
153 const uint64_t s1low = low2 + high1 + c1; // 128
154 const uint32_t c2 = s1low < low2; // high1 + c1 can't overflow, so compare against low2
155 const uint64_t s1high = high2 + c2; // 192
156 #ifdef RYU_DEBUG
157 if (j < 128 || j > 180) {
158 printf("%d\n", j);
159 }
160 #endif
161 assert(j >= 128);
162 assert(j <= 180);
163 #if defined(HAS_64_BIT_INTRINSICS)
164 const uint32_t dist = (uint32_t) (j - 128); // dist: [0, 52]
165 const uint64_t shiftedhigh = s1high >> dist;
166 const uint64_t shiftedlow = shiftright128(s1low, s1high, dist);
167 return uint128_mod1e9(shiftedhigh, shiftedlow);
168 #else // HAS_64_BIT_INTRINSICS
169 if (j < 160) { // j: [128, 160)
170 const uint64_t r0 = mod1e9(s1high);
171 const uint64_t r1 = mod1e9((r0 << 32) | (s1low >> 32));
172 const uint64_t r2 = ((r1 << 32) | (s1low & 0xffffffff));
173 return mod1e9(r2 >> (j - 128));
174 } else { // j: [160, 192)
175 const uint64_t r0 = mod1e9(s1high);
176 const uint64_t r1 = ((r0 << 32) | (s1low >> 32));
177 return mod1e9(r1 >> (j - 160));
178 }
179 #endif // HAS_64_BIT_INTRINSICS
180 }
181 #endif // HAS_UINT128
182
183 // Convert `digits` to a sequence of decimal digits. Append the digits to the result.
184 // The caller has to guarantee that:
185 // 10^(olength-1) <= digits < 10^olength
186 // e.g., by passing `olength` as `decimalLength9(digits)`.
append_n_digits(const uint32_t olength,uint32_t digits,char * const result)187 static inline void append_n_digits(const uint32_t olength, uint32_t digits, char* const result) {
188 #ifdef RYU_DEBUG
189 printf("DIGITS=%u\n", digits);
190 #endif
191
192 uint32_t i = 0;
193 while (digits >= 10000) {
194 #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
195 const uint32_t c = digits - 10000 * (digits / 10000);
196 #else
197 const uint32_t c = digits % 10000;
198 #endif
199 digits /= 10000;
200 const uint32_t c0 = (c % 100) << 1;
201 const uint32_t c1 = (c / 100) << 1;
202 memcpy(result + olength - i - 2, DIGIT_TABLE + c0, 2);
203 memcpy(result + olength - i - 4, DIGIT_TABLE + c1, 2);
204 i += 4;
205 }
206 if (digits >= 100) {
207 const uint32_t c = (digits % 100) << 1;
208 digits /= 100;
209 memcpy(result + olength - i - 2, DIGIT_TABLE + c, 2);
210 i += 2;
211 }
212 if (digits >= 10) {
213 const uint32_t c = digits << 1;
214 memcpy(result + olength - i - 2, DIGIT_TABLE + c, 2);
215 } else {
216 result[0] = (char) ('0' + digits);
217 }
218 }
219
220 // Convert `digits` to a sequence of decimal digits. Print the first digit, followed by a decimal
221 // dot '.' followed by the remaining digits. The caller has to guarantee that:
222 // 10^(olength-1) <= digits < 10^olength
223 // e.g., by passing `olength` as `decimalLength9(digits)`.
append_d_digits(const uint32_t olength,uint32_t digits,char * const result)224 static inline void append_d_digits(const uint32_t olength, uint32_t digits, char* const result) {
225 #ifdef RYU_DEBUG
226 printf("DIGITS=%u\n", digits);
227 #endif
228
229 uint32_t i = 0;
230 while (digits >= 10000) {
231 #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
232 const uint32_t c = digits - 10000 * (digits / 10000);
233 #else
234 const uint32_t c = digits % 10000;
235 #endif
236 digits /= 10000;
237 const uint32_t c0 = (c % 100) << 1;
238 const uint32_t c1 = (c / 100) << 1;
239 memcpy(result + olength + 1 - i - 2, DIGIT_TABLE + c0, 2);
240 memcpy(result + olength + 1 - i - 4, DIGIT_TABLE + c1, 2);
241 i += 4;
242 }
243 if (digits >= 100) {
244 const uint32_t c = (digits % 100) << 1;
245 digits /= 100;
246 memcpy(result + olength + 1 - i - 2, DIGIT_TABLE + c, 2);
247 i += 2;
248 }
249 if (digits >= 10) {
250 const uint32_t c = digits << 1;
251 result[2] = DIGIT_TABLE[c + 1];
252 result[1] = '.';
253 result[0] = DIGIT_TABLE[c];
254 } else {
255 result[1] = '.';
256 result[0] = (char) ('0' + digits);
257 }
258 }
259
260 // Convert `digits` to decimal and write the last `count` decimal digits to result.
261 // If `digits` contains additional digits, then those are silently ignored.
append_c_digits(const uint32_t count,uint32_t digits,char * const result)262 static inline void append_c_digits(const uint32_t count, uint32_t digits, char* const result) {
263 #ifdef RYU_DEBUG
264 printf("DIGITS=%u\n", digits);
265 #endif
266 // Copy pairs of digits from DIGIT_TABLE.
267 uint32_t i = 0;
268 for (; i < count - 1; i += 2) {
269 const uint32_t c = (digits % 100) << 1;
270 digits /= 100;
271 memcpy(result + count - i - 2, DIGIT_TABLE + c, 2);
272 }
273 // Generate the last digit if count is odd.
274 if (i < count) {
275 const char c = (char) ('0' + (digits % 10));
276 result[count - i - 1] = c;
277 }
278 }
279
280 // Convert `digits` to decimal and write the last 9 decimal digits to result.
281 // If `digits` contains additional digits, then those are silently ignored.
append_nine_digits(uint32_t digits,char * const result)282 static inline void append_nine_digits(uint32_t digits, char* const result) {
283 #ifdef RYU_DEBUG
284 printf("DIGITS=%u\n", digits);
285 #endif
286 if (digits == 0) {
287 memset(result, '0', 9);
288 return;
289 }
290
291 for (uint32_t i = 0; i < 5; i += 4) {
292 #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
293 const uint32_t c = digits - 10000 * (digits / 10000);
294 #else
295 const uint32_t c = digits % 10000;
296 #endif
297 digits /= 10000;
298 const uint32_t c0 = (c % 100) << 1;
299 const uint32_t c1 = (c / 100) << 1;
300 memcpy(result + 7 - i, DIGIT_TABLE + c0, 2);
301 memcpy(result + 5 - i, DIGIT_TABLE + c1, 2);
302 }
303 result[0] = (char) ('0' + digits);
304 }
305
indexForExponent(const uint32_t e)306 static inline uint32_t indexForExponent(const uint32_t e) {
307 return (e + 15) / 16;
308 }
309
pow10BitsForIndex(const uint32_t idx)310 static inline uint32_t pow10BitsForIndex(const uint32_t idx) {
311 return 16 * idx + POW10_ADDITIONAL_BITS;
312 }
313
lengthForIndex(const uint32_t idx)314 static inline uint32_t lengthForIndex(const uint32_t idx) {
315 // +1 for ceil, +16 for mantissa, +8 to round up when dividing by 9
316 return (log10Pow2(16 * (int32_t) idx) + 1 + 16 + 8) / 9;
317 }
318
d2fixed_buffered_n(double d,uint32_t precision,char * result)319 int d2fixed_buffered_n(double d, uint32_t precision, char* result) {
320 const uint64_t bits = double_to_bits(d);
321 #ifdef RYU_DEBUG
322 printf("IN=");
323 for (int32_t bit = 63; bit >= 0; --bit) {
324 printf("%d", (int) ((bits >> bit) & 1));
325 }
326 printf("\n");
327 #endif
328
329 // Decode bits into sign, mantissa, and exponent.
330 const bool ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
331 const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
332 const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
333
334 // Case distinction; exit early for the easy cases.
335 if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u)) {
336 __builtin_abort();
337 }
338 if (ieeeExponent == 0 && ieeeMantissa == 0) {
339 __builtin_abort();
340 }
341
342 int32_t e2;
343 uint64_t m2;
344 if (ieeeExponent == 0) {
345 e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
346 m2 = ieeeMantissa;
347 } else {
348 e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
349 m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
350 }
351
352 #ifdef RYU_DEBUG
353 printf("-> %" PRIu64 " * 2^%d\n", m2, e2);
354 #endif
355
356 int index = 0;
357 bool nonzero = false;
358 if (ieeeSign) {
359 result[index++] = '-';
360 }
361 if (e2 >= -52) {
362 const uint32_t idx = e2 < 0 ? 0 : indexForExponent((uint32_t) e2);
363 const uint32_t p10bits = pow10BitsForIndex(idx);
364 const int32_t len = (int32_t) lengthForIndex(idx);
365 #ifdef RYU_DEBUG
366 printf("idx=%u\n", idx);
367 printf("len=%d\n", len);
368 #endif
369 for (int32_t i = len - 1; i >= 0; --i) {
370 const uint32_t j = p10bits - e2;
371 // Temporary: j is usually around 128, and by shifting a bit, we push it to 128 or above, which is
372 // a slightly faster code path in mulShift_mod1e9. Instead, we can just increase the multipliers.
373 const uint32_t digits = mulShift_mod1e9(m2 << 8, POW10_SPLIT[POW10_OFFSET[idx] + i], (int32_t) (j + 8));
374 if (nonzero) {
375 append_nine_digits(digits, result + index);
376 index += 9;
377 } else if (digits != 0) {
378 const uint32_t olength = decimalLength9(digits);
379 append_n_digits(olength, digits, result + index);
380 index += olength;
381 nonzero = true;
382 }
383 }
384 }
385 if (!nonzero) {
386 result[index++] = '0';
387 }
388 if (precision > 0) {
389 result[index++] = '.';
390 }
391 #ifdef RYU_DEBUG
392 printf("e2=%d\n", e2);
393 #endif
394 if (e2 < 0) {
395 const int32_t idx = -e2 / 16;
396 #ifdef RYU_DEBUG
397 printf("idx=%d\n", idx);
398 #endif
399 const uint32_t blocks = precision / 9 + 1;
400 // 0 = don't round up; 1 = round up unconditionally; 2 = round up if odd.
401 int roundUp = 0;
402 uint32_t i = 0;
403 if (blocks <= MIN_BLOCK_2[idx]) {
404 i = blocks;
405 memset(result + index, '0', precision);
406 index += precision;
407 } else if (i < MIN_BLOCK_2[idx]) {
408 i = MIN_BLOCK_2[idx];
409 memset(result + index, '0', 9 * i);
410 index += 9 * i;
411 }
412 for (; i < blocks; ++i) {
413 const int32_t j = ADDITIONAL_BITS_2 + (-e2 - 16 * idx);
414 const uint32_t p = POW10_OFFSET_2[idx] + i - MIN_BLOCK_2[idx];
415 if (p >= POW10_OFFSET_2[idx + 1]) {
416 // If the remaining digits are all 0, then we might as well use memset.
417 // No rounding required in this case.
418 const uint32_t fill = precision - 9 * i;
419 memset(result + index, '0', fill);
420 index += fill;
421 break;
422 }
423 // Temporary: j is usually around 128, and by shifting a bit, we push it to 128 or above, which is
424 // a slightly faster code path in mulShift_mod1e9. Instead, we can just increase the multipliers.
425 uint32_t digits = mulShift_mod1e9(m2 << 8, POW10_SPLIT_2[p], j + 8);
426 #ifdef RYU_DEBUG
427 printf("digits=%u\n", digits);
428 #endif
429 if (i < blocks - 1) {
430 append_nine_digits(digits, result + index);
431 index += 9;
432 } else {
433 const uint32_t maximum = precision - 9 * i;
434 uint32_t lastDigit = 0;
435 for (uint32_t k = 0; k < 9 - maximum; ++k) {
436 lastDigit = digits % 10;
437 digits /= 10;
438 }
439 #ifdef RYU_DEBUG
440 printf("lastDigit=%u\n", lastDigit);
441 #endif
442 if (lastDigit != 5) {
443 roundUp = lastDigit > 5;
444 } else {
445 // Is m * 10^(additionalDigits + 1) / 2^(-e2) integer?
446 const int32_t requiredTwos = -e2 - (int32_t) precision - 1;
447 const bool trailingZeros = requiredTwos <= 0
448 || (requiredTwos < 60 && multipleOfPowerOf2(m2, (uint32_t) requiredTwos));
449 roundUp = trailingZeros ? 2 : 1;
450 #ifdef RYU_DEBUG
451 printf("requiredTwos=%d\n", requiredTwos);
452 printf("trailingZeros=%s\n", trailingZeros ? "true" : "false");
453 #endif
454 }
455 if (maximum > 0) {
456 append_c_digits(maximum, digits, result + index);
457 index += maximum;
458 }
459 break;
460 }
461 }
462 #ifdef RYU_DEBUG
463 printf("roundUp=%d\n", roundUp);
464 #endif
465 if (roundUp != 0) {
466 int roundIndex = index;
467 int dotIndex = 0; // '.' can't be located at index 0
468 while (true) {
469 --roundIndex;
470 char c;
471 if (roundIndex == -1 || (c = result[roundIndex], c == '-')) {
472 result[roundIndex + 1] = '1';
473 if (dotIndex > 0) {
474 result[dotIndex] = '0';
475 result[dotIndex + 1] = '.';
476 }
477 result[index++] = '0';
478 break;
479 }
480 if (c == '.') {
481 dotIndex = roundIndex;
482 continue;
483 } else if (c == '9') {
484 result[roundIndex] = '0';
485 roundUp = 1;
486 continue;
487 } else {
488 if (roundUp == 2 && c % 2 == 0) {
489 break;
490 }
491 result[roundIndex] = c + 1;
492 break;
493 }
494 }
495 }
496 } else {
497 memset(result + index, '0', precision);
498 index += precision;
499 }
500 return index;
501 }
502
503
504
d2exp_buffered_n(double d,uint32_t precision,char * result,int * exp_out)505 int d2exp_buffered_n(double d, uint32_t precision, char* result, int* exp_out) {
506 const uint64_t bits = double_to_bits(d);
507 #ifdef RYU_DEBUG
508 printf("IN=");
509 for (int32_t bit = 63; bit >= 0; --bit) {
510 printf("%d", (int) ((bits >> bit) & 1));
511 }
512 printf("\n");
513 #endif
514
515 // Decode bits into sign, mantissa, and exponent.
516 const bool ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
517 const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
518 const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
519
520 // Case distinction; exit early for the easy cases.
521 if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u)) {
522 __builtin_abort();
523 }
524 if (ieeeExponent == 0 && ieeeMantissa == 0) {
525 __builtin_abort();
526 }
527
528 int32_t e2;
529 uint64_t m2;
530 if (ieeeExponent == 0) {
531 e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
532 m2 = ieeeMantissa;
533 } else {
534 e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
535 m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
536 }
537
538 #ifdef RYU_DEBUG
539 printf("-> %" PRIu64 " * 2^%d\n", m2, e2);
540 #endif
541
542 const bool printDecimalPoint = precision > 0;
543 ++precision;
544 int index = 0;
545 if (ieeeSign) {
546 result[index++] = '-';
547 }
548 uint32_t digits = 0;
549 uint32_t printedDigits = 0;
550 uint32_t availableDigits = 0;
551 int32_t exp = 0;
552 if (e2 >= -52) {
553 const uint32_t idx = e2 < 0 ? 0 : indexForExponent((uint32_t) e2);
554 const uint32_t p10bits = pow10BitsForIndex(idx);
555 const int32_t len = (int32_t) lengthForIndex(idx);
556 #ifdef RYU_DEBUG
557 printf("idx=%u\n", idx);
558 printf("len=%d\n", len);
559 #endif
560 for (int32_t i = len - 1; i >= 0; --i) {
561 const uint32_t j = p10bits - e2;
562 // Temporary: j is usually around 128, and by shifting a bit, we push it to 128 or above, which is
563 // a slightly faster code path in mulShift_mod1e9. Instead, we can just increase the multipliers.
564 digits = mulShift_mod1e9(m2 << 8, POW10_SPLIT[POW10_OFFSET[idx] + i], (int32_t) (j + 8));
565 if (printedDigits != 0) {
566 if (printedDigits + 9 > precision) {
567 availableDigits = 9;
568 break;
569 }
570 append_nine_digits(digits, result + index);
571 index += 9;
572 printedDigits += 9;
573 } else if (digits != 0) {
574 availableDigits = decimalLength9(digits);
575 exp = i * 9 + (int32_t) availableDigits - 1;
576 if (availableDigits > precision) {
577 break;
578 }
579 if (printDecimalPoint) {
580 append_d_digits(availableDigits, digits, result + index);
581 index += availableDigits + 1; // +1 for decimal point
582 } else {
583 result[index++] = (char) ('0' + digits);
584 }
585 printedDigits = availableDigits;
586 availableDigits = 0;
587 }
588 }
589 }
590
591 if (e2 < 0 && availableDigits == 0) {
592 const int32_t idx = -e2 / 16;
593 #ifdef RYU_DEBUG
594 printf("idx=%d, e2=%d, min=%d\n", idx, e2, MIN_BLOCK_2[idx]);
595 #endif
596 for (int32_t i = MIN_BLOCK_2[idx]; i < 200; ++i) {
597 const int32_t j = ADDITIONAL_BITS_2 + (-e2 - 16 * idx);
598 const uint32_t p = POW10_OFFSET_2[idx] + (uint32_t) i - MIN_BLOCK_2[idx];
599 // Temporary: j is usually around 128, and by shifting a bit, we push it to 128 or above, which is
600 // a slightly faster code path in mulShift_mod1e9. Instead, we can just increase the multipliers.
601 digits = (p >= POW10_OFFSET_2[idx + 1]) ? 0 : mulShift_mod1e9(m2 << 8, POW10_SPLIT_2[p], j + 8);
602 #ifdef RYU_DEBUG
603 printf("exact=%" PRIu64 " * (%" PRIu64 " + %" PRIu64 " << 64) >> %d\n", m2, POW10_SPLIT_2[p][0], POW10_SPLIT_2[p][1], j);
604 printf("digits=%u\n", digits);
605 #endif
606 if (printedDigits != 0) {
607 if (printedDigits + 9 > precision) {
608 availableDigits = 9;
609 break;
610 }
611 append_nine_digits(digits, result + index);
612 index += 9;
613 printedDigits += 9;
614 } else if (digits != 0) {
615 availableDigits = decimalLength9(digits);
616 exp = -(i + 1) * 9 + (int32_t) availableDigits - 1;
617 if (availableDigits > precision) {
618 break;
619 }
620 if (printDecimalPoint) {
621 append_d_digits(availableDigits, digits, result + index);
622 index += availableDigits + 1; // +1 for decimal point
623 } else {
624 result[index++] = (char) ('0' + digits);
625 }
626 printedDigits = availableDigits;
627 availableDigits = 0;
628 }
629 }
630 }
631
632 const uint32_t maximum = precision - printedDigits;
633 #ifdef RYU_DEBUG
634 printf("availableDigits=%u\n", availableDigits);
635 printf("digits=%u\n", digits);
636 printf("maximum=%u\n", maximum);
637 #endif
638 if (availableDigits == 0) {
639 digits = 0;
640 }
641 uint32_t lastDigit = 0;
642 if (availableDigits > maximum) {
643 for (uint32_t k = 0; k < availableDigits - maximum; ++k) {
644 lastDigit = digits % 10;
645 digits /= 10;
646 }
647 }
648 #ifdef RYU_DEBUG
649 printf("lastDigit=%u\n", lastDigit);
650 #endif
651 // 0 = don't round up; 1 = round up unconditionally; 2 = round up if odd.
652 int roundUp = 0;
653 if (lastDigit != 5) {
654 roundUp = lastDigit > 5;
655 } else {
656 // Is m * 2^e2 * 10^(precision + 1 - exp) integer?
657 // precision was already increased by 1, so we don't need to write + 1 here.
658 const int32_t rexp = (int32_t) precision - exp;
659 const int32_t requiredTwos = -e2 - rexp;
660 bool trailingZeros = requiredTwos <= 0
661 || (requiredTwos < 60 && multipleOfPowerOf2(m2, (uint32_t) requiredTwos));
662 if (rexp < 0) {
663 const int32_t requiredFives = -rexp;
664 trailingZeros = trailingZeros && multipleOfPowerOf5(m2, (uint32_t) requiredFives);
665 }
666 roundUp = trailingZeros ? 2 : 1;
667 #ifdef RYU_DEBUG
668 printf("requiredTwos=%d\n", requiredTwos);
669 printf("trailingZeros=%s\n", trailingZeros ? "true" : "false");
670 #endif
671 }
672 if (printedDigits != 0) {
673 if (digits == 0) {
674 memset(result + index, '0', maximum);
675 } else {
676 append_c_digits(maximum, digits, result + index);
677 }
678 index += maximum;
679 } else {
680 if (printDecimalPoint) {
681 append_d_digits(maximum, digits, result + index);
682 index += maximum + 1; // +1 for decimal point
683 } else {
684 result[index++] = (char) ('0' + digits);
685 }
686 }
687 #ifdef RYU_DEBUG
688 printf("roundUp=%d\n", roundUp);
689 #endif
690 if (roundUp != 0) {
691 int roundIndex = index;
692 while (true) {
693 --roundIndex;
694 char c;
695 if (roundIndex == -1 || (c = result[roundIndex], c == '-')) {
696 result[roundIndex + 1] = '1';
697 ++exp;
698 break;
699 }
700 if (c == '.') {
701 continue;
702 } else if (c == '9') {
703 result[roundIndex] = '0';
704 roundUp = 1;
705 continue;
706 } else {
707 if (roundUp == 2 && c % 2 == 0) {
708 break;
709 }
710 result[roundIndex] = c + 1;
711 break;
712 }
713 }
714 }
715 if (exp_out) {
716 *exp_out = exp;
717 }
718 result[index++] = 'e';
719 if (exp < 0) {
720 result[index++] = '-';
721 exp = -exp;
722 } else {
723 result[index++] = '+';
724 }
725
726 if (exp >= 100) {
727 const int32_t c = exp % 10;
728 memcpy(result + index, DIGIT_TABLE + 2 * (exp / 10), 2);
729 result[index + 2] = (char) ('0' + c);
730 index += 3;
731 } else {
732 memcpy(result + index, DIGIT_TABLE + 2 * exp, 2);
733 index += 2;
734 }
735
736 return index;
737 }
738