1*627f7eb2Smrg /** Arbitrary precision arithmetic ('bignum') for processors with no asm support
2*627f7eb2Smrg *
3*627f7eb2Smrg * All functions operate on arrays of uints, stored LSB first.
4*627f7eb2Smrg * If there is a destination array, it will be the first parameter.
5*627f7eb2Smrg * Currently, all of these functions are subject to change, and are
6*627f7eb2Smrg * intended for internal use only.
7*627f7eb2Smrg * This module is intended only to assist development of high-speed routines
8*627f7eb2Smrg * on currently unsupported processors.
9*627f7eb2Smrg * The X86 asm version is about 30 times faster than the D version (DMD).
10*627f7eb2Smrg */
11*627f7eb2Smrg
12*627f7eb2Smrg /* Copyright Don Clugston 2008 - 2010.
13*627f7eb2Smrg * Distributed under the Boost Software License, Version 1.0.
14*627f7eb2Smrg * (See accompanying file LICENSE_1_0.txt or copy at
15*627f7eb2Smrg * http://www.boost.org/LICENSE_1_0.txt)
16*627f7eb2Smrg */
17*627f7eb2Smrg
18*627f7eb2Smrg module std.internal.math.biguintnoasm;
19*627f7eb2Smrg
20*627f7eb2Smrg nothrow:
21*627f7eb2Smrg @safe:
22*627f7eb2Smrg
23*627f7eb2Smrg public:
24*627f7eb2Smrg alias BigDigit = uint; // A Bignum is an array of BigDigits.
25*627f7eb2Smrg
26*627f7eb2Smrg // Limits for when to switch between multiplication algorithms.
27*627f7eb2Smrg enum int KARATSUBALIMIT = 10; // Minimum value for which Karatsuba is worthwhile.
28*627f7eb2Smrg enum int KARATSUBASQUARELIMIT = 12; // Minimum value for which square Karatsuba is worthwhile
29*627f7eb2Smrg
30*627f7eb2Smrg
31*627f7eb2Smrg /** Multi-byte addition or subtraction
32*627f7eb2Smrg * dest[] = src1[] + src2[] + carry (0 or 1).
33*627f7eb2Smrg * or dest[] = src1[] - src2[] - carry (0 or 1).
34*627f7eb2Smrg * Returns carry or borrow (0 or 1).
35*627f7eb2Smrg * Set op == '+' for addition, '-' for subtraction.
36*627f7eb2Smrg */
multibyteAddSub(char op)37*627f7eb2Smrg uint multibyteAddSub(char op)(uint[] dest, const(uint) [] src1,
38*627f7eb2Smrg const (uint) [] src2, uint carry) pure @nogc @safe
39*627f7eb2Smrg {
40*627f7eb2Smrg ulong c = carry;
41*627f7eb2Smrg for (size_t i = 0; i < src2.length; ++i)
42*627f7eb2Smrg {
43*627f7eb2Smrg static if (op=='+') c = c + src1[i] + src2[i];
44*627f7eb2Smrg else c = cast(ulong) src1[i] - src2[i] - c;
45*627f7eb2Smrg dest[i] = cast(uint) c;
46*627f7eb2Smrg c = (c > 0xFFFF_FFFF);
47*627f7eb2Smrg }
48*627f7eb2Smrg return cast(uint) c;
49*627f7eb2Smrg }
50*627f7eb2Smrg
51*627f7eb2Smrg @safe unittest
52*627f7eb2Smrg {
53*627f7eb2Smrg uint [] a = new uint[40];
54*627f7eb2Smrg uint [] b = new uint[40];
55*627f7eb2Smrg uint [] c = new uint[40];
56*627f7eb2Smrg for (size_t i = 0; i < a.length; ++i)
57*627f7eb2Smrg {
58*627f7eb2Smrg if (i&1) a[i]=cast(uint)(0x8000_0000 + i);
59*627f7eb2Smrg else a[i]=cast(uint) i;
60*627f7eb2Smrg b[i]= 0x8000_0003;
61*627f7eb2Smrg }
62*627f7eb2Smrg c[19]=0x3333_3333;
63*627f7eb2Smrg uint carry = multibyteAddSub!('+')(c[0 .. 18], b[0 .. 18], a[0 .. 18], 0);
64*627f7eb2Smrg assert(c[0]==0x8000_0003);
65*627f7eb2Smrg assert(c[1]==4);
66*627f7eb2Smrg assert(c[19]==0x3333_3333); // check for overrun
67*627f7eb2Smrg assert(carry == 1);
68*627f7eb2Smrg for (size_t i = 0; i < a.length; ++i)
69*627f7eb2Smrg {
70*627f7eb2Smrg a[i] = b[i] = c[i] = 0;
71*627f7eb2Smrg }
72*627f7eb2Smrg a[8]=0x048D159E;
73*627f7eb2Smrg b[8]=0x048D159E;
74*627f7eb2Smrg a[10]=0x1D950C84;
75*627f7eb2Smrg b[10]=0x1D950C84;
76*627f7eb2Smrg a[5] =0x44444444;
77*627f7eb2Smrg carry = multibyteAddSub!('-')(a[0 .. 12], a[0 .. 12], b[0 .. 12], 0);
78*627f7eb2Smrg assert(a[11] == 0);
79*627f7eb2Smrg for (size_t i = 0; i < 10; ++i)
80*627f7eb2Smrg if (i != 5)
81*627f7eb2Smrg assert(a[i] == 0);
82*627f7eb2Smrg
83*627f7eb2Smrg for (size_t q = 3; q < 36; ++q)
84*627f7eb2Smrg {
85*627f7eb2Smrg for (size_t i = 0; i< a.length; ++i)
86*627f7eb2Smrg {
87*627f7eb2Smrg a[i] = b[i] = c[i] = 0;
88*627f7eb2Smrg }
89*627f7eb2Smrg a[q-2]=0x040000;
90*627f7eb2Smrg b[q-2]=0x040000;
91*627f7eb2Smrg carry = multibyteAddSub!('-')(a[0 .. q], a[0 .. q], b[0 .. q], 0);
92*627f7eb2Smrg assert(a[q-2]==0);
93*627f7eb2Smrg }
94*627f7eb2Smrg }
95*627f7eb2Smrg
96*627f7eb2Smrg
97*627f7eb2Smrg
98*627f7eb2Smrg /** dest[] += carry, or dest[] -= carry.
99*627f7eb2Smrg * op must be '+' or '-'
100*627f7eb2Smrg * Returns final carry or borrow (0 or 1)
101*627f7eb2Smrg */
multibyteIncrementAssign(char op)102*627f7eb2Smrg uint multibyteIncrementAssign(char op)(uint[] dest, uint carry)
103*627f7eb2Smrg pure @nogc @safe
104*627f7eb2Smrg {
105*627f7eb2Smrg static if (op=='+')
106*627f7eb2Smrg {
107*627f7eb2Smrg ulong c = carry;
108*627f7eb2Smrg c += dest[0];
109*627f7eb2Smrg dest[0] = cast(uint) c;
110*627f7eb2Smrg if (c <= 0xFFFF_FFFF)
111*627f7eb2Smrg return 0;
112*627f7eb2Smrg
113*627f7eb2Smrg for (size_t i = 1; i < dest.length; ++i)
114*627f7eb2Smrg {
115*627f7eb2Smrg ++dest[i];
116*627f7eb2Smrg if (dest[i] != 0)
117*627f7eb2Smrg return 0;
118*627f7eb2Smrg }
119*627f7eb2Smrg return 1;
120*627f7eb2Smrg }
121*627f7eb2Smrg else
122*627f7eb2Smrg {
123*627f7eb2Smrg ulong c = carry;
124*627f7eb2Smrg c = dest[0] - c;
125*627f7eb2Smrg dest[0] = cast(uint) c;
126*627f7eb2Smrg if (c <= 0xFFFF_FFFF)
127*627f7eb2Smrg return 0;
128*627f7eb2Smrg for (size_t i = 1; i < dest.length; ++i)
129*627f7eb2Smrg {
130*627f7eb2Smrg --dest[i];
131*627f7eb2Smrg if (dest[i] != 0xFFFF_FFFF)
132*627f7eb2Smrg return 0;
133*627f7eb2Smrg }
134*627f7eb2Smrg return 1;
135*627f7eb2Smrg }
136*627f7eb2Smrg }
137*627f7eb2Smrg
138*627f7eb2Smrg /** dest[] = src[] << numbits
139*627f7eb2Smrg * numbits must be in the range 1 .. 31
140*627f7eb2Smrg */
multibyteShl(uint[]dest,const (uint)[]src,uint numbits)141*627f7eb2Smrg uint multibyteShl(uint [] dest, const(uint) [] src, uint numbits)
142*627f7eb2Smrg pure @nogc @safe
143*627f7eb2Smrg {
144*627f7eb2Smrg ulong c = 0;
145*627f7eb2Smrg for (size_t i = 0; i < dest.length; ++i)
146*627f7eb2Smrg {
147*627f7eb2Smrg c += (cast(ulong)(src[i]) << numbits);
148*627f7eb2Smrg dest[i] = cast(uint) c;
149*627f7eb2Smrg c >>>= 32;
150*627f7eb2Smrg }
151*627f7eb2Smrg return cast(uint) c;
152*627f7eb2Smrg }
153*627f7eb2Smrg
154*627f7eb2Smrg
155*627f7eb2Smrg /** dest[] = src[] >> numbits
156*627f7eb2Smrg * numbits must be in the range 1 .. 31
157*627f7eb2Smrg */
multibyteShr(uint[]dest,const (uint)[]src,uint numbits)158*627f7eb2Smrg void multibyteShr(uint [] dest, const(uint) [] src, uint numbits)
159*627f7eb2Smrg pure @nogc @safe
160*627f7eb2Smrg {
161*627f7eb2Smrg ulong c = 0;
162*627f7eb2Smrg for (ptrdiff_t i = dest.length; i != 0; --i)
163*627f7eb2Smrg {
164*627f7eb2Smrg c += (src[i-1] >>numbits) + (cast(ulong)(src[i-1]) << (64 - numbits));
165*627f7eb2Smrg dest[i-1] = cast(uint) c;
166*627f7eb2Smrg c >>>= 32;
167*627f7eb2Smrg }
168*627f7eb2Smrg }
169*627f7eb2Smrg
170*627f7eb2Smrg @safe unittest
171*627f7eb2Smrg {
172*627f7eb2Smrg
173*627f7eb2Smrg uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
174*627f7eb2Smrg multibyteShr(aa[0..$-2], aa, 4);
175*627f7eb2Smrg assert(aa[0] == 0x6122_2222 && aa[1] == 0xA455_5555 && aa[2] == 0x0899_9999);
176*627f7eb2Smrg assert(aa[3] == 0xBCCC_CCCD);
177*627f7eb2Smrg
178*627f7eb2Smrg aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
179*627f7eb2Smrg multibyteShr(aa[0..$-1], aa, 4);
180*627f7eb2Smrg assert(aa[0] == 0x6122_2222 && aa[1] == 0xA455_5555
181*627f7eb2Smrg && aa[2] == 0xD899_9999 && aa[3] == 0x0BCC_CCCC);
182*627f7eb2Smrg
183*627f7eb2Smrg aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD,
184*627f7eb2Smrg 0xEEEE_EEEE];
185*627f7eb2Smrg multibyteShl(aa[1 .. 4], aa[1..$], 4);
186*627f7eb2Smrg assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230
187*627f7eb2Smrg && aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
188*627f7eb2Smrg }
189*627f7eb2Smrg
190*627f7eb2Smrg /** dest[] = src[] * multiplier + carry.
191*627f7eb2Smrg * Returns carry.
192*627f7eb2Smrg */
multibyteMul(uint[]dest,const (uint)[]src,uint multiplier,uint carry)193*627f7eb2Smrg uint multibyteMul(uint[] dest, const(uint)[] src, uint multiplier, uint carry)
194*627f7eb2Smrg pure @nogc @safe
195*627f7eb2Smrg {
196*627f7eb2Smrg assert(dest.length == src.length);
197*627f7eb2Smrg ulong c = carry;
198*627f7eb2Smrg for (size_t i = 0; i < src.length; ++i)
199*627f7eb2Smrg {
200*627f7eb2Smrg c += cast(ulong)(src[i]) * multiplier;
201*627f7eb2Smrg dest[i] = cast(uint) c;
202*627f7eb2Smrg c>>=32;
203*627f7eb2Smrg }
204*627f7eb2Smrg return cast(uint) c;
205*627f7eb2Smrg }
206*627f7eb2Smrg
207*627f7eb2Smrg @safe unittest
208*627f7eb2Smrg {
209*627f7eb2Smrg uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A,
210*627f7eb2Smrg 0xBCCC_CCCD, 0xEEEE_EEEE];
211*627f7eb2Smrg multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0);
212*627f7eb2Smrg assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 && aa[2]==0x5555_5561
213*627f7eb2Smrg && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
214*627f7eb2Smrg }
215*627f7eb2Smrg
216*627f7eb2Smrg /**
217*627f7eb2Smrg * dest[] += src[] * multiplier + carry(0 .. FFFF_FFFF).
218*627f7eb2Smrg * Returns carry out of MSB (0 .. FFFF_FFFF).
219*627f7eb2Smrg */
multibyteMulAdd(char op)220*627f7eb2Smrg uint multibyteMulAdd(char op)(uint [] dest, const(uint)[] src,
221*627f7eb2Smrg uint multiplier, uint carry) pure @nogc @safe
222*627f7eb2Smrg {
223*627f7eb2Smrg assert(dest.length == src.length);
224*627f7eb2Smrg ulong c = carry;
225*627f7eb2Smrg for (size_t i = 0; i < src.length; ++i)
226*627f7eb2Smrg {
227*627f7eb2Smrg static if (op=='+')
228*627f7eb2Smrg {
229*627f7eb2Smrg c += cast(ulong)(multiplier) * src[i] + dest[i];
230*627f7eb2Smrg dest[i] = cast(uint) c;
231*627f7eb2Smrg c >>= 32;
232*627f7eb2Smrg }
233*627f7eb2Smrg else
234*627f7eb2Smrg {
235*627f7eb2Smrg c += cast(ulong) multiplier * src[i];
236*627f7eb2Smrg ulong t = cast(ulong) dest[i] - cast(uint) c;
237*627f7eb2Smrg dest[i] = cast(uint) t;
238*627f7eb2Smrg c = cast(uint)((c >> 32) - (t >> 32));
239*627f7eb2Smrg }
240*627f7eb2Smrg }
241*627f7eb2Smrg return cast(uint) c;
242*627f7eb2Smrg }
243*627f7eb2Smrg
244*627f7eb2Smrg @safe unittest
245*627f7eb2Smrg {
246*627f7eb2Smrg
247*627f7eb2Smrg uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A,
248*627f7eb2Smrg 0xBCCC_CCCD, 0xEEEE_EEEE];
249*627f7eb2Smrg uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0,
250*627f7eb2Smrg 0xC0C0_C0C0];
251*627f7eb2Smrg multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5);
252*627f7eb2Smrg assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0);
253*627f7eb2Smrg assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0 + 5
254*627f7eb2Smrg && bb[2] == 0x5555_5561 + 0x00C0_C0C0 + 1
255*627f7eb2Smrg && bb[3] == 0x9999_99A4 + 0xF0F0_F0F0 );
256*627f7eb2Smrg }
257*627f7eb2Smrg
258*627f7eb2Smrg
259*627f7eb2Smrg /**
260*627f7eb2Smrg Sets result = result[0 .. left.length] + left * right
261*627f7eb2Smrg
262*627f7eb2Smrg It is defined in this way to allow cache-efficient multiplication.
263*627f7eb2Smrg This function is equivalent to:
264*627f7eb2Smrg ----
265*627f7eb2Smrg for (size_t i = 0; i< right.length; ++i)
266*627f7eb2Smrg {
267*627f7eb2Smrg dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i],
268*627f7eb2Smrg left, right[i], 0);
269*627f7eb2Smrg }
270*627f7eb2Smrg ----
271*627f7eb2Smrg */
multibyteMultiplyAccumulate(uint[]dest,const (uint)[]left,const (uint)[]right)272*627f7eb2Smrg void multibyteMultiplyAccumulate(uint [] dest, const(uint)[] left, const(uint)
273*627f7eb2Smrg [] right) pure @nogc @safe
274*627f7eb2Smrg {
275*627f7eb2Smrg for (size_t i = 0; i < right.length; ++i)
276*627f7eb2Smrg {
277*627f7eb2Smrg dest[left.length + i] = multibyteMulAdd!('+')(dest[i .. left.length+i],
278*627f7eb2Smrg left, right[i], 0);
279*627f7eb2Smrg }
280*627f7eb2Smrg }
281*627f7eb2Smrg
282*627f7eb2Smrg /** dest[] /= divisor.
283*627f7eb2Smrg * overflow is the initial remainder, and must be in the range 0 .. divisor-1.
284*627f7eb2Smrg */
multibyteDivAssign(uint[]dest,uint divisor,uint overflow)285*627f7eb2Smrg uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow)
286*627f7eb2Smrg pure @nogc @safe
287*627f7eb2Smrg {
288*627f7eb2Smrg ulong c = cast(ulong) overflow;
289*627f7eb2Smrg for (ptrdiff_t i = dest.length-1; i >= 0; --i)
290*627f7eb2Smrg {
291*627f7eb2Smrg c = (c << 32) + cast(ulong)(dest[i]);
292*627f7eb2Smrg uint q = cast(uint)(c/divisor);
293*627f7eb2Smrg c -= divisor * q;
294*627f7eb2Smrg dest[i] = q;
295*627f7eb2Smrg }
296*627f7eb2Smrg return cast(uint) c;
297*627f7eb2Smrg }
298*627f7eb2Smrg
299*627f7eb2Smrg @safe unittest
300*627f7eb2Smrg {
301*627f7eb2Smrg uint [] aa = new uint[101];
302*627f7eb2Smrg for (uint i = 0; i < aa.length; ++i)
303*627f7eb2Smrg aa[i] = 0x8765_4321 * (i+3);
304*627f7eb2Smrg uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461);
305*627f7eb2Smrg uint r = multibyteDivAssign(aa, 0x8EFD_FCFB, overflow);
306*627f7eb2Smrg for (uint i=0; i<aa.length; ++i)
307*627f7eb2Smrg {
308*627f7eb2Smrg assert(aa[i] == 0x8765_4321 * (i+3));
309*627f7eb2Smrg }
310*627f7eb2Smrg assert(r == 0x33FF_7461);
311*627f7eb2Smrg
312*627f7eb2Smrg }
313*627f7eb2Smrg // Set dest[2*i .. 2*i+1]+=src[i]*src[i]
multibyteAddDiagonalSquares(uint[]dest,const (uint)[]src)314*627f7eb2Smrg void multibyteAddDiagonalSquares(uint[] dest, const(uint)[] src)
315*627f7eb2Smrg pure @nogc @safe
316*627f7eb2Smrg {
317*627f7eb2Smrg ulong c = 0;
318*627f7eb2Smrg for (size_t i = 0; i < src.length; ++i)
319*627f7eb2Smrg {
320*627f7eb2Smrg // At this point, c is 0 or 1, since FFFF*FFFF+FFFF_FFFF = 1_0000_0000.
321*627f7eb2Smrg c += cast(ulong)(src[i]) * src[i] + dest[2*i];
322*627f7eb2Smrg dest[2*i] = cast(uint) c;
323*627f7eb2Smrg c = (c>>=32) + dest[2*i+1];
324*627f7eb2Smrg dest[2*i+1] = cast(uint) c;
325*627f7eb2Smrg c >>= 32;
326*627f7eb2Smrg }
327*627f7eb2Smrg }
328*627f7eb2Smrg
329*627f7eb2Smrg // Does half a square multiply. (square = diagonal + 2*triangle)
multibyteTriangleAccumulate(uint[]dest,const (uint)[]x)330*627f7eb2Smrg void multibyteTriangleAccumulate(uint[] dest, const(uint)[] x)
331*627f7eb2Smrg pure @nogc @safe
332*627f7eb2Smrg {
333*627f7eb2Smrg // x[0]*x[1...$] + x[1]*x[2..$] + ... + x[$-2]x[$-1..$]
334*627f7eb2Smrg dest[x.length] = multibyteMul(dest[1 .. x.length], x[1..$], x[0], 0);
335*627f7eb2Smrg if (x.length < 4)
336*627f7eb2Smrg {
337*627f7eb2Smrg if (x.length == 3)
338*627f7eb2Smrg {
339*627f7eb2Smrg ulong c = cast(ulong)(x[$-1]) * x[$-2] + dest[2*x.length-3];
340*627f7eb2Smrg dest[2*x.length - 3] = cast(uint) c;
341*627f7eb2Smrg c >>= 32;
342*627f7eb2Smrg dest[2*x.length - 2] = cast(uint) c;
343*627f7eb2Smrg }
344*627f7eb2Smrg return;
345*627f7eb2Smrg }
346*627f7eb2Smrg for (size_t i = 2; i < x.length - 2; ++i)
347*627f7eb2Smrg {
348*627f7eb2Smrg dest[i-1+ x.length] = multibyteMulAdd!('+')(
349*627f7eb2Smrg dest[i+i-1 .. i+x.length-1], x[i..$], x[i-1], 0);
350*627f7eb2Smrg }
351*627f7eb2Smrg // Unroll the last two entries, to reduce loop overhead:
352*627f7eb2Smrg ulong c = cast(ulong)(x[$-3]) * x[$-2] + dest[2*x.length-5];
353*627f7eb2Smrg dest[2*x.length-5] = cast(uint) c;
354*627f7eb2Smrg c >>= 32;
355*627f7eb2Smrg c += cast(ulong)(x[$-3]) * x[$-1] + dest[2*x.length-4];
356*627f7eb2Smrg dest[2*x.length-4] = cast(uint) c;
357*627f7eb2Smrg c >>= 32;
358*627f7eb2Smrg c += cast(ulong)(x[$-1]) * x[$-2];
359*627f7eb2Smrg dest[2*x.length-3] = cast(uint) c;
360*627f7eb2Smrg c >>= 32;
361*627f7eb2Smrg dest[2*x.length-2] = cast(uint) c;
362*627f7eb2Smrg }
363*627f7eb2Smrg
multibyteSquare(BigDigit[]result,const (BigDigit)[]x)364*627f7eb2Smrg void multibyteSquare(BigDigit[] result, const(BigDigit) [] x) pure @nogc @safe
365*627f7eb2Smrg {
366*627f7eb2Smrg multibyteTriangleAccumulate(result, x);
367*627f7eb2Smrg result[$-1] = multibyteShl(result[1..$-1], result[1..$-1], 1); // mul by 2
368*627f7eb2Smrg result[0] = 0;
369*627f7eb2Smrg multibyteAddDiagonalSquares(result, x);
370*627f7eb2Smrg }
371