186d7f5d3SJohn Marino /* mpn_sqrtrem -- square root and remainder
286d7f5d3SJohn Marino
386d7f5d3SJohn Marino Contributed to the GNU project by Paul Zimmermann (most code) and
486d7f5d3SJohn Marino Torbjorn Granlund (mpn_sqrtrem1).
586d7f5d3SJohn Marino
686d7f5d3SJohn Marino THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
786d7f5d3SJohn Marino MUTABLE INTERFACE. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
886d7f5d3SJohn Marino INTERFACES. IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
986d7f5d3SJohn Marino DISAPPEAR IN A FUTURE GMP RELEASE.
1086d7f5d3SJohn Marino
1186d7f5d3SJohn Marino Copyright 1999, 2000, 2001, 2002, 2004, 2005, 2008, 2010 Free Software
1286d7f5d3SJohn Marino Foundation, Inc.
1386d7f5d3SJohn Marino
1486d7f5d3SJohn Marino This file is part of the GNU MP Library.
1586d7f5d3SJohn Marino
1686d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1786d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1886d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1986d7f5d3SJohn Marino option) any later version.
2086d7f5d3SJohn Marino
2186d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
2286d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2386d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
2486d7f5d3SJohn Marino License for more details.
2586d7f5d3SJohn Marino
2686d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2786d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
2886d7f5d3SJohn Marino
2986d7f5d3SJohn Marino
3086d7f5d3SJohn Marino /* See "Karatsuba Square Root", reference in gmp.texi. */
3186d7f5d3SJohn Marino
3286d7f5d3SJohn Marino
3386d7f5d3SJohn Marino #include <stdio.h>
3486d7f5d3SJohn Marino #include <stdlib.h>
3586d7f5d3SJohn Marino
3686d7f5d3SJohn Marino #include "gmp.h"
3786d7f5d3SJohn Marino #include "gmp-impl.h"
3886d7f5d3SJohn Marino #include "longlong.h"
3986d7f5d3SJohn Marino
4086d7f5d3SJohn Marino static const unsigned short invsqrttab[384] =
4186d7f5d3SJohn Marino {
4286d7f5d3SJohn Marino 0x1ff,0x1fd,0x1fb,0x1f9,0x1f7,0x1f5,0x1f3,0x1f2, /* sqrt(1/80)..sqrt(1/87) */
4386d7f5d3SJohn Marino 0x1f0,0x1ee,0x1ec,0x1ea,0x1e9,0x1e7,0x1e5,0x1e4, /* sqrt(1/88)..sqrt(1/8f) */
4486d7f5d3SJohn Marino 0x1e2,0x1e0,0x1df,0x1dd,0x1db,0x1da,0x1d8,0x1d7, /* sqrt(1/90)..sqrt(1/97) */
4586d7f5d3SJohn Marino 0x1d5,0x1d4,0x1d2,0x1d1,0x1cf,0x1ce,0x1cc,0x1cb, /* sqrt(1/98)..sqrt(1/9f) */
4686d7f5d3SJohn Marino 0x1c9,0x1c8,0x1c6,0x1c5,0x1c4,0x1c2,0x1c1,0x1c0, /* sqrt(1/a0)..sqrt(1/a7) */
4786d7f5d3SJohn Marino 0x1be,0x1bd,0x1bc,0x1ba,0x1b9,0x1b8,0x1b7,0x1b5, /* sqrt(1/a8)..sqrt(1/af) */
4886d7f5d3SJohn Marino 0x1b4,0x1b3,0x1b2,0x1b0,0x1af,0x1ae,0x1ad,0x1ac, /* sqrt(1/b0)..sqrt(1/b7) */
4986d7f5d3SJohn Marino 0x1aa,0x1a9,0x1a8,0x1a7,0x1a6,0x1a5,0x1a4,0x1a3, /* sqrt(1/b8)..sqrt(1/bf) */
5086d7f5d3SJohn Marino 0x1a2,0x1a0,0x19f,0x19e,0x19d,0x19c,0x19b,0x19a, /* sqrt(1/c0)..sqrt(1/c7) */
5186d7f5d3SJohn Marino 0x199,0x198,0x197,0x196,0x195,0x194,0x193,0x192, /* sqrt(1/c8)..sqrt(1/cf) */
5286d7f5d3SJohn Marino 0x191,0x190,0x18f,0x18e,0x18d,0x18c,0x18c,0x18b, /* sqrt(1/d0)..sqrt(1/d7) */
5386d7f5d3SJohn Marino 0x18a,0x189,0x188,0x187,0x186,0x185,0x184,0x183, /* sqrt(1/d8)..sqrt(1/df) */
5486d7f5d3SJohn Marino 0x183,0x182,0x181,0x180,0x17f,0x17e,0x17e,0x17d, /* sqrt(1/e0)..sqrt(1/e7) */
5586d7f5d3SJohn Marino 0x17c,0x17b,0x17a,0x179,0x179,0x178,0x177,0x176, /* sqrt(1/e8)..sqrt(1/ef) */
5686d7f5d3SJohn Marino 0x176,0x175,0x174,0x173,0x172,0x172,0x171,0x170, /* sqrt(1/f0)..sqrt(1/f7) */
5786d7f5d3SJohn Marino 0x16f,0x16f,0x16e,0x16d,0x16d,0x16c,0x16b,0x16a, /* sqrt(1/f8)..sqrt(1/ff) */
5886d7f5d3SJohn Marino 0x16a,0x169,0x168,0x168,0x167,0x166,0x166,0x165, /* sqrt(1/100)..sqrt(1/107) */
5986d7f5d3SJohn Marino 0x164,0x164,0x163,0x162,0x162,0x161,0x160,0x160, /* sqrt(1/108)..sqrt(1/10f) */
6086d7f5d3SJohn Marino 0x15f,0x15e,0x15e,0x15d,0x15c,0x15c,0x15b,0x15a, /* sqrt(1/110)..sqrt(1/117) */
6186d7f5d3SJohn Marino 0x15a,0x159,0x159,0x158,0x157,0x157,0x156,0x156, /* sqrt(1/118)..sqrt(1/11f) */
6286d7f5d3SJohn Marino 0x155,0x154,0x154,0x153,0x153,0x152,0x152,0x151, /* sqrt(1/120)..sqrt(1/127) */
6386d7f5d3SJohn Marino 0x150,0x150,0x14f,0x14f,0x14e,0x14e,0x14d,0x14d, /* sqrt(1/128)..sqrt(1/12f) */
6486d7f5d3SJohn Marino 0x14c,0x14b,0x14b,0x14a,0x14a,0x149,0x149,0x148, /* sqrt(1/130)..sqrt(1/137) */
6586d7f5d3SJohn Marino 0x148,0x147,0x147,0x146,0x146,0x145,0x145,0x144, /* sqrt(1/138)..sqrt(1/13f) */
6686d7f5d3SJohn Marino 0x144,0x143,0x143,0x142,0x142,0x141,0x141,0x140, /* sqrt(1/140)..sqrt(1/147) */
6786d7f5d3SJohn Marino 0x140,0x13f,0x13f,0x13e,0x13e,0x13d,0x13d,0x13c, /* sqrt(1/148)..sqrt(1/14f) */
6886d7f5d3SJohn Marino 0x13c,0x13b,0x13b,0x13a,0x13a,0x139,0x139,0x139, /* sqrt(1/150)..sqrt(1/157) */
6986d7f5d3SJohn Marino 0x138,0x138,0x137,0x137,0x136,0x136,0x135,0x135, /* sqrt(1/158)..sqrt(1/15f) */
7086d7f5d3SJohn Marino 0x135,0x134,0x134,0x133,0x133,0x132,0x132,0x132, /* sqrt(1/160)..sqrt(1/167) */
7186d7f5d3SJohn Marino 0x131,0x131,0x130,0x130,0x12f,0x12f,0x12f,0x12e, /* sqrt(1/168)..sqrt(1/16f) */
7286d7f5d3SJohn Marino 0x12e,0x12d,0x12d,0x12d,0x12c,0x12c,0x12b,0x12b, /* sqrt(1/170)..sqrt(1/177) */
7386d7f5d3SJohn Marino 0x12b,0x12a,0x12a,0x129,0x129,0x129,0x128,0x128, /* sqrt(1/178)..sqrt(1/17f) */
7486d7f5d3SJohn Marino 0x127,0x127,0x127,0x126,0x126,0x126,0x125,0x125, /* sqrt(1/180)..sqrt(1/187) */
7586d7f5d3SJohn Marino 0x124,0x124,0x124,0x123,0x123,0x123,0x122,0x122, /* sqrt(1/188)..sqrt(1/18f) */
7686d7f5d3SJohn Marino 0x121,0x121,0x121,0x120,0x120,0x120,0x11f,0x11f, /* sqrt(1/190)..sqrt(1/197) */
7786d7f5d3SJohn Marino 0x11f,0x11e,0x11e,0x11e,0x11d,0x11d,0x11d,0x11c, /* sqrt(1/198)..sqrt(1/19f) */
7886d7f5d3SJohn Marino 0x11c,0x11b,0x11b,0x11b,0x11a,0x11a,0x11a,0x119, /* sqrt(1/1a0)..sqrt(1/1a7) */
7986d7f5d3SJohn Marino 0x119,0x119,0x118,0x118,0x118,0x118,0x117,0x117, /* sqrt(1/1a8)..sqrt(1/1af) */
8086d7f5d3SJohn Marino 0x117,0x116,0x116,0x116,0x115,0x115,0x115,0x114, /* sqrt(1/1b0)..sqrt(1/1b7) */
8186d7f5d3SJohn Marino 0x114,0x114,0x113,0x113,0x113,0x112,0x112,0x112, /* sqrt(1/1b8)..sqrt(1/1bf) */
8286d7f5d3SJohn Marino 0x112,0x111,0x111,0x111,0x110,0x110,0x110,0x10f, /* sqrt(1/1c0)..sqrt(1/1c7) */
8386d7f5d3SJohn Marino 0x10f,0x10f,0x10f,0x10e,0x10e,0x10e,0x10d,0x10d, /* sqrt(1/1c8)..sqrt(1/1cf) */
8486d7f5d3SJohn Marino 0x10d,0x10c,0x10c,0x10c,0x10c,0x10b,0x10b,0x10b, /* sqrt(1/1d0)..sqrt(1/1d7) */
8586d7f5d3SJohn Marino 0x10a,0x10a,0x10a,0x10a,0x109,0x109,0x109,0x109, /* sqrt(1/1d8)..sqrt(1/1df) */
8686d7f5d3SJohn Marino 0x108,0x108,0x108,0x107,0x107,0x107,0x107,0x106, /* sqrt(1/1e0)..sqrt(1/1e7) */
8786d7f5d3SJohn Marino 0x106,0x106,0x106,0x105,0x105,0x105,0x104,0x104, /* sqrt(1/1e8)..sqrt(1/1ef) */
8886d7f5d3SJohn Marino 0x104,0x104,0x103,0x103,0x103,0x103,0x102,0x102, /* sqrt(1/1f0)..sqrt(1/1f7) */
8986d7f5d3SJohn Marino 0x102,0x102,0x101,0x101,0x101,0x101,0x100,0x100 /* sqrt(1/1f8)..sqrt(1/1ff) */
9086d7f5d3SJohn Marino };
9186d7f5d3SJohn Marino
9286d7f5d3SJohn Marino /* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2. */
9386d7f5d3SJohn Marino
9486d7f5d3SJohn Marino #if GMP_NUMB_BITS > 32
9586d7f5d3SJohn Marino #define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
9686d7f5d3SJohn Marino #else
9786d7f5d3SJohn Marino #define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */
9886d7f5d3SJohn Marino #endif
9986d7f5d3SJohn Marino
10086d7f5d3SJohn Marino static mp_limb_t
mpn_sqrtrem1(mp_ptr rp,mp_limb_t a0)10186d7f5d3SJohn Marino mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
10286d7f5d3SJohn Marino {
10386d7f5d3SJohn Marino #if GMP_NUMB_BITS > 32
10486d7f5d3SJohn Marino mp_limb_t a1;
10586d7f5d3SJohn Marino #endif
10686d7f5d3SJohn Marino mp_limb_t x0, t2, t, x2;
10786d7f5d3SJohn Marino unsigned abits;
10886d7f5d3SJohn Marino
10986d7f5d3SJohn Marino ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
11086d7f5d3SJohn Marino ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
11186d7f5d3SJohn Marino ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
11286d7f5d3SJohn Marino
11386d7f5d3SJohn Marino /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
11486d7f5d3SJohn Marino since we can do the former without division. As part of the last
11586d7f5d3SJohn Marino iteration convert from 1/sqrt(a) to sqrt(a). */
11686d7f5d3SJohn Marino
11786d7f5d3SJohn Marino abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */
11886d7f5d3SJohn Marino x0 = invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */
11986d7f5d3SJohn Marino
12086d7f5d3SJohn Marino /* x0 is now an 8 bits approximation of 1/sqrt(a0) */
12186d7f5d3SJohn Marino
12286d7f5d3SJohn Marino #if GMP_NUMB_BITS > 32
12386d7f5d3SJohn Marino a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
12486d7f5d3SJohn Marino t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
12586d7f5d3SJohn Marino x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));
12686d7f5d3SJohn Marino
12786d7f5d3SJohn Marino /* x0 is now an 16 bits approximation of 1/sqrt(a0) */
12886d7f5d3SJohn Marino
12986d7f5d3SJohn Marino t2 = x0 * (a0 >> (32-8));
13086d7f5d3SJohn Marino t = t2 >> 25;
13186d7f5d3SJohn Marino t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
13286d7f5d3SJohn Marino x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
13386d7f5d3SJohn Marino x0 >>= 32;
13486d7f5d3SJohn Marino #else
13586d7f5d3SJohn Marino t2 = x0 * (a0 >> (16-8));
13686d7f5d3SJohn Marino t = t2 >> 13;
13786d7f5d3SJohn Marino t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
13886d7f5d3SJohn Marino x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
13986d7f5d3SJohn Marino x0 >>= 16;
14086d7f5d3SJohn Marino #endif
14186d7f5d3SJohn Marino
14286d7f5d3SJohn Marino /* x0 is now a full limb approximation of sqrt(a0) */
14386d7f5d3SJohn Marino
14486d7f5d3SJohn Marino x2 = x0 * x0;
14586d7f5d3SJohn Marino if (x2 + 2*x0 <= a0 - 1)
14686d7f5d3SJohn Marino {
14786d7f5d3SJohn Marino x2 += 2*x0 + 1;
14886d7f5d3SJohn Marino x0++;
14986d7f5d3SJohn Marino }
15086d7f5d3SJohn Marino
15186d7f5d3SJohn Marino *rp = a0 - x2;
15286d7f5d3SJohn Marino return x0;
15386d7f5d3SJohn Marino }
15486d7f5d3SJohn Marino
15586d7f5d3SJohn Marino
15686d7f5d3SJohn Marino #define Prec (GMP_NUMB_BITS >> 1)
15786d7f5d3SJohn Marino
15886d7f5d3SJohn Marino /* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
15986d7f5d3SJohn Marino return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
16086d7f5d3SJohn Marino static mp_limb_t
mpn_sqrtrem2(mp_ptr sp,mp_ptr rp,mp_srcptr np)16186d7f5d3SJohn Marino mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
16286d7f5d3SJohn Marino {
16386d7f5d3SJohn Marino mp_limb_t qhl, q, u, np0, sp0, rp0, q2;
16486d7f5d3SJohn Marino int cc;
16586d7f5d3SJohn Marino
16686d7f5d3SJohn Marino ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
16786d7f5d3SJohn Marino
16886d7f5d3SJohn Marino np0 = np[0];
16986d7f5d3SJohn Marino sp0 = mpn_sqrtrem1 (rp, np[1]);
17086d7f5d3SJohn Marino qhl = 0;
17186d7f5d3SJohn Marino rp0 = rp[0];
17286d7f5d3SJohn Marino while (rp0 >= sp0)
17386d7f5d3SJohn Marino {
17486d7f5d3SJohn Marino qhl++;
17586d7f5d3SJohn Marino rp0 -= sp0;
17686d7f5d3SJohn Marino }
17786d7f5d3SJohn Marino /* now rp0 < sp0 < 2^Prec */
17886d7f5d3SJohn Marino rp0 = (rp0 << Prec) + (np0 >> Prec);
17986d7f5d3SJohn Marino u = 2 * sp0;
18086d7f5d3SJohn Marino q = rp0 / u;
18186d7f5d3SJohn Marino u = rp0 - q * u;
18286d7f5d3SJohn Marino q += (qhl & 1) << (Prec - 1);
18386d7f5d3SJohn Marino qhl >>= 1; /* if qhl=1, necessary q=0 as qhl*2^Prec + q <= 2^Prec */
18486d7f5d3SJohn Marino /* now we have (initial rp0)<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp0) + u */
18586d7f5d3SJohn Marino sp0 = ((sp0 + qhl) << Prec) + q;
18686d7f5d3SJohn Marino cc = u >> Prec;
18786d7f5d3SJohn Marino rp0 = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
18886d7f5d3SJohn Marino /* subtract q * q or qhl*2^(2*Prec) from rp */
18986d7f5d3SJohn Marino q2 = q * q;
19086d7f5d3SJohn Marino cc -= (rp0 < q2) + qhl;
19186d7f5d3SJohn Marino rp0 -= q2;
19286d7f5d3SJohn Marino /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
19386d7f5d3SJohn Marino if (cc < 0)
19486d7f5d3SJohn Marino {
19586d7f5d3SJohn Marino if (sp0 != 0)
19686d7f5d3SJohn Marino {
19786d7f5d3SJohn Marino rp0 += sp0;
19886d7f5d3SJohn Marino cc += rp0 < sp0;
19986d7f5d3SJohn Marino }
20086d7f5d3SJohn Marino else
20186d7f5d3SJohn Marino cc++;
20286d7f5d3SJohn Marino --sp0;
20386d7f5d3SJohn Marino rp0 += sp0;
20486d7f5d3SJohn Marino cc += rp0 < sp0;
20586d7f5d3SJohn Marino }
20686d7f5d3SJohn Marino
20786d7f5d3SJohn Marino rp[0] = rp0;
20886d7f5d3SJohn Marino sp[0] = sp0;
20986d7f5d3SJohn Marino return cc;
21086d7f5d3SJohn Marino }
21186d7f5d3SJohn Marino
21286d7f5d3SJohn Marino /* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
21386d7f5d3SJohn Marino and in {np, n} the low n limbs of the remainder, returns the high
21486d7f5d3SJohn Marino limb of the remainder (which is 0 or 1).
21586d7f5d3SJohn Marino Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
21686d7f5d3SJohn Marino where B=2^GMP_NUMB_BITS. */
21786d7f5d3SJohn Marino static mp_limb_t
mpn_dc_sqrtrem(mp_ptr sp,mp_ptr np,mp_size_t n)21886d7f5d3SJohn Marino mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
21986d7f5d3SJohn Marino {
22086d7f5d3SJohn Marino mp_limb_t q; /* carry out of {sp, n} */
22186d7f5d3SJohn Marino int c, b; /* carry out of remainder */
22286d7f5d3SJohn Marino mp_size_t l, h;
22386d7f5d3SJohn Marino
22486d7f5d3SJohn Marino ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
22586d7f5d3SJohn Marino
22686d7f5d3SJohn Marino if (n == 1)
22786d7f5d3SJohn Marino c = mpn_sqrtrem2 (sp, np, np);
22886d7f5d3SJohn Marino else
22986d7f5d3SJohn Marino {
23086d7f5d3SJohn Marino l = n / 2;
23186d7f5d3SJohn Marino h = n - l;
23286d7f5d3SJohn Marino q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);
23386d7f5d3SJohn Marino if (q != 0)
23486d7f5d3SJohn Marino mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
23586d7f5d3SJohn Marino q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
23686d7f5d3SJohn Marino c = sp[0] & 1;
23786d7f5d3SJohn Marino mpn_rshift (sp, sp, l, 1);
23886d7f5d3SJohn Marino sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
23986d7f5d3SJohn Marino q >>= 1;
24086d7f5d3SJohn Marino if (c != 0)
24186d7f5d3SJohn Marino c = mpn_add_n (np + l, np + l, sp + l, h);
24286d7f5d3SJohn Marino mpn_sqr (np + n, sp, l);
24386d7f5d3SJohn Marino b = q + mpn_sub_n (np, np, np + n, 2 * l);
24486d7f5d3SJohn Marino c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
24586d7f5d3SJohn Marino q = mpn_add_1 (sp + l, sp + l, h, q);
24686d7f5d3SJohn Marino
24786d7f5d3SJohn Marino if (c < 0)
24886d7f5d3SJohn Marino {
24986d7f5d3SJohn Marino c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
25086d7f5d3SJohn Marino c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
25186d7f5d3SJohn Marino q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
25286d7f5d3SJohn Marino }
25386d7f5d3SJohn Marino }
25486d7f5d3SJohn Marino
25586d7f5d3SJohn Marino return c;
25686d7f5d3SJohn Marino }
25786d7f5d3SJohn Marino
25886d7f5d3SJohn Marino
25986d7f5d3SJohn Marino mp_size_t
mpn_sqrtrem(mp_ptr sp,mp_ptr rp,mp_srcptr np,mp_size_t nn)26086d7f5d3SJohn Marino mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
26186d7f5d3SJohn Marino {
26286d7f5d3SJohn Marino mp_limb_t *tp, s0[1], cc, high, rl;
26386d7f5d3SJohn Marino int c;
26486d7f5d3SJohn Marino mp_size_t rn, tn;
26586d7f5d3SJohn Marino TMP_DECL;
26686d7f5d3SJohn Marino
26786d7f5d3SJohn Marino ASSERT (nn >= 0);
26886d7f5d3SJohn Marino ASSERT_MPN (np, nn);
26986d7f5d3SJohn Marino
27086d7f5d3SJohn Marino /* If OP is zero, both results are zero. */
27186d7f5d3SJohn Marino if (nn == 0)
27286d7f5d3SJohn Marino return 0;
27386d7f5d3SJohn Marino
27486d7f5d3SJohn Marino ASSERT (np[nn - 1] != 0);
27586d7f5d3SJohn Marino ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
27686d7f5d3SJohn Marino ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
27786d7f5d3SJohn Marino ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
27886d7f5d3SJohn Marino
27986d7f5d3SJohn Marino high = np[nn - 1];
28086d7f5d3SJohn Marino if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
28186d7f5d3SJohn Marino {
28286d7f5d3SJohn Marino mp_limb_t r;
28386d7f5d3SJohn Marino sp[0] = mpn_sqrtrem1 (&r, high);
28486d7f5d3SJohn Marino if (rp != NULL)
28586d7f5d3SJohn Marino rp[0] = r;
28686d7f5d3SJohn Marino return r != 0;
28786d7f5d3SJohn Marino }
28886d7f5d3SJohn Marino count_leading_zeros (c, high);
28986d7f5d3SJohn Marino c -= GMP_NAIL_BITS;
29086d7f5d3SJohn Marino
29186d7f5d3SJohn Marino c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
29286d7f5d3SJohn Marino tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
29386d7f5d3SJohn Marino
29486d7f5d3SJohn Marino TMP_MARK;
29586d7f5d3SJohn Marino if (nn % 2 != 0 || c > 0)
29686d7f5d3SJohn Marino {
29786d7f5d3SJohn Marino tp = TMP_ALLOC_LIMBS (2 * tn);
29886d7f5d3SJohn Marino tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */
29986d7f5d3SJohn Marino if (c != 0)
30086d7f5d3SJohn Marino mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
30186d7f5d3SJohn Marino else
30286d7f5d3SJohn Marino MPN_COPY (tp + 2 * tn - nn, np, nn);
30386d7f5d3SJohn Marino rl = mpn_dc_sqrtrem (sp, tp, tn);
30486d7f5d3SJohn Marino /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
30586d7f5d3SJohn Marino thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
30686d7f5d3SJohn Marino c += (nn % 2) * GMP_NUMB_BITS / 2; /* c now represents k */
30786d7f5d3SJohn Marino s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1); /* S mod 2^k */
30886d7f5d3SJohn Marino rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */
30986d7f5d3SJohn Marino cc = mpn_submul_1 (tp, s0, 1, s0[0]);
31086d7f5d3SJohn Marino rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
31186d7f5d3SJohn Marino mpn_rshift (sp, sp, tn, c);
31286d7f5d3SJohn Marino tp[tn] = rl;
31386d7f5d3SJohn Marino if (rp == NULL)
31486d7f5d3SJohn Marino rp = tp;
31586d7f5d3SJohn Marino c = c << 1;
31686d7f5d3SJohn Marino if (c < GMP_NUMB_BITS)
31786d7f5d3SJohn Marino tn++;
31886d7f5d3SJohn Marino else
31986d7f5d3SJohn Marino {
32086d7f5d3SJohn Marino tp++;
32186d7f5d3SJohn Marino c -= GMP_NUMB_BITS;
32286d7f5d3SJohn Marino }
32386d7f5d3SJohn Marino if (c != 0)
32486d7f5d3SJohn Marino mpn_rshift (rp, tp, tn, c);
32586d7f5d3SJohn Marino else
32686d7f5d3SJohn Marino MPN_COPY_INCR (rp, tp, tn);
32786d7f5d3SJohn Marino rn = tn;
32886d7f5d3SJohn Marino }
32986d7f5d3SJohn Marino else
33086d7f5d3SJohn Marino {
33186d7f5d3SJohn Marino if (rp == NULL)
33286d7f5d3SJohn Marino rp = TMP_ALLOC_LIMBS (nn);
33386d7f5d3SJohn Marino if (rp != np)
33486d7f5d3SJohn Marino MPN_COPY (rp, np, nn);
33586d7f5d3SJohn Marino rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn));
33686d7f5d3SJohn Marino }
33786d7f5d3SJohn Marino
33886d7f5d3SJohn Marino MPN_NORMALIZE (rp, rn);
33986d7f5d3SJohn Marino
34086d7f5d3SJohn Marino TMP_FREE;
34186d7f5d3SJohn Marino return rn;
34286d7f5d3SJohn Marino }
343