xref: /dflybsd-src/contrib/gmp/mpn/generic/sqrtrem.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
186d7f5d3SJohn Marino /* mpn_sqrtrem -- square root and remainder
286d7f5d3SJohn Marino 
386d7f5d3SJohn Marino    Contributed to the GNU project by Paul Zimmermann (most code) and
486d7f5d3SJohn Marino    Torbjorn Granlund (mpn_sqrtrem1).
586d7f5d3SJohn Marino 
686d7f5d3SJohn Marino    THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
786d7f5d3SJohn Marino    MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
886d7f5d3SJohn Marino    INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
986d7f5d3SJohn Marino    DISAPPEAR IN A FUTURE GMP RELEASE.
1086d7f5d3SJohn Marino 
1186d7f5d3SJohn Marino Copyright 1999, 2000, 2001, 2002, 2004, 2005, 2008, 2010 Free Software
1286d7f5d3SJohn Marino Foundation, Inc.
1386d7f5d3SJohn Marino 
1486d7f5d3SJohn Marino This file is part of the GNU MP Library.
1586d7f5d3SJohn Marino 
1686d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1786d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1886d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1986d7f5d3SJohn Marino option) any later version.
2086d7f5d3SJohn Marino 
2186d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
2286d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2386d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
2486d7f5d3SJohn Marino License for more details.
2586d7f5d3SJohn Marino 
2686d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2786d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2886d7f5d3SJohn Marino 
2986d7f5d3SJohn Marino 
3086d7f5d3SJohn Marino /* See "Karatsuba Square Root", reference in gmp.texi.  */
3186d7f5d3SJohn Marino 
3286d7f5d3SJohn Marino 
3386d7f5d3SJohn Marino #include <stdio.h>
3486d7f5d3SJohn Marino #include <stdlib.h>
3586d7f5d3SJohn Marino 
3686d7f5d3SJohn Marino #include "gmp.h"
3786d7f5d3SJohn Marino #include "gmp-impl.h"
3886d7f5d3SJohn Marino #include "longlong.h"
3986d7f5d3SJohn Marino 
4086d7f5d3SJohn Marino static const unsigned short invsqrttab[384] =
4186d7f5d3SJohn Marino {
4286d7f5d3SJohn Marino   0x1ff,0x1fd,0x1fb,0x1f9,0x1f7,0x1f5,0x1f3,0x1f2, /* sqrt(1/80)..sqrt(1/87) */
4386d7f5d3SJohn Marino   0x1f0,0x1ee,0x1ec,0x1ea,0x1e9,0x1e7,0x1e5,0x1e4, /* sqrt(1/88)..sqrt(1/8f) */
4486d7f5d3SJohn Marino   0x1e2,0x1e0,0x1df,0x1dd,0x1db,0x1da,0x1d8,0x1d7, /* sqrt(1/90)..sqrt(1/97) */
4586d7f5d3SJohn Marino   0x1d5,0x1d4,0x1d2,0x1d1,0x1cf,0x1ce,0x1cc,0x1cb, /* sqrt(1/98)..sqrt(1/9f) */
4686d7f5d3SJohn Marino   0x1c9,0x1c8,0x1c6,0x1c5,0x1c4,0x1c2,0x1c1,0x1c0, /* sqrt(1/a0)..sqrt(1/a7) */
4786d7f5d3SJohn Marino   0x1be,0x1bd,0x1bc,0x1ba,0x1b9,0x1b8,0x1b7,0x1b5, /* sqrt(1/a8)..sqrt(1/af) */
4886d7f5d3SJohn Marino   0x1b4,0x1b3,0x1b2,0x1b0,0x1af,0x1ae,0x1ad,0x1ac, /* sqrt(1/b0)..sqrt(1/b7) */
4986d7f5d3SJohn Marino   0x1aa,0x1a9,0x1a8,0x1a7,0x1a6,0x1a5,0x1a4,0x1a3, /* sqrt(1/b8)..sqrt(1/bf) */
5086d7f5d3SJohn Marino   0x1a2,0x1a0,0x19f,0x19e,0x19d,0x19c,0x19b,0x19a, /* sqrt(1/c0)..sqrt(1/c7) */
5186d7f5d3SJohn Marino   0x199,0x198,0x197,0x196,0x195,0x194,0x193,0x192, /* sqrt(1/c8)..sqrt(1/cf) */
5286d7f5d3SJohn Marino   0x191,0x190,0x18f,0x18e,0x18d,0x18c,0x18c,0x18b, /* sqrt(1/d0)..sqrt(1/d7) */
5386d7f5d3SJohn Marino   0x18a,0x189,0x188,0x187,0x186,0x185,0x184,0x183, /* sqrt(1/d8)..sqrt(1/df) */
5486d7f5d3SJohn Marino   0x183,0x182,0x181,0x180,0x17f,0x17e,0x17e,0x17d, /* sqrt(1/e0)..sqrt(1/e7) */
5586d7f5d3SJohn Marino   0x17c,0x17b,0x17a,0x179,0x179,0x178,0x177,0x176, /* sqrt(1/e8)..sqrt(1/ef) */
5686d7f5d3SJohn Marino   0x176,0x175,0x174,0x173,0x172,0x172,0x171,0x170, /* sqrt(1/f0)..sqrt(1/f7) */
5786d7f5d3SJohn Marino   0x16f,0x16f,0x16e,0x16d,0x16d,0x16c,0x16b,0x16a, /* sqrt(1/f8)..sqrt(1/ff) */
5886d7f5d3SJohn Marino   0x16a,0x169,0x168,0x168,0x167,0x166,0x166,0x165, /* sqrt(1/100)..sqrt(1/107) */
5986d7f5d3SJohn Marino   0x164,0x164,0x163,0x162,0x162,0x161,0x160,0x160, /* sqrt(1/108)..sqrt(1/10f) */
6086d7f5d3SJohn Marino   0x15f,0x15e,0x15e,0x15d,0x15c,0x15c,0x15b,0x15a, /* sqrt(1/110)..sqrt(1/117) */
6186d7f5d3SJohn Marino   0x15a,0x159,0x159,0x158,0x157,0x157,0x156,0x156, /* sqrt(1/118)..sqrt(1/11f) */
6286d7f5d3SJohn Marino   0x155,0x154,0x154,0x153,0x153,0x152,0x152,0x151, /* sqrt(1/120)..sqrt(1/127) */
6386d7f5d3SJohn Marino   0x150,0x150,0x14f,0x14f,0x14e,0x14e,0x14d,0x14d, /* sqrt(1/128)..sqrt(1/12f) */
6486d7f5d3SJohn Marino   0x14c,0x14b,0x14b,0x14a,0x14a,0x149,0x149,0x148, /* sqrt(1/130)..sqrt(1/137) */
6586d7f5d3SJohn Marino   0x148,0x147,0x147,0x146,0x146,0x145,0x145,0x144, /* sqrt(1/138)..sqrt(1/13f) */
6686d7f5d3SJohn Marino   0x144,0x143,0x143,0x142,0x142,0x141,0x141,0x140, /* sqrt(1/140)..sqrt(1/147) */
6786d7f5d3SJohn Marino   0x140,0x13f,0x13f,0x13e,0x13e,0x13d,0x13d,0x13c, /* sqrt(1/148)..sqrt(1/14f) */
6886d7f5d3SJohn Marino   0x13c,0x13b,0x13b,0x13a,0x13a,0x139,0x139,0x139, /* sqrt(1/150)..sqrt(1/157) */
6986d7f5d3SJohn Marino   0x138,0x138,0x137,0x137,0x136,0x136,0x135,0x135, /* sqrt(1/158)..sqrt(1/15f) */
7086d7f5d3SJohn Marino   0x135,0x134,0x134,0x133,0x133,0x132,0x132,0x132, /* sqrt(1/160)..sqrt(1/167) */
7186d7f5d3SJohn Marino   0x131,0x131,0x130,0x130,0x12f,0x12f,0x12f,0x12e, /* sqrt(1/168)..sqrt(1/16f) */
7286d7f5d3SJohn Marino   0x12e,0x12d,0x12d,0x12d,0x12c,0x12c,0x12b,0x12b, /* sqrt(1/170)..sqrt(1/177) */
7386d7f5d3SJohn Marino   0x12b,0x12a,0x12a,0x129,0x129,0x129,0x128,0x128, /* sqrt(1/178)..sqrt(1/17f) */
7486d7f5d3SJohn Marino   0x127,0x127,0x127,0x126,0x126,0x126,0x125,0x125, /* sqrt(1/180)..sqrt(1/187) */
7586d7f5d3SJohn Marino   0x124,0x124,0x124,0x123,0x123,0x123,0x122,0x122, /* sqrt(1/188)..sqrt(1/18f) */
7686d7f5d3SJohn Marino   0x121,0x121,0x121,0x120,0x120,0x120,0x11f,0x11f, /* sqrt(1/190)..sqrt(1/197) */
7786d7f5d3SJohn Marino   0x11f,0x11e,0x11e,0x11e,0x11d,0x11d,0x11d,0x11c, /* sqrt(1/198)..sqrt(1/19f) */
7886d7f5d3SJohn Marino   0x11c,0x11b,0x11b,0x11b,0x11a,0x11a,0x11a,0x119, /* sqrt(1/1a0)..sqrt(1/1a7) */
7986d7f5d3SJohn Marino   0x119,0x119,0x118,0x118,0x118,0x118,0x117,0x117, /* sqrt(1/1a8)..sqrt(1/1af) */
8086d7f5d3SJohn Marino   0x117,0x116,0x116,0x116,0x115,0x115,0x115,0x114, /* sqrt(1/1b0)..sqrt(1/1b7) */
8186d7f5d3SJohn Marino   0x114,0x114,0x113,0x113,0x113,0x112,0x112,0x112, /* sqrt(1/1b8)..sqrt(1/1bf) */
8286d7f5d3SJohn Marino   0x112,0x111,0x111,0x111,0x110,0x110,0x110,0x10f, /* sqrt(1/1c0)..sqrt(1/1c7) */
8386d7f5d3SJohn Marino   0x10f,0x10f,0x10f,0x10e,0x10e,0x10e,0x10d,0x10d, /* sqrt(1/1c8)..sqrt(1/1cf) */
8486d7f5d3SJohn Marino   0x10d,0x10c,0x10c,0x10c,0x10c,0x10b,0x10b,0x10b, /* sqrt(1/1d0)..sqrt(1/1d7) */
8586d7f5d3SJohn Marino   0x10a,0x10a,0x10a,0x10a,0x109,0x109,0x109,0x109, /* sqrt(1/1d8)..sqrt(1/1df) */
8686d7f5d3SJohn Marino   0x108,0x108,0x108,0x107,0x107,0x107,0x107,0x106, /* sqrt(1/1e0)..sqrt(1/1e7) */
8786d7f5d3SJohn Marino   0x106,0x106,0x106,0x105,0x105,0x105,0x104,0x104, /* sqrt(1/1e8)..sqrt(1/1ef) */
8886d7f5d3SJohn Marino   0x104,0x104,0x103,0x103,0x103,0x103,0x102,0x102, /* sqrt(1/1f0)..sqrt(1/1f7) */
8986d7f5d3SJohn Marino   0x102,0x102,0x101,0x101,0x101,0x101,0x100,0x100  /* sqrt(1/1f8)..sqrt(1/1ff) */
9086d7f5d3SJohn Marino };
9186d7f5d3SJohn Marino 
9286d7f5d3SJohn Marino /* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */
9386d7f5d3SJohn Marino 
9486d7f5d3SJohn Marino #if GMP_NUMB_BITS > 32
9586d7f5d3SJohn Marino #define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
9686d7f5d3SJohn Marino #else
9786d7f5d3SJohn Marino #define MAGIC CNST_LIMB(0x100000)		/* 0xfee6f < MAGIC < 0x29cbc8 */
9886d7f5d3SJohn Marino #endif
9986d7f5d3SJohn Marino 
10086d7f5d3SJohn Marino static mp_limb_t
mpn_sqrtrem1(mp_ptr rp,mp_limb_t a0)10186d7f5d3SJohn Marino mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
10286d7f5d3SJohn Marino {
10386d7f5d3SJohn Marino #if GMP_NUMB_BITS > 32
10486d7f5d3SJohn Marino   mp_limb_t a1;
10586d7f5d3SJohn Marino #endif
10686d7f5d3SJohn Marino   mp_limb_t x0, t2, t, x2;
10786d7f5d3SJohn Marino   unsigned abits;
10886d7f5d3SJohn Marino 
10986d7f5d3SJohn Marino   ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
11086d7f5d3SJohn Marino   ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
11186d7f5d3SJohn Marino   ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
11286d7f5d3SJohn Marino 
11386d7f5d3SJohn Marino   /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
11486d7f5d3SJohn Marino      since we can do the former without division.  As part of the last
11586d7f5d3SJohn Marino      iteration convert from 1/sqrt(a) to sqrt(a).  */
11686d7f5d3SJohn Marino 
11786d7f5d3SJohn Marino   abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
11886d7f5d3SJohn Marino   x0 = invsqrttab[abits - 0x80];		/* initial 1/sqrt(a) */
11986d7f5d3SJohn Marino 
12086d7f5d3SJohn Marino   /* x0 is now an 8 bits approximation of 1/sqrt(a0) */
12186d7f5d3SJohn Marino 
12286d7f5d3SJohn Marino #if GMP_NUMB_BITS > 32
12386d7f5d3SJohn Marino   a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
12486d7f5d3SJohn Marino   t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000  - a1 * x0 * x0) >> 16;
12586d7f5d3SJohn Marino   x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));
12686d7f5d3SJohn Marino 
12786d7f5d3SJohn Marino   /* x0 is now an 16 bits approximation of 1/sqrt(a0) */
12886d7f5d3SJohn Marino 
12986d7f5d3SJohn Marino   t2 = x0 * (a0 >> (32-8));
13086d7f5d3SJohn Marino   t = t2 >> 25;
13186d7f5d3SJohn Marino   t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
13286d7f5d3SJohn Marino   x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
13386d7f5d3SJohn Marino   x0 >>= 32;
13486d7f5d3SJohn Marino #else
13586d7f5d3SJohn Marino   t2 = x0 * (a0 >> (16-8));
13686d7f5d3SJohn Marino   t = t2 >> 13;
13786d7f5d3SJohn Marino   t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
13886d7f5d3SJohn Marino   x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
13986d7f5d3SJohn Marino   x0 >>= 16;
14086d7f5d3SJohn Marino #endif
14186d7f5d3SJohn Marino 
14286d7f5d3SJohn Marino   /* x0 is now a full limb approximation of sqrt(a0) */
14386d7f5d3SJohn Marino 
14486d7f5d3SJohn Marino   x2 = x0 * x0;
14586d7f5d3SJohn Marino   if (x2 + 2*x0 <= a0 - 1)
14686d7f5d3SJohn Marino     {
14786d7f5d3SJohn Marino       x2 += 2*x0 + 1;
14886d7f5d3SJohn Marino       x0++;
14986d7f5d3SJohn Marino     }
15086d7f5d3SJohn Marino 
15186d7f5d3SJohn Marino   *rp = a0 - x2;
15286d7f5d3SJohn Marino   return x0;
15386d7f5d3SJohn Marino }
15486d7f5d3SJohn Marino 
15586d7f5d3SJohn Marino 
15686d7f5d3SJohn Marino #define Prec (GMP_NUMB_BITS >> 1)
15786d7f5d3SJohn Marino 
15886d7f5d3SJohn Marino /* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
15986d7f5d3SJohn Marino    return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
16086d7f5d3SJohn Marino static mp_limb_t
mpn_sqrtrem2(mp_ptr sp,mp_ptr rp,mp_srcptr np)16186d7f5d3SJohn Marino mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
16286d7f5d3SJohn Marino {
16386d7f5d3SJohn Marino   mp_limb_t qhl, q, u, np0, sp0, rp0, q2;
16486d7f5d3SJohn Marino   int cc;
16586d7f5d3SJohn Marino 
16686d7f5d3SJohn Marino   ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
16786d7f5d3SJohn Marino 
16886d7f5d3SJohn Marino   np0 = np[0];
16986d7f5d3SJohn Marino   sp0 = mpn_sqrtrem1 (rp, np[1]);
17086d7f5d3SJohn Marino   qhl = 0;
17186d7f5d3SJohn Marino   rp0 = rp[0];
17286d7f5d3SJohn Marino   while (rp0 >= sp0)
17386d7f5d3SJohn Marino     {
17486d7f5d3SJohn Marino       qhl++;
17586d7f5d3SJohn Marino       rp0 -= sp0;
17686d7f5d3SJohn Marino     }
17786d7f5d3SJohn Marino   /* now rp0 < sp0 < 2^Prec */
17886d7f5d3SJohn Marino   rp0 = (rp0 << Prec) + (np0 >> Prec);
17986d7f5d3SJohn Marino   u = 2 * sp0;
18086d7f5d3SJohn Marino   q = rp0 / u;
18186d7f5d3SJohn Marino   u = rp0 - q * u;
18286d7f5d3SJohn Marino   q += (qhl & 1) << (Prec - 1);
18386d7f5d3SJohn Marino   qhl >>= 1; /* if qhl=1, necessary q=0 as qhl*2^Prec + q <= 2^Prec */
18486d7f5d3SJohn Marino   /* now we have (initial rp0)<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp0) + u */
18586d7f5d3SJohn Marino   sp0 = ((sp0 + qhl) << Prec) + q;
18686d7f5d3SJohn Marino   cc = u >> Prec;
18786d7f5d3SJohn Marino   rp0 = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
18886d7f5d3SJohn Marino   /* subtract q * q or qhl*2^(2*Prec) from rp */
18986d7f5d3SJohn Marino   q2 = q * q;
19086d7f5d3SJohn Marino   cc -= (rp0 < q2) + qhl;
19186d7f5d3SJohn Marino   rp0 -= q2;
19286d7f5d3SJohn Marino   /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
19386d7f5d3SJohn Marino   if (cc < 0)
19486d7f5d3SJohn Marino     {
19586d7f5d3SJohn Marino       if (sp0 != 0)
19686d7f5d3SJohn Marino 	{
19786d7f5d3SJohn Marino 	  rp0 += sp0;
19886d7f5d3SJohn Marino 	  cc += rp0 < sp0;
19986d7f5d3SJohn Marino 	}
20086d7f5d3SJohn Marino       else
20186d7f5d3SJohn Marino 	cc++;
20286d7f5d3SJohn Marino       --sp0;
20386d7f5d3SJohn Marino       rp0 += sp0;
20486d7f5d3SJohn Marino       cc += rp0 < sp0;
20586d7f5d3SJohn Marino     }
20686d7f5d3SJohn Marino 
20786d7f5d3SJohn Marino   rp[0] = rp0;
20886d7f5d3SJohn Marino   sp[0] = sp0;
20986d7f5d3SJohn Marino   return cc;
21086d7f5d3SJohn Marino }
21186d7f5d3SJohn Marino 
21286d7f5d3SJohn Marino /* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
21386d7f5d3SJohn Marino    and in {np, n} the low n limbs of the remainder, returns the high
21486d7f5d3SJohn Marino    limb of the remainder (which is 0 or 1).
21586d7f5d3SJohn Marino    Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
21686d7f5d3SJohn Marino    where B=2^GMP_NUMB_BITS.  */
21786d7f5d3SJohn Marino static mp_limb_t
mpn_dc_sqrtrem(mp_ptr sp,mp_ptr np,mp_size_t n)21886d7f5d3SJohn Marino mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
21986d7f5d3SJohn Marino {
22086d7f5d3SJohn Marino   mp_limb_t q;			/* carry out of {sp, n} */
22186d7f5d3SJohn Marino   int c, b;			/* carry out of remainder */
22286d7f5d3SJohn Marino   mp_size_t l, h;
22386d7f5d3SJohn Marino 
22486d7f5d3SJohn Marino   ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
22586d7f5d3SJohn Marino 
22686d7f5d3SJohn Marino   if (n == 1)
22786d7f5d3SJohn Marino     c = mpn_sqrtrem2 (sp, np, np);
22886d7f5d3SJohn Marino   else
22986d7f5d3SJohn Marino     {
23086d7f5d3SJohn Marino       l = n / 2;
23186d7f5d3SJohn Marino       h = n - l;
23286d7f5d3SJohn Marino       q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);
23386d7f5d3SJohn Marino       if (q != 0)
23486d7f5d3SJohn Marino 	mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
23586d7f5d3SJohn Marino       q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
23686d7f5d3SJohn Marino       c = sp[0] & 1;
23786d7f5d3SJohn Marino       mpn_rshift (sp, sp, l, 1);
23886d7f5d3SJohn Marino       sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
23986d7f5d3SJohn Marino       q >>= 1;
24086d7f5d3SJohn Marino       if (c != 0)
24186d7f5d3SJohn Marino 	c = mpn_add_n (np + l, np + l, sp + l, h);
24286d7f5d3SJohn Marino       mpn_sqr (np + n, sp, l);
24386d7f5d3SJohn Marino       b = q + mpn_sub_n (np, np, np + n, 2 * l);
24486d7f5d3SJohn Marino       c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
24586d7f5d3SJohn Marino       q = mpn_add_1 (sp + l, sp + l, h, q);
24686d7f5d3SJohn Marino 
24786d7f5d3SJohn Marino       if (c < 0)
24886d7f5d3SJohn Marino 	{
24986d7f5d3SJohn Marino 	  c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
25086d7f5d3SJohn Marino 	  c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
25186d7f5d3SJohn Marino 	  q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
25286d7f5d3SJohn Marino 	}
25386d7f5d3SJohn Marino     }
25486d7f5d3SJohn Marino 
25586d7f5d3SJohn Marino   return c;
25686d7f5d3SJohn Marino }
25786d7f5d3SJohn Marino 
25886d7f5d3SJohn Marino 
25986d7f5d3SJohn Marino mp_size_t
mpn_sqrtrem(mp_ptr sp,mp_ptr rp,mp_srcptr np,mp_size_t nn)26086d7f5d3SJohn Marino mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
26186d7f5d3SJohn Marino {
26286d7f5d3SJohn Marino   mp_limb_t *tp, s0[1], cc, high, rl;
26386d7f5d3SJohn Marino   int c;
26486d7f5d3SJohn Marino   mp_size_t rn, tn;
26586d7f5d3SJohn Marino   TMP_DECL;
26686d7f5d3SJohn Marino 
26786d7f5d3SJohn Marino   ASSERT (nn >= 0);
26886d7f5d3SJohn Marino   ASSERT_MPN (np, nn);
26986d7f5d3SJohn Marino 
27086d7f5d3SJohn Marino   /* If OP is zero, both results are zero.  */
27186d7f5d3SJohn Marino   if (nn == 0)
27286d7f5d3SJohn Marino     return 0;
27386d7f5d3SJohn Marino 
27486d7f5d3SJohn Marino   ASSERT (np[nn - 1] != 0);
27586d7f5d3SJohn Marino   ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
27686d7f5d3SJohn Marino   ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
27786d7f5d3SJohn Marino   ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
27886d7f5d3SJohn Marino 
27986d7f5d3SJohn Marino   high = np[nn - 1];
28086d7f5d3SJohn Marino   if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
28186d7f5d3SJohn Marino     {
28286d7f5d3SJohn Marino       mp_limb_t r;
28386d7f5d3SJohn Marino       sp[0] = mpn_sqrtrem1 (&r, high);
28486d7f5d3SJohn Marino       if (rp != NULL)
28586d7f5d3SJohn Marino 	rp[0] = r;
28686d7f5d3SJohn Marino       return r != 0;
28786d7f5d3SJohn Marino     }
28886d7f5d3SJohn Marino   count_leading_zeros (c, high);
28986d7f5d3SJohn Marino   c -= GMP_NAIL_BITS;
29086d7f5d3SJohn Marino 
29186d7f5d3SJohn Marino   c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
29286d7f5d3SJohn Marino   tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
29386d7f5d3SJohn Marino 
29486d7f5d3SJohn Marino   TMP_MARK;
29586d7f5d3SJohn Marino   if (nn % 2 != 0 || c > 0)
29686d7f5d3SJohn Marino     {
29786d7f5d3SJohn Marino       tp = TMP_ALLOC_LIMBS (2 * tn);
29886d7f5d3SJohn Marino       tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
29986d7f5d3SJohn Marino       if (c != 0)
30086d7f5d3SJohn Marino 	mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
30186d7f5d3SJohn Marino       else
30286d7f5d3SJohn Marino 	MPN_COPY (tp + 2 * tn - nn, np, nn);
30386d7f5d3SJohn Marino       rl = mpn_dc_sqrtrem (sp, tp, tn);
30486d7f5d3SJohn Marino       /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
30586d7f5d3SJohn Marino 	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
30686d7f5d3SJohn Marino       c += (nn % 2) * GMP_NUMB_BITS / 2;		/* c now represents k */
30786d7f5d3SJohn Marino       s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1);	/* S mod 2^k */
30886d7f5d3SJohn Marino       rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
30986d7f5d3SJohn Marino       cc = mpn_submul_1 (tp, s0, 1, s0[0]);
31086d7f5d3SJohn Marino       rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
31186d7f5d3SJohn Marino       mpn_rshift (sp, sp, tn, c);
31286d7f5d3SJohn Marino       tp[tn] = rl;
31386d7f5d3SJohn Marino       if (rp == NULL)
31486d7f5d3SJohn Marino 	rp = tp;
31586d7f5d3SJohn Marino       c = c << 1;
31686d7f5d3SJohn Marino       if (c < GMP_NUMB_BITS)
31786d7f5d3SJohn Marino 	tn++;
31886d7f5d3SJohn Marino       else
31986d7f5d3SJohn Marino 	{
32086d7f5d3SJohn Marino 	  tp++;
32186d7f5d3SJohn Marino 	  c -= GMP_NUMB_BITS;
32286d7f5d3SJohn Marino 	}
32386d7f5d3SJohn Marino       if (c != 0)
32486d7f5d3SJohn Marino 	mpn_rshift (rp, tp, tn, c);
32586d7f5d3SJohn Marino       else
32686d7f5d3SJohn Marino 	MPN_COPY_INCR (rp, tp, tn);
32786d7f5d3SJohn Marino       rn = tn;
32886d7f5d3SJohn Marino     }
32986d7f5d3SJohn Marino   else
33086d7f5d3SJohn Marino     {
33186d7f5d3SJohn Marino       if (rp == NULL)
33286d7f5d3SJohn Marino 	rp = TMP_ALLOC_LIMBS (nn);
33386d7f5d3SJohn Marino       if (rp != np)
33486d7f5d3SJohn Marino 	MPN_COPY (rp, np, nn);
33586d7f5d3SJohn Marino       rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn));
33686d7f5d3SJohn Marino     }
33786d7f5d3SJohn Marino 
33886d7f5d3SJohn Marino   MPN_NORMALIZE (rp, rn);
33986d7f5d3SJohn Marino 
34086d7f5d3SJohn Marino   TMP_FREE;
34186d7f5d3SJohn Marino   return rn;
34286d7f5d3SJohn Marino }
343