/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code),
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
   MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
   INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
   DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/* See "Karatsuba Square Root", reference in gmp.texi.  */


#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
#define USE_DIVAPPR_Q 1
#define TRACE(x)

static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
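
/* Lookup convention, worked example (illustrative note, not from the
   upstream source): for a normalized a0, the top nine bits
   abits = a0 >> (GMP_LIMB_BITS - 9) lie in [0x80, 0x1ff], and
   x0 = 0x100 | invsqrttab[abits - 0x80] is the 9-bit fixed-point value
   with x0/2^8 ~= 1/sqrt(abits/2^9).  E.g. abits = 0x100 (a ~= 0.5)
   selects entry 0x6a, so x0 = 0x16a and 0x16a/2^8 ~= 1.414 ~= 1/sqrt(0.5);
   abits = 0x80 (a ~= 0.25) selects 0xff, so x0/2^8 = 0x1ff/2^8 ~= 2.  */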

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)	/* 0xfee6f < MAGIC < 0x29cbc8 */
#endif

static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80];	/* initial 1/sqrt(a) */

  /* x0 is now an 8-bit approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16-bit approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full-limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
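
#if 0
/* Illustrative check of the mpn_sqrtrem1 contract (a hypothetical test
   sketch, not part of the library build): for a normalized limb
   a0 >= GMP_NUMB_HIGHBIT / 2, the result s must satisfy
   s^2 <= a0 < (s+1)^2 with *rp = a0 - s^2.  */
static void
check_sqrtrem1 (mp_limb_t a0)
{
  mp_limb_t r, s;
  s = mpn_sqrtrem1 (&r, a0);
  ASSERT_ALWAYS (r == a0 - s * s);	/* remainder matches the root */
  ASSERT_ALWAYS (r <= 2 * s);		/* a0 < (s+1)^2, overflow-free form */
}
#endif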


#define Prec (GMP_NUMB_BITS >> 1)

/* Same as mpn_sqrtrem, but for size=2 and {np, 2} normalized;
   returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec; if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
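
#if 0
/* Illustrative check of the mpn_sqrtrem2 contract (a hypothetical sketch,
   not part of the library build; assumes GMP_NAIL_BITS == 0): with
   B = 2^GMP_NUMB_BITS, it must hold that
   {np, 2} = sp[0]^2 + cc*B + rp[0] and cc*B + rp[0] <= 2*sp[0].  */
static void
check_sqrtrem2 (mp_srcptr np)
{
  mp_limb_t s[1], r[1], cc, p1, p0, lo;
  cc = mpn_sqrtrem2 (s, r, np);
  umul_ppmm (p1, p0, s[0], s[0]);	/* p1:p0 = s[0]^2 */
  lo = p0 + r[0];			/* low limb of s^2 + remainder */
  ASSERT_ALWAYS (np[0] == lo);
  ASSERT_ALWAYS (np[1] == p1 + cc + (lo < p0));
  /* remainder bound cc:r[0] <= 2*s[0], compared limb by limb */
  ASSERT_ALWAYS (cc < (s[0] >> (GMP_NUMB_BITS - 1))
		 || (cc == (s[0] >> (GMP_NUMB_BITS - 1))
		     && r[0] <= (s[0] << 1)));
}
#endif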

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs. */
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  if (n == 1)
    c = mpn_sqrtrem2 (sp, np, np);
  else
    {
      l = n / 2;
      h = n - l;
      q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
      if (q != 0)
	ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
      TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
      mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
      q += scratch[l];
      c = scratch[0] & 1;
      mpn_rshift (sp, scratch, l, 1);
      sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
      if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
	return 1; /* Remainder is non-zero */
      q >>= 1;
      if (c != 0)
	c = mpn_add_n (np + l, np + l, sp + l, h);
      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
      mpn_sqr (np + n, sp, l);
      b = q + mpn_sub_n (np, np, np + n, 2 * l);
      c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

      if (c < 0)
	{
	  q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	  c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
	  c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
	  c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
	  q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
	}
    }

  return c;
}
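
#if 0
/* A self-contained scalar sketch (hypothetical, not part of the build) of
   the recursion above, following "Karatsuba Square Root": split a
   normalized n (here n >= 2^62) at base b = 2^16, take the square root of
   the high half, derive the low half of the root by dividing the remainder
   by twice the high root, and correct the result at most once.  */
#include <stdint.h>
#include <assert.h>

/* Base case: floor(sqrt(x)) for a 32-bit x, one result bit at a time.  */
static uint32_t
isqrt32 (uint32_t x)
{
  uint32_t s = 0, bit;
  for (bit = (uint32_t) 1 << 15; bit != 0; bit >>= 1)
    if ((uint64_t) (s + bit) * (s + bit) <= x)
      s += bit;
  return s;
}

/* Returns s = floor(sqrt(n)) and sets *rem = n - s^2, for n >= 2^62.  */
static uint32_t
sqrtrem64 (uint64_t n, uint64_t *rem)
{
  uint32_t sp;
  uint64_t rp, num, q, u, s;
  int64_t r;

  assert (n >> 62 != 0);			/* normalized operand */
  sp = isqrt32 ((uint32_t) (n >> 32));		/* root of the high half */
  rp = (n >> 32) - (uint64_t) sp * sp;		/* its remainder */
  num = (rp << 16) | ((n >> 16) & 0xffff);
  q = num / (2 * (uint64_t) sp);		/* next 16 root bits */
  u = num % (2 * (uint64_t) sp);
  s = ((uint64_t) sp << 16) + q;
  r = (int64_t) ((u << 16) | (n & 0xffff)) - (int64_t) (q * q);
  if (r < 0)					/* root was one too large */
    {
      r += (int64_t) (2 * s - 1);
      s -= 1;
    }
  *rem = (uint64_t) r;
  return (uint32_t) s;
}
/* E.g. sqrtrem64 (1ULL << 63, &rem) yields s = 3037000499 after one
   correction, with rem = 2^63 - s^2 = 5928526807.  */
#endif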

#if USE_DIVAPPR_Q
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp [nn - dn] = qh;
}
#endif

/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account;
   when nsh is maximal, two limbs are discarded from the result of the
   division. Too much? Is a single dummy limb enough? */
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend; is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a no-op on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd) - nsh - 1)))) == 0)
	{
	  mp_limb_t cy;
	  /* The approximation is not good enough: the extra limb (+ nsh bits)
	     is smaller than needed to absorb the possible error. */
	  /* {qp + 1, l + 1} equals 2*{sp, l} */
	  /* FIXME: use mullo or wrap-around, or directly evaluate
	     remainder with a single sqrmod_bnm1. */
	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	  /* Compute the remainder of the previous mpn_div(appr)_q. */
	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
	  MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	    {
	      /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	      MPN_DECR_U (sp, l, 1);
	    }
	  /* Can the root be exact when a correction was needed?  We
	     did not find an example, but it depends on divappr
	     internals, and we cannot assume it is true in general... */
	  /* else */
#else /* WANT_ASSERT */
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
	  if (mpn_zero_p (tp + l + 1, h - l))
	    {
	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	      mpn_sqr (scratch, sp, l);
	      c = mpn_cmp (tp + 1, scratch + l, l);
	      if (c == 0)
		{
		  if (nsh != 0)
		    {
		      mpn_lshift (tp, np, l, 2 * nsh);
		      np = tp;
		    }
		  c = mpn_cmp (np, scratch + odd, l - odd);
		}
	      if (c < 0)
		{
		  MPN_DECR_U (sp, l, 1);
		  c = 1;
		}
	    }
	}
    }
  TMP_FREE;

  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}


mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t *tp, s0[1], cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      mp_limb_t mask;
      mp_ptr scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;		/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
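
#if 0
/* Usage sketch for the public entry point (a hypothetical demo, not part
   of the library): compute S = floor(sqrt(N)) and the remainder R for a
   2-limb N, so that N = S^2 + R.  The return value is the limb count of
   R, zero exactly when N is a perfect square; passing rp == NULL asks for
   only that zero/nonzero answer (served by mpn_dc_sqrt when nn > 8).
   Build against GMP and link with -lgmp.  */
#include <stdio.h>
#include "gmp.h"

int
main (void)
{
  mp_limb_t np[2] = { 123456789, 987654321 };	/* N = np[1]*B + np[0] */
  mp_limb_t sp[1], rp[2];
  mp_size_t rn = mpn_sqrtrem (sp, rp, np, 2);
  gmp_printf ("sqrt = %Mu, remainder limbs = %d\n", sp[0], (int) rn);
  return 0;
}
#endif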