xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/generic/rootrem.c (revision e89934bbf778a6d6d6894877c4da59d0c7835b0f)
1 /* mpn_rootrem(rootp,remp,ap,an,nth) -- Compute the nth root of {ap,an}, and
2    store the truncated integer part at rootp and the remainder at remp.
3 
4    Contributed by Paul Zimmermann (algorithm) and
5    Paul Zimmermann and Torbjorn Granlund (implementation).
6 
7    THE FUNCTIONS IN THIS FILE ARE INTERNAL, AND HAVE MUTABLE INTERFACES.  IT'S
8    ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT'S ALMOST
9    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10 
11 Copyright 2002, 2005, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
12 
13 This file is part of the GNU MP Library.
14 
15 The GNU MP Library is free software; you can redistribute it and/or modify
16 it under the terms of the GNU Lesser General Public License as published by
17 the Free Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
19 
20 The GNU MP Library is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
22 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
23 License for more details.
24 
25 You should have received a copy of the GNU Lesser General Public License
26 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
27 
28 /* FIXME:
29      This implementation is not optimal when remp == NULL, since the complexity
30      is M(n), whereas it should be M(n/k) on average.
31 */
32 
33 #include <stdio.h>		/* for NULL */
34 
35 #include "gmp.h"
36 #include "gmp-impl.h"
37 #include "longlong.h"
38 
39 static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t,
40 				       mp_limb_t, int);
41 
42 #define MPN_RSHIFT(cy,rp,up,un,cnt) \
43   do {									\
44     if ((cnt) != 0)							\
45       cy = mpn_rshift (rp, up, un, cnt);				\
46     else								\
47       {									\
48 	MPN_COPY_INCR (rp, up, un);					\
49 	cy = 0;								\
50       }									\
51   } while (0)
52 
53 #define MPN_LSHIFT(cy,rp,up,un,cnt) \
54   do {									\
55     if ((cnt) != 0)							\
56       cy = mpn_lshift (rp, up, un, cnt);				\
57     else								\
58       {									\
59 	MPN_COPY_DECR (rp, up, un);					\
60 	cy = 0;								\
61       }									\
62   } while (0)
63 
64 
65 /* Put in {rootp, ceil(un/k)} the kth root of {up, un}, rounded toward zero.
66    If remp <> NULL, put in {remp, un} the remainder.
67    Return the size (in limbs) of the remainder if remp <> NULL,
68 	  or a non-zero value iff the remainder is non-zero when remp = NULL.
69    Assumes:
70    (a) up[un-1] is not zero
71    (b) rootp has at least space for ceil(un/k) limbs
72    (c) remp has at least space for un limbs (in case remp <> NULL)
73    (d) the operands do not overlap.
74 
75    The auxiliary memory usage is 3*un+2 if remp = NULL,
76    and 2*un+2 if remp <> NULL.  FIXME: This is an incorrect comment.
77 */
78 mp_size_t
79 mpn_rootrem (mp_ptr rootp, mp_ptr remp,
80 	     mp_srcptr up, mp_size_t un, mp_limb_t k)
81 {
82   mp_size_t m;
83   ASSERT (un > 0);
84   ASSERT (up[un - 1] != 0);
85   ASSERT (k > 1);
86 
87   m = (un - 1) / k;		/* ceil(un/k) - 1 */
88   if (remp == NULL && m > 2)
89     /* Pad {up,un} with k zero limbs.  This will produce an approximate root
90        with one more limb, allowing us to compute the exact integral result. */
91     {
92       mp_ptr sp, wp;
93       mp_size_t rn, sn, wn;
94       TMP_DECL;
95       TMP_MARK;
96       wn = un + k;
97       wp = TMP_ALLOC_LIMBS (wn); /* will contain the padded input */
98       sn = m + 2; /* ceil(un/k) + 1 */
99       sp = TMP_ALLOC_LIMBS (sn); /* approximate root of padded input */
100       MPN_COPY (wp + k, up, un);
101       MPN_ZERO (wp, k);
102       rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1);
103       /* The approximate root S = {sp,sn} is either the correct root of
104 	 {sp,sn}, or 1 too large.  Thus unless the least significant limb of
105 	 S is 0 or 1, we can deduce the root of {up,un} is S truncated by one
106 	 limb.  (In case sp[0]=1, we can deduce the root, but not decide
107 	 whether it is exact or not.) */
108       MPN_COPY (rootp, sp + 1, sn - 1);
109       TMP_FREE;
110       return rn;
111     }
112   else
113     {
114       return mpn_rootrem_internal (rootp, remp, up, un, k, 0);
115     }
116 }
117 
118 /* if approx is non-zero, does not compute the final remainder */
119 static mp_size_t
120 mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
121 		      mp_limb_t k, int approx)
122 {
123   mp_ptr qp, rp, sp, wp, scratch;
124   mp_size_t qn, rn, sn, wn, nl, bn;
125   mp_limb_t save, save2, cy;
126   unsigned long int unb; /* number of significant bits of {up,un} */
127   unsigned long int xnb; /* number of significant bits of the result */
128   unsigned long b, kk;
129   unsigned long sizes[GMP_NUMB_BITS + 1];
130   int ni, i;
131   int c;
132   int logk;
133   TMP_DECL;
134 
135   TMP_MARK;
136 
137   if (remp == NULL)
138     {
139       rp = TMP_ALLOC_LIMBS (un + 1);     /* will contain the remainder */
140       scratch = rp;			 /* used by mpn_div_q */
141     }
142   else
143     {
144       scratch = TMP_ALLOC_LIMBS (un + 1); /* used by mpn_div_q */
145       rp = remp;
146     }
147   sp = rootp;
148 
149   MPN_SIZEINBASE_2EXP(unb, up, un, 1);
150   /* unb is the number of bits of the input U */
151 
152   xnb = (unb - 1) / k + 1;	/* ceil (unb / k) */
153   /* xnb is the number of bits of the root R */
154 
155   if (xnb == 1) /* root is 1 */
156     {
157       if (remp == NULL)
158 	remp = rp;
159       mpn_sub_1 (remp, up, un, (mp_limb_t) 1);
160       MPN_NORMALIZE (remp, un);	/* There should be at most one zero limb,
161 				   if we demand u to be normalized  */
162       rootp[0] = 1;
163       TMP_FREE;
164       return un;
165     }
166 
167   /* We initialize the algorithm with a 1-bit approximation to zero: since we
168      know the root has exactly xnb bits, we write r0 = 2^(xnb-1), so that
169      r0^k = 2^(k*(xnb-1)), that we subtract to the input. */
170   kk = k * (xnb - 1);		/* number of truncated bits in the input */
171   rn = un - kk / GMP_NUMB_BITS; /* number of limbs of the non-truncated part */
172   MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS);
173   mpn_sub_1 (rp, rp, rn, 1);	/* subtract the initial approximation: since
174 				   the non-truncated part is less than 2^k, it
175 				   is <= k bits: rn <= ceil(k/GMP_NUMB_BITS) */
176   sp[0] = 1;			/* initial approximation */
177   sn = 1;			/* it has one limb */
178 
179   for (logk = 1; ((k - 1) >> logk) != 0; logk++)
180     ;
181   /* logk = ceil(log(k)/log(2)) */
182 
183   b = xnb - 1; /* number of remaining bits to determine in the kth root */
184   ni = 0;
185   while (b != 0)
186     {
187       /* invariant: here we want b+1 total bits for the kth root */
188       sizes[ni] = b;
189       /* if c is the new value of b, this means that we'll go from a root
190 	 of c+1 bits (say s') to a root of b+1 bits.
191 	 It is proved in the book "Modern Computer Arithmetic" from Brent
192 	 and Zimmermann, Chapter 1, that
193 	 if s' >= k*beta, then at most one correction is necessary.
194 	 Here beta = 2^(b-c), and s' >= 2^c, thus it suffices that
195 	 c >= ceil((b + log2(k))/2). */
196       b = (b + logk + 1) / 2;
197       if (b >= sizes[ni])
198 	b = sizes[ni] - 1;	/* add just one bit at a time */
199       ni++;
200     }
201   sizes[ni] = 0;
202   ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1);
203   /* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with
204      sizes[i] <= 2 * sizes[i+1].
205      Newton iteration will first compute sizes[ni-1] extra bits,
206      then sizes[ni-2], ..., then sizes[0] = b. */
207 
208   /* qp and wp need enough space to store S'^k where S' is an approximate
209      root. Since S' can be as large as S+2, the worst case is when S=2 and
210      S'=4. But then since we know the number of bits of S in advance, S'
211      can only be 3 at most. Similarly for S=4, then S' can be 6 at most.
212      So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k. Since S^k
213      fits in un limbs, the number of extra limbs needed is bounded by
214      ceil(k*log2(3/2)/GMP_NUMB_BITS). */
215 #define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS)
216   qp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain quotient and remainder
217 					of R/(k*S^(k-1)), and S^k */
218   wp = TMP_ALLOC_LIMBS (un + EXTRA); /* will contain S^(k-1), k*S^(k-1),
219 					and temporary for mpn_pow_1 */
220 
221   wp[0] = 1; /* {sp,sn}^(k-1) = 1 */
222   wn = 1;
223   for (i = ni; i != 0; i--)
224     {
225       /* 1: loop invariant:
226 	 {sp, sn} is the current approximation of the root, which has
227 		  exactly 1 + sizes[ni] bits.
228 	 {rp, rn} is the current remainder
229 	 {wp, wn} = {sp, sn}^(k-1)
230 	 kk = number of truncated bits of the input
231       */
232       b = sizes[i - 1] - sizes[i]; /* number of bits to compute in that
233 				      iteration */
234 
235       /* Reinsert a low zero limb if we normalized away the entire remainder */
236       if (rn == 0)
237 	{
238 	  rp[0] = 0;
239 	  rn = 1;
240 	}
241 
242       /* first multiply the remainder by 2^b */
243       MPN_LSHIFT (cy, rp + b / GMP_NUMB_BITS, rp, rn, b % GMP_NUMB_BITS);
244       rn = rn + b / GMP_NUMB_BITS;
245       if (cy != 0)
246 	{
247 	  rp[rn] = cy;
248 	  rn++;
249 	}
250 
251       kk = kk - b;
252 
253       /* 2: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
254 
255       /* Now insert bits [kk,kk+b-1] from the input U */
256       bn = b / GMP_NUMB_BITS; /* lowest limb from high part of rp[] */
257       save = rp[bn];
258       /* nl is the number of limbs in U which contain bits [kk,kk+b-1] */
259       nl = 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk / GMP_NUMB_BITS);
260       /* nl  = 1 + floor((kk + b - 1) / GMP_NUMB_BITS)
261 		 - floor(kk / GMP_NUMB_BITS)
262 	     <= 1 + (kk + b - 1) / GMP_NUMB_BITS
263 		  - (kk - GMP_NUMB_BITS + 1) / GMP_NUMB_BITS
264 	     = 2 + (b - 2) / GMP_NUMB_BITS
265 	 thus since nl is an integer:
266 	 nl <= 2 + floor(b/GMP_NUMB_BITS) <= 2 + bn. */
267       /* we have to save rp[bn] up to rp[nl-1], i.e. 1 or 2 limbs */
268       if (nl - 1 > bn)
269 	save2 = rp[bn + 1];
270       MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, nl, kk % GMP_NUMB_BITS);
271       /* set to zero high bits of rp[bn] */
272       rp[bn] &= ((mp_limb_t) 1 << (b % GMP_NUMB_BITS)) - 1;
273       /* restore corresponding bits */
274       rp[bn] |= save;
275       if (nl - 1 > bn)
276 	rp[bn + 1] = save2; /* the low b bits go in rp[0..bn] only, since
277 			       they start by bit 0 in rp[0], so they use
278 			       at most ceil(b/GMP_NUMB_BITS) limbs */
279 
280       /* 3: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
281 
282       /* compute {wp, wn} = k * {sp, sn}^(k-1) */
283       cy = mpn_mul_1 (wp, wp, wn, k);
284       wp[wn] = cy;
285       wn += cy != 0;
286 
287       /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
288 
289       /* now divide {rp, rn} by {wp, wn} to get the low part of the root */
290       if (rn < wn)
291 	{
292 	  qn = 0;
293 	}
294       else
295 	{
296 	  qn = rn - wn; /* expected quotient size */
297 	  mpn_div_q (qp, rp, rn, wp, wn, scratch);
298 	  qn += qp[qn] != 0;
299 	}
300 
301       /* 5: current buffers: {sp,sn}, {qp,qn}.
302 	 Note: {rp,rn} is not needed any more since we'll compute it from
303 	 scratch at the end of the loop.
304        */
305 
306       /* Number of limbs used by b bits, when least significant bit is
307 	 aligned to least limb */
308       bn = (b - 1) / GMP_NUMB_BITS + 1;
309 
310       /* the quotient should be smaller than 2^b, since the previous
311 	 approximation was correctly rounded toward zero */
312       if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) &&
313 		      qp[qn - 1] >= ((mp_limb_t) 1 << (b % GMP_NUMB_BITS))))
314 	{
315 	  qn = b / GMP_NUMB_BITS + 1; /* b+1 bits */
316 	  MPN_ZERO (qp, qn);
317 	  qp[qn - 1] = (mp_limb_t) 1 << (b % GMP_NUMB_BITS);
318 	  MPN_DECR_U (qp, qn, 1);
319 	  qn -= qp[qn - 1] == 0;
320 	}
321 
322       /* 6: current buffers: {sp,sn}, {qp,qn} */
323 
324       /* multiply the root approximation by 2^b */
325       MPN_LSHIFT (cy, sp + b / GMP_NUMB_BITS, sp, sn, b % GMP_NUMB_BITS);
326       sn = sn + b / GMP_NUMB_BITS;
327       if (cy != 0)
328 	{
329 	  sp[sn] = cy;
330 	  sn++;
331 	}
332 
333       /* 7: current buffers: {sp,sn}, {qp,qn} */
334 
335       ASSERT_ALWAYS (bn >= qn); /* this is ok since in the case qn > bn
336 				   above, q is set to 2^b-1, which has
337 				   exactly bn limbs */
338 
339       /* Combine sB and q to form sB + q.  */
340       save = sp[b / GMP_NUMB_BITS];
341       MPN_COPY (sp, qp, qn);
342       MPN_ZERO (sp + qn, bn - qn);
343       sp[b / GMP_NUMB_BITS] |= save;
344 
345       /* 8: current buffer: {sp,sn} */
346 
347       /* Since each iteration treats b bits from the root and thus k*b bits
348 	 from the input, and we already considered b bits from the input,
349 	 we now have to take another (k-1)*b bits from the input. */
350       kk -= (k - 1) * b; /* remaining input bits */
351       /* {rp, rn} = floor({up, un} / 2^kk) */
352       MPN_RSHIFT (cy, rp, up + kk / GMP_NUMB_BITS, un - kk / GMP_NUMB_BITS, kk % GMP_NUMB_BITS);
353       rn = un - kk / GMP_NUMB_BITS;
354       rn -= rp[rn - 1] == 0;
355 
356       /* 9: current buffers: {sp,sn}, {rp,rn} */
357 
358      for (c = 0;; c++)
359 	{
360 	  /* Compute S^k in {qp,qn}. */
361 	  if (i == 1)
362 	    {
363 	      /* Last iteration: we don't need W anymore. */
364 	      /* mpn_pow_1 requires that both qp and wp have enough space to
365 		 store the result {sp,sn}^k + 1 limb */
366 	      approx = approx && (sp[0] > 1);
367 	      qn = (approx == 0) ? mpn_pow_1 (qp, sp, sn, k, wp) : 0;
368 	    }
369 	  else
370 	    {
371 	      /* W <- S^(k-1) for the next iteration,
372 		 and S^k = W * S. */
373 	      wn = mpn_pow_1 (wp, sp, sn, k - 1, qp);
374 	      mpn_mul (qp, wp, wn, sp, sn);
375 	      qn = wn + sn;
376 	      qn -= qp[qn - 1] == 0;
377 	    }
378 
379 	  /* if S^k > floor(U/2^kk), the root approximation was too large */
380 	  if (qn > rn || (qn == rn && mpn_cmp (qp, rp, rn) > 0))
381 	    MPN_DECR_U (sp, sn, 1);
382 	  else
383 	    break;
384 	}
385 
386       /* 10: current buffers: {sp,sn}, {rp,rn}, {qp,qn}, {wp,wn} */
387 
388       ASSERT_ALWAYS (c <= 1);
389       ASSERT_ALWAYS (rn >= qn);
390 
391       /* R = R - Q = floor(U/2^kk) - S^k */
392       if (i > 1 || approx == 0)
393 	{
394 	  mpn_sub (rp, rp, rn, qp, qn);
395 	  MPN_NORMALIZE (rp, rn);
396 	}
397       /* otherwise we have rn > 0, thus the return value is ok */
398 
399       /* 11: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
400     }
401 
402   TMP_FREE;
403   return rn;
404 }
405