xref: /dflybsd-src/contrib/gmp/mpn/generic/gcdext.c (revision 4b6a78b780ca436f67569886cbfcd90d19f45579)
1*4b6a78b7SSimon Schubert /* mpn_gcdext -- Extended Greatest Common Divisor.
2*4b6a78b7SSimon Schubert 
3*4b6a78b7SSimon Schubert Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software
4*4b6a78b7SSimon Schubert Foundation, Inc.
5*4b6a78b7SSimon Schubert 
6*4b6a78b7SSimon Schubert This file is part of the GNU MP Library.
7*4b6a78b7SSimon Schubert 
8*4b6a78b7SSimon Schubert The GNU MP Library is free software; you can redistribute it and/or modify
9*4b6a78b7SSimon Schubert it under the terms of the GNU Lesser General Public License as published by
10*4b6a78b7SSimon Schubert the Free Software Foundation; either version 3 of the License, or (at your
11*4b6a78b7SSimon Schubert option) any later version.
12*4b6a78b7SSimon Schubert 
13*4b6a78b7SSimon Schubert The GNU MP Library is distributed in the hope that it will be useful, but
14*4b6a78b7SSimon Schubert WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15*4b6a78b7SSimon Schubert or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16*4b6a78b7SSimon Schubert License for more details.
17*4b6a78b7SSimon Schubert 
18*4b6a78b7SSimon Schubert You should have received a copy of the GNU Lesser General Public License
19*4b6a78b7SSimon Schubert along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
20*4b6a78b7SSimon Schubert 
21*4b6a78b7SSimon Schubert #include "gmp.h"
22*4b6a78b7SSimon Schubert #include "gmp-impl.h"
23*4b6a78b7SSimon Schubert #include "longlong.h"
24*4b6a78b7SSimon Schubert 
25*4b6a78b7SSimon Schubert /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
26*4b6a78b7SSimon Schubert    the size is returned (if inputs are non-normalized, result may be
27*4b6a78b7SSimon Schubert    non-normalized too). Temporary space needed is M->n + n.
28*4b6a78b7SSimon Schubert  */
29*4b6a78b7SSimon Schubert static size_t
30*4b6a78b7SSimon Schubert hgcd_mul_matrix_vector (struct hgcd_matrix *M,
31*4b6a78b7SSimon Schubert 			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
32*4b6a78b7SSimon Schubert {
33*4b6a78b7SSimon Schubert   mp_limb_t ah, bh;
34*4b6a78b7SSimon Schubert 
35*4b6a78b7SSimon Schubert   /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
36*4b6a78b7SSimon Schubert 
37*4b6a78b7SSimon Schubert      t  = u00 * a
38*4b6a78b7SSimon Schubert      r  = u10 * b
39*4b6a78b7SSimon Schubert      r += t;
40*4b6a78b7SSimon Schubert 
41*4b6a78b7SSimon Schubert      t  = u11 * b
42*4b6a78b7SSimon Schubert      b  = u01 * a
43*4b6a78b7SSimon Schubert      b += t;
44*4b6a78b7SSimon Schubert   */
45*4b6a78b7SSimon Schubert 
46*4b6a78b7SSimon Schubert   if (M->n >= n)
47*4b6a78b7SSimon Schubert     {
48*4b6a78b7SSimon Schubert       mpn_mul (tp, M->p[0][0], M->n, ap, n);
49*4b6a78b7SSimon Schubert       mpn_mul (rp, M->p[1][0], M->n, bp, n);
50*4b6a78b7SSimon Schubert     }
51*4b6a78b7SSimon Schubert   else
52*4b6a78b7SSimon Schubert     {
53*4b6a78b7SSimon Schubert       mpn_mul (tp, ap, n, M->p[0][0], M->n);
54*4b6a78b7SSimon Schubert       mpn_mul (rp, bp, n, M->p[1][0], M->n);
55*4b6a78b7SSimon Schubert     }
56*4b6a78b7SSimon Schubert 
57*4b6a78b7SSimon Schubert   ah = mpn_add_n (rp, rp, tp, n + M->n);
58*4b6a78b7SSimon Schubert 
59*4b6a78b7SSimon Schubert   if (M->n >= n)
60*4b6a78b7SSimon Schubert     {
61*4b6a78b7SSimon Schubert       mpn_mul (tp, M->p[1][1], M->n, bp, n);
62*4b6a78b7SSimon Schubert       mpn_mul (bp, M->p[0][1], M->n, ap, n);
63*4b6a78b7SSimon Schubert     }
64*4b6a78b7SSimon Schubert   else
65*4b6a78b7SSimon Schubert     {
66*4b6a78b7SSimon Schubert       mpn_mul (tp, bp, n, M->p[1][1], M->n);
67*4b6a78b7SSimon Schubert       mpn_mul (bp, ap, n, M->p[0][1], M->n);
68*4b6a78b7SSimon Schubert     }
69*4b6a78b7SSimon Schubert   bh = mpn_add_n (bp, bp, tp, n + M->n);
70*4b6a78b7SSimon Schubert 
71*4b6a78b7SSimon Schubert   n += M->n;
72*4b6a78b7SSimon Schubert   if ( (ah | bh) > 0)
73*4b6a78b7SSimon Schubert     {
74*4b6a78b7SSimon Schubert       rp[n] = ah;
75*4b6a78b7SSimon Schubert       bp[n] = bh;
76*4b6a78b7SSimon Schubert       n++;
77*4b6a78b7SSimon Schubert     }
78*4b6a78b7SSimon Schubert   else
79*4b6a78b7SSimon Schubert     {
80*4b6a78b7SSimon Schubert       /* Normalize */
81*4b6a78b7SSimon Schubert       while ( (rp[n-1] | bp[n-1]) == 0)
82*4b6a78b7SSimon Schubert 	n--;
83*4b6a78b7SSimon Schubert     }
84*4b6a78b7SSimon Schubert 
85*4b6a78b7SSimon Schubert   return n;
86*4b6a78b7SSimon Schubert }
87*4b6a78b7SSimon Schubert 
88*4b6a78b7SSimon Schubert #define COMPUTE_V_ITCH(n) (2*(n) + 1)
89*4b6a78b7SSimon Schubert 
90*4b6a78b7SSimon Schubert /* Computes |v| = |(g - u a)| / b, where u may be positive or
91*4b6a78b7SSimon Schubert    negative, and v is of the opposite sign. a, b are of size n, u and
92*4b6a78b7SSimon Schubert    v at most size n, and v must have space for n+1 limbs. */
/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. a, b are of size n, u and
   v at most size n, and v must have space for n+1 limbs.

   usize carries u's sign in its own sign; |u| occupies ABS(usize)
   limbs at up.  Returns the limb size of |v| (0 when v == 0).
   tp needs COMPUTE_V_ITCH(n) = 2n + 1 limbs of scratch.  */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  /* t = |u| * a; mpn_mul wants the larger operand first.  */
  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
	tp[size++] = cy;
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  /* FIXME: Use divexact. Or do the entire calculation mod 2^{n *
     GMP_NUMB_BITS}. */
  mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  /* Remainder must be zero */
#if WANT_ASSERT
  {
    mp_size_t i;
    for (i = 0; i < bn; i++)
      {
	ASSERT (tp[i] == 0);
      }
  }
#endif
  return vn;
}
166*4b6a78b7SSimon Schubert 
167*4b6a78b7SSimon Schubert /* Temporary storage:
168*4b6a78b7SSimon Schubert 
169*4b6a78b7SSimon Schubert    Initial division: Quotient of at most an - n + 1 <= an limbs.
170*4b6a78b7SSimon Schubert 
171*4b6a78b7SSimon Schubert    Storage for u0 and u1: 2(n+1).
172*4b6a78b7SSimon Schubert 
173*4b6a78b7SSimon Schubert    Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
174*4b6a78b7SSimon Schubert 
175*4b6a78b7SSimon Schubert    Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
176*4b6a78b7SSimon Schubert 
177*4b6a78b7SSimon Schubert    When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
178*4b6a78b7SSimon Schubert 
179*4b6a78b7SSimon Schubert    When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
180*4b6a78b7SSimon Schubert 
181*4b6a78b7SSimon Schubert    For the lehmer call after the loop, Let T denote
182*4b6a78b7SSimon Schubert    GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
183*4b6a78b7SSimon Schubert    u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
184*4b6a78b7SSimon Schubert    + 1 for v and 2T + 1 scratch space. In all, 7T + 3 is sufficient.
185*4b6a78b7SSimon Schubert 
186*4b6a78b7SSimon Schubert */
187*4b6a78b7SSimon Schubert 
188*4b6a78b7SSimon Schubert /* Optimal choice of p seems difficult. In each iteration the division
189*4b6a78b7SSimon Schubert  * of work between hgcd and the updates of u0 and u1 depends on the
190*4b6a78b7SSimon Schubert  * current size of the u. It may be desirable to use a different
191*4b6a78b7SSimon Schubert  * choice of p in each iteration. Also the input size seems to matter;
192*4b6a78b7SSimon Schubert  * choosing p = n / 3 in the first iteration seems to improve
193*4b6a78b7SSimon Schubert  * performance slightly for input size just above the threshold, but
194*4b6a78b7SSimon Schubert  * degrade performance for larger inputs. */
195*4b6a78b7SSimon Schubert #define CHOOSE_P_1(n) ((n) / 2)
196*4b6a78b7SSimon Schubert #define CHOOSE_P_2(n) ((n) / 3)
197*4b6a78b7SSimon Schubert 
/* Computes g = gcd (A, B) with A = {ap, an}, B = {bp, n}, storing g
   at {gp, return value} and the first cofactor u at up, with its
   signed limb count in *usizep (negative *usizep means u < 0).
   Requires an >= n > 0 (asserted below); ap and bp are clobbered.

   NOTE(review): the full public contract (normalization of B, overlap
   rules, bound on |u|) is specified in the GMP manual, not visible in
   this file — only the asserted preconditions are stated here.

   Strategy: reduce a mod b, then either call the Lehmer base case
   directly (small n), or run subquadratic HGCD steps while tracking
   the cofactor pair (u0; u1) with B = u0 a + u1 b, finishing with a
   Lehmer call and a cofactor recombination.  */
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;
  mp_ptr u0;	/* Invariant maintained by the loop: B = u0 a + u1 b.  */
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop.  Scratch must cover the worst case of both p
	 choices, since CHOOSE_P_1 is used before the loop and
	 CHOOSE_P_2 inside it.  */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      /* Initial reduction: a <- a mod b, so both operands are n limbs.
	 Quotient (an - n + 1 limbs) goes to scratch.  */
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  /* b divides a: gcd is b and the cofactor of A is zero.  */
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* Small operands: Lehmer's algorithm handles everything.  */
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  /* Carve the cofactor buffers off the front of the scratch area and
     start from (u0; u1) = (0; 0); zeroing both lets the normalization
     loops below rely on high limbs being zero.  */
  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	/* Since the u's were (0; 1) before this step, the new
	   cofactors are just the bottom row of M.  */
	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	mp_size_t gn;
	mp_size_t updated_un = 1;

	u1[0] = 1;	/* (u0; u1) = (0; 1), one limb.  */

	/* Temporary storage 2n + 1 */
	n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				    u0, u1, &updated_un, tp, tp + n);
	if (n == 0)
	  {
	    /* gcd found already; subdiv_step stored g and u.  */
	    TMP_FREE;
	    return gn;
	  }

	un = updated_un;
	ASSERT (un < ualloc);
      }
  }

  /* Main reduction loop: each iteration shrinks a and b via hgcd and
     applies the matrix to the cofactor pair, or falls back to a
     single subtract-and-divide step when hgcd makes no progress.  */
  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  mp_size_t gn;
	  mp_size_t updated_un = un;

	  /* Temporary storage 2n + 1 */
	  n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				      u0, u1, &updated_un, tp, tp + n);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return gn;
	    }

	  un = updated_un;
	  ASSERT (un < ualloc);
	}
    }

  if (mpn_zero_p (ap, n))
    {
      /* a == 0: g = b = u0 a + u1 b, so the cofactor of A is -u0.  */
      MPN_COPY (gp, bp, n);
      MPN_NORMALIZE_NOT_ZERO (u0, un);
      MPN_COPY (up, u0, un);
      *usizep = -un;

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (bp, n))
    {
      /* b == 0: g = a, cofactor is +u1 (same B = u0 a + u1 b logic).  */
      MPN_COPY (gp, ap, n);
      MPN_NORMALIZE_NOT_ZERO (u1, un);
      MPN_COPY (up, u1, un);
      *usizep = un;

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have A = ... a + ... b
		 B =  u0 a +  u1 b

		 a = u1  A + ... B
		 b = -u0 A + ... B

	 with bounds

	   |u0|, |u1| <= B / min(a, b)

	 Compute g = u a + v b = (u u1 - v u0) A + (...) B
	 Here, u, v are bounded by

	 |u| <= b,
	 |v| <= a
      */

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      /* Work with |u| and remember the sign for the final result.  */
      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
	{
	  ASSERT (un == 1);
	  ASSERT (u0[0] == 1);

	  /* u1 == 0 ==> u u1 + v u0 = v */
	  MPN_COPY (up, lehmer_vp, lehmer_vn);
	  *usizep = negate ? lehmer_vn : - lehmer_vn;

	  TMP_FREE;
	  return gn;
	}

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero. We may still have v == 0 */

      /* Compute u u0 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  /* u u1 and v u0 have the same sign here, so add; keep the
	     longer operand first for mpn_add.  */
	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
550