xref: /dflybsd-src/contrib/gmp/mpn/generic/gcdext.c (revision d365564473a20a528d07c59cad8ee2f4bea5546f)
14b6a78b7SSimon Schubert /* mpn_gcdext -- Extended Greatest Common Divisor.
24b6a78b7SSimon Schubert 
38b5d8148SSascha Wildner Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
44b6a78b7SSimon Schubert Foundation, Inc.
54b6a78b7SSimon Schubert 
64b6a78b7SSimon Schubert This file is part of the GNU MP Library.
74b6a78b7SSimon Schubert 
84b6a78b7SSimon Schubert The GNU MP Library is free software; you can redistribute it and/or modify
94b6a78b7SSimon Schubert it under the terms of the GNU Lesser General Public License as published by
104b6a78b7SSimon Schubert the Free Software Foundation; either version 3 of the License, or (at your
114b6a78b7SSimon Schubert option) any later version.
124b6a78b7SSimon Schubert 
134b6a78b7SSimon Schubert The GNU MP Library is distributed in the hope that it will be useful, but
144b6a78b7SSimon Schubert WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
154b6a78b7SSimon Schubert or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
164b6a78b7SSimon Schubert License for more details.
174b6a78b7SSimon Schubert 
184b6a78b7SSimon Schubert You should have received a copy of the GNU Lesser General Public License
194b6a78b7SSimon Schubert along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
204b6a78b7SSimon Schubert 
214b6a78b7SSimon Schubert #include "gmp.h"
224b6a78b7SSimon Schubert #include "gmp-impl.h"
234b6a78b7SSimon Schubert #include "longlong.h"
244b6a78b7SSimon Schubert 
254b6a78b7SSimon Schubert /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
264b6a78b7SSimon Schubert    the size is returned (if inputs are non-normalized, result may be
274b6a78b7SSimon Schubert    non-normalized too). Temporary space needed is M->n + n.
284b6a78b7SSimon Schubert  */
294b6a78b7SSimon Schubert static size_t
hgcd_mul_matrix_vector(struct hgcd_matrix * M,mp_ptr rp,mp_srcptr ap,mp_ptr bp,mp_size_t n,mp_ptr tp)304b6a78b7SSimon Schubert hgcd_mul_matrix_vector (struct hgcd_matrix *M,
314b6a78b7SSimon Schubert 			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
324b6a78b7SSimon Schubert {
334b6a78b7SSimon Schubert   mp_limb_t ah, bh;
344b6a78b7SSimon Schubert 
354b6a78b7SSimon Schubert   /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
364b6a78b7SSimon Schubert 
374b6a78b7SSimon Schubert      t  = u00 * a
384b6a78b7SSimon Schubert      r  = u10 * b
394b6a78b7SSimon Schubert      r += t;
404b6a78b7SSimon Schubert 
414b6a78b7SSimon Schubert      t  = u11 * b
424b6a78b7SSimon Schubert      b  = u01 * a
434b6a78b7SSimon Schubert      b += t;
444b6a78b7SSimon Schubert   */
454b6a78b7SSimon Schubert 
464b6a78b7SSimon Schubert   if (M->n >= n)
474b6a78b7SSimon Schubert     {
484b6a78b7SSimon Schubert       mpn_mul (tp, M->p[0][0], M->n, ap, n);
494b6a78b7SSimon Schubert       mpn_mul (rp, M->p[1][0], M->n, bp, n);
504b6a78b7SSimon Schubert     }
514b6a78b7SSimon Schubert   else
524b6a78b7SSimon Schubert     {
534b6a78b7SSimon Schubert       mpn_mul (tp, ap, n, M->p[0][0], M->n);
544b6a78b7SSimon Schubert       mpn_mul (rp, bp, n, M->p[1][0], M->n);
554b6a78b7SSimon Schubert     }
564b6a78b7SSimon Schubert 
574b6a78b7SSimon Schubert   ah = mpn_add_n (rp, rp, tp, n + M->n);
584b6a78b7SSimon Schubert 
594b6a78b7SSimon Schubert   if (M->n >= n)
604b6a78b7SSimon Schubert     {
614b6a78b7SSimon Schubert       mpn_mul (tp, M->p[1][1], M->n, bp, n);
624b6a78b7SSimon Schubert       mpn_mul (bp, M->p[0][1], M->n, ap, n);
634b6a78b7SSimon Schubert     }
644b6a78b7SSimon Schubert   else
654b6a78b7SSimon Schubert     {
664b6a78b7SSimon Schubert       mpn_mul (tp, bp, n, M->p[1][1], M->n);
674b6a78b7SSimon Schubert       mpn_mul (bp, ap, n, M->p[0][1], M->n);
684b6a78b7SSimon Schubert     }
694b6a78b7SSimon Schubert   bh = mpn_add_n (bp, bp, tp, n + M->n);
704b6a78b7SSimon Schubert 
714b6a78b7SSimon Schubert   n += M->n;
724b6a78b7SSimon Schubert   if ( (ah | bh) > 0)
734b6a78b7SSimon Schubert     {
744b6a78b7SSimon Schubert       rp[n] = ah;
754b6a78b7SSimon Schubert       bp[n] = bh;
764b6a78b7SSimon Schubert       n++;
774b6a78b7SSimon Schubert     }
784b6a78b7SSimon Schubert   else
794b6a78b7SSimon Schubert     {
804b6a78b7SSimon Schubert       /* Normalize */
814b6a78b7SSimon Schubert       while ( (rp[n-1] | bp[n-1]) == 0)
824b6a78b7SSimon Schubert 	n--;
834b6a78b7SSimon Schubert     }
844b6a78b7SSimon Schubert 
854b6a78b7SSimon Schubert   return n;
864b6a78b7SSimon Schubert }
874b6a78b7SSimon Schubert 
884b6a78b7SSimon Schubert #define COMPUTE_V_ITCH(n) (2*(n) + 1)
894b6a78b7SSimon Schubert 
904b6a78b7SSimon Schubert /* Computes |v| = |(g - u a)| / b, where u may be positive or
914b6a78b7SSimon Schubert    negative, and v is of the opposite sign. a, b are of size n, u and
924b6a78b7SSimon Schubert    v at most size n, and v must have space for n+1 limbs. */
934b6a78b7SSimon Schubert static mp_size_t
compute_v(mp_ptr vp,mp_srcptr ap,mp_srcptr bp,mp_size_t n,mp_srcptr gp,mp_size_t gn,mp_srcptr up,mp_size_t usize,mp_ptr tp)944b6a78b7SSimon Schubert compute_v (mp_ptr vp,
954b6a78b7SSimon Schubert 	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
964b6a78b7SSimon Schubert 	   mp_srcptr gp, mp_size_t gn,
974b6a78b7SSimon Schubert 	   mp_srcptr up, mp_size_t usize,
984b6a78b7SSimon Schubert 	   mp_ptr tp)
994b6a78b7SSimon Schubert {
1004b6a78b7SSimon Schubert   mp_size_t size;
1014b6a78b7SSimon Schubert   mp_size_t an;
1024b6a78b7SSimon Schubert   mp_size_t bn;
1034b6a78b7SSimon Schubert   mp_size_t vn;
1044b6a78b7SSimon Schubert 
1054b6a78b7SSimon Schubert   ASSERT (n > 0);
1064b6a78b7SSimon Schubert   ASSERT (gn > 0);
1074b6a78b7SSimon Schubert   ASSERT (usize != 0);
1084b6a78b7SSimon Schubert 
1094b6a78b7SSimon Schubert   size = ABS (usize);
1104b6a78b7SSimon Schubert   ASSERT (size <= n);
1114b6a78b7SSimon Schubert 
1124b6a78b7SSimon Schubert   an = n;
1134b6a78b7SSimon Schubert   MPN_NORMALIZE (ap, an);
1144b6a78b7SSimon Schubert 
1154b6a78b7SSimon Schubert   if (an >= size)
1164b6a78b7SSimon Schubert     mpn_mul (tp, ap, an, up, size);
1174b6a78b7SSimon Schubert   else
1184b6a78b7SSimon Schubert     mpn_mul (tp, up, size, ap, an);
1194b6a78b7SSimon Schubert 
1204b6a78b7SSimon Schubert   size += an;
12154028e53SJohn Marino   size -= tp[size - 1] == 0;
1224b6a78b7SSimon Schubert 
1234b6a78b7SSimon Schubert   ASSERT (gn <= size);
1244b6a78b7SSimon Schubert 
1254b6a78b7SSimon Schubert   if (usize > 0)
1264b6a78b7SSimon Schubert     {
1274b6a78b7SSimon Schubert       /* |v| = -v = (u a - g) / b */
1284b6a78b7SSimon Schubert 
1294b6a78b7SSimon Schubert       ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
1304b6a78b7SSimon Schubert       MPN_NORMALIZE (tp, size);
1314b6a78b7SSimon Schubert       if (size == 0)
1324b6a78b7SSimon Schubert 	return 0;
1334b6a78b7SSimon Schubert     }
1344b6a78b7SSimon Schubert   else
1354b6a78b7SSimon Schubert     { /* usize < 0 */
1364b6a78b7SSimon Schubert       /* |v| = v = (c - u a) / b = (c + |u| a) / b */
1374b6a78b7SSimon Schubert       mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
1384b6a78b7SSimon Schubert       if (cy)
1394b6a78b7SSimon Schubert 	tp[size++] = cy;
1404b6a78b7SSimon Schubert     }
1414b6a78b7SSimon Schubert 
1424b6a78b7SSimon Schubert   /* Now divide t / b. There must be no remainder */
1434b6a78b7SSimon Schubert   bn = n;
1444b6a78b7SSimon Schubert   MPN_NORMALIZE (bp, bn);
1454b6a78b7SSimon Schubert   ASSERT (size >= bn);
1464b6a78b7SSimon Schubert 
1474b6a78b7SSimon Schubert   vn = size + 1 - bn;
1484b6a78b7SSimon Schubert   ASSERT (vn <= n + 1);
1494b6a78b7SSimon Schubert 
15054028e53SJohn Marino   mpn_divexact (vp, tp, size, bp, bn);
1514b6a78b7SSimon Schubert   vn -= (vp[vn-1] == 0);
1524b6a78b7SSimon Schubert 
1534b6a78b7SSimon Schubert   return vn;
1544b6a78b7SSimon Schubert }
1554b6a78b7SSimon Schubert 
1564b6a78b7SSimon Schubert /* Temporary storage:
1574b6a78b7SSimon Schubert 
1584b6a78b7SSimon Schubert    Initial division: Quotient of at most an - n + 1 <= an limbs.
1594b6a78b7SSimon Schubert 
1604b6a78b7SSimon Schubert    Storage for u0 and u1: 2(n+1).
1614b6a78b7SSimon Schubert 
1624b6a78b7SSimon Schubert    Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
1634b6a78b7SSimon Schubert 
1644b6a78b7SSimon Schubert    Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
1654b6a78b7SSimon Schubert 
1664b6a78b7SSimon Schubert    When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
1674b6a78b7SSimon Schubert 
1684b6a78b7SSimon Schubert    When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
1694b6a78b7SSimon Schubert 
1704b6a78b7SSimon Schubert    For the lehmer call after the loop, Let T denote
1714b6a78b7SSimon Schubert    GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
1724b6a78b7SSimon Schubert    u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
17354028e53SJohn Marino    for u, T+1 for v and 2T + 1 scratch space. In all, 7T + 3 is
17454028e53SJohn Marino    sufficient for both operations.
1754b6a78b7SSimon Schubert 
1764b6a78b7SSimon Schubert */
1774b6a78b7SSimon Schubert 
1784b6a78b7SSimon Schubert /* Optimal choice of p seems difficult. In each iteration the division
1794b6a78b7SSimon Schubert  * of work between hgcd and the updates of u0 and u1 depends on the
1804b6a78b7SSimon Schubert  * current size of the u. It may be desirable to use a different
1814b6a78b7SSimon Schubert  * choice of p in each iteration. Also the input size seems to matter;
1824b6a78b7SSimon Schubert  * choosing p = n / 3 in the first iteration seems to improve
1834b6a78b7SSimon Schubert  * performance slightly for input size just above the threshold, but
1844b6a78b7SSimon Schubert  * degrade performance for larger inputs. */
1854b6a78b7SSimon Schubert #define CHOOSE_P_1(n) ((n) / 2)
1864b6a78b7SSimon Schubert #define CHOOSE_P_2(n) ((n) / 3)
1874b6a78b7SSimon Schubert 
1884b6a78b7SSimon Schubert mp_size_t
mpn_gcdext(mp_ptr gp,mp_ptr up,mp_size_t * usizep,mp_ptr ap,mp_size_t an,mp_ptr bp,mp_size_t n)1894b6a78b7SSimon Schubert mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
1904b6a78b7SSimon Schubert 	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
1914b6a78b7SSimon Schubert {
1924b6a78b7SSimon Schubert   mp_size_t talloc;
1934b6a78b7SSimon Schubert   mp_size_t scratch;
1944b6a78b7SSimon Schubert   mp_size_t matrix_scratch;
1954b6a78b7SSimon Schubert   mp_size_t ualloc = n + 1;
1964b6a78b7SSimon Schubert 
1974b6a78b7SSimon Schubert   mp_size_t un;
1984b6a78b7SSimon Schubert   mp_ptr u0;
1994b6a78b7SSimon Schubert   mp_ptr u1;
2004b6a78b7SSimon Schubert 
2014b6a78b7SSimon Schubert   mp_ptr tp;
2024b6a78b7SSimon Schubert 
2034b6a78b7SSimon Schubert   TMP_DECL;
2044b6a78b7SSimon Schubert 
2054b6a78b7SSimon Schubert   ASSERT (an >= n);
2064b6a78b7SSimon Schubert   ASSERT (n > 0);
2074b6a78b7SSimon Schubert 
2084b6a78b7SSimon Schubert   TMP_MARK;
2094b6a78b7SSimon Schubert 
2104b6a78b7SSimon Schubert   /* FIXME: Check for small sizes first, before setting up temporary
2114b6a78b7SSimon Schubert      storage etc. */
2124b6a78b7SSimon Schubert   talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);
2134b6a78b7SSimon Schubert 
2144b6a78b7SSimon Schubert   /* For initial division */
2154b6a78b7SSimon Schubert   scratch = an - n + 1;
2164b6a78b7SSimon Schubert   if (scratch > talloc)
2174b6a78b7SSimon Schubert     talloc = scratch;
2184b6a78b7SSimon Schubert 
2194b6a78b7SSimon Schubert   if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
2204b6a78b7SSimon Schubert     {
2214b6a78b7SSimon Schubert       /* For hgcd loop. */
2224b6a78b7SSimon Schubert       mp_size_t hgcd_scratch;
2234b6a78b7SSimon Schubert       mp_size_t update_scratch;
2244b6a78b7SSimon Schubert       mp_size_t p1 = CHOOSE_P_1 (n);
2254b6a78b7SSimon Schubert       mp_size_t p2 = CHOOSE_P_2 (n);
2264b6a78b7SSimon Schubert       mp_size_t min_p = MIN(p1, p2);
2274b6a78b7SSimon Schubert       mp_size_t max_p = MAX(p1, p2);
2284b6a78b7SSimon Schubert       matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
2294b6a78b7SSimon Schubert       hgcd_scratch = mpn_hgcd_itch (n - min_p);
2304b6a78b7SSimon Schubert       update_scratch = max_p + n - 1;
2314b6a78b7SSimon Schubert 
2324b6a78b7SSimon Schubert       scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
2334b6a78b7SSimon Schubert       if (scratch > talloc)
2344b6a78b7SSimon Schubert 	talloc = scratch;
2354b6a78b7SSimon Schubert 
2364b6a78b7SSimon Schubert       /* Final mpn_gcdext_lehmer_n call. Need space for u and for
2374b6a78b7SSimon Schubert 	 copies of a and b. */
2384b6a78b7SSimon Schubert       scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
2394b6a78b7SSimon Schubert 	+ 3*GCDEXT_DC_THRESHOLD;
2404b6a78b7SSimon Schubert 
2414b6a78b7SSimon Schubert       if (scratch > talloc)
2424b6a78b7SSimon Schubert 	talloc = scratch;
2434b6a78b7SSimon Schubert 
2444b6a78b7SSimon Schubert       /* Cofactors u0 and u1 */
2454b6a78b7SSimon Schubert       talloc += 2*(n+1);
2464b6a78b7SSimon Schubert     }
2474b6a78b7SSimon Schubert 
2484b6a78b7SSimon Schubert   tp = TMP_ALLOC_LIMBS(talloc);
2494b6a78b7SSimon Schubert 
2504b6a78b7SSimon Schubert   if (an > n)
2514b6a78b7SSimon Schubert     {
2524b6a78b7SSimon Schubert       mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);
2534b6a78b7SSimon Schubert 
2544b6a78b7SSimon Schubert       if (mpn_zero_p (ap, n))
2554b6a78b7SSimon Schubert 	{
2564b6a78b7SSimon Schubert 	  MPN_COPY (gp, bp, n);
2574b6a78b7SSimon Schubert 	  *usizep = 0;
2584b6a78b7SSimon Schubert 	  TMP_FREE;
2594b6a78b7SSimon Schubert 	  return n;
2604b6a78b7SSimon Schubert 	}
2614b6a78b7SSimon Schubert     }
2624b6a78b7SSimon Schubert 
2634b6a78b7SSimon Schubert   if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
2644b6a78b7SSimon Schubert     {
2654b6a78b7SSimon Schubert       mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);
2664b6a78b7SSimon Schubert 
2674b6a78b7SSimon Schubert       TMP_FREE;
2684b6a78b7SSimon Schubert       return gn;
2694b6a78b7SSimon Schubert     }
2704b6a78b7SSimon Schubert 
2714b6a78b7SSimon Schubert   MPN_ZERO (tp, 2*ualloc);
2724b6a78b7SSimon Schubert   u0 = tp; tp += ualloc;
2734b6a78b7SSimon Schubert   u1 = tp; tp += ualloc;
2744b6a78b7SSimon Schubert 
2754b6a78b7SSimon Schubert   {
2764b6a78b7SSimon Schubert     /* For the first hgcd call, there are no u updates, and it makes
2774b6a78b7SSimon Schubert        some sense to use a different choice for p. */
2784b6a78b7SSimon Schubert 
2794b6a78b7SSimon Schubert     /* FIXME: We could trim use of temporary storage, since u0 and u1
2804b6a78b7SSimon Schubert        are not used yet. For the hgcd call, we could swap in the u0
2814b6a78b7SSimon Schubert        and u1 pointers for the relevant matrix elements. */
2824b6a78b7SSimon Schubert 
2834b6a78b7SSimon Schubert     struct hgcd_matrix M;
2844b6a78b7SSimon Schubert     mp_size_t p = CHOOSE_P_1 (n);
2854b6a78b7SSimon Schubert     mp_size_t nn;
2864b6a78b7SSimon Schubert 
2874b6a78b7SSimon Schubert     mpn_hgcd_matrix_init (&M, n - p, tp);
2884b6a78b7SSimon Schubert     nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
2894b6a78b7SSimon Schubert     if (nn > 0)
2904b6a78b7SSimon Schubert       {
2914b6a78b7SSimon Schubert 	ASSERT (M.n <= (n - p - 1)/2);
2924b6a78b7SSimon Schubert 	ASSERT (M.n + p <= (p + n - 1) / 2);
2934b6a78b7SSimon Schubert 
2944b6a78b7SSimon Schubert 	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
2954b6a78b7SSimon Schubert 	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
2964b6a78b7SSimon Schubert 
2974b6a78b7SSimon Schubert 	MPN_COPY (u0, M.p[1][0], M.n);
2984b6a78b7SSimon Schubert 	MPN_COPY (u1, M.p[1][1], M.n);
2994b6a78b7SSimon Schubert 	un = M.n;
3004b6a78b7SSimon Schubert 	while ( (u0[un-1] | u1[un-1] ) == 0)
3014b6a78b7SSimon Schubert 	  un--;
3024b6a78b7SSimon Schubert       }
3034b6a78b7SSimon Schubert     else
3044b6a78b7SSimon Schubert       {
3054b6a78b7SSimon Schubert 	/* mpn_hgcd has failed. Then either one of a or b is very
3064b6a78b7SSimon Schubert 	   small, or the difference is very small. Perform one
3074b6a78b7SSimon Schubert 	   subtraction followed by one division. */
3084b6a78b7SSimon Schubert 	mp_size_t gn;
3094b6a78b7SSimon Schubert 	mp_size_t updated_un = 1;
3104b6a78b7SSimon Schubert 
3114b6a78b7SSimon Schubert 	u1[0] = 1;
3124b6a78b7SSimon Schubert 
3134b6a78b7SSimon Schubert 	/* Temporary storage 2n + 1 */
3144b6a78b7SSimon Schubert 	n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
3154b6a78b7SSimon Schubert 				    u0, u1, &updated_un, tp, tp + n);
3164b6a78b7SSimon Schubert 	if (n == 0)
3174b6a78b7SSimon Schubert 	  {
3184b6a78b7SSimon Schubert 	    TMP_FREE;
3194b6a78b7SSimon Schubert 	    return gn;
3204b6a78b7SSimon Schubert 	  }
3214b6a78b7SSimon Schubert 
3224b6a78b7SSimon Schubert 	un = updated_un;
3234b6a78b7SSimon Schubert 	ASSERT (un < ualloc);
3244b6a78b7SSimon Schubert       }
3254b6a78b7SSimon Schubert   }
3264b6a78b7SSimon Schubert 
3274b6a78b7SSimon Schubert   while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
3284b6a78b7SSimon Schubert     {
3294b6a78b7SSimon Schubert       struct hgcd_matrix M;
3304b6a78b7SSimon Schubert       mp_size_t p = CHOOSE_P_2 (n);
3314b6a78b7SSimon Schubert       mp_size_t nn;
3324b6a78b7SSimon Schubert 
3334b6a78b7SSimon Schubert       mpn_hgcd_matrix_init (&M, n - p, tp);
3344b6a78b7SSimon Schubert       nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
3354b6a78b7SSimon Schubert       if (nn > 0)
3364b6a78b7SSimon Schubert 	{
3374b6a78b7SSimon Schubert 	  mp_ptr t0;
3384b6a78b7SSimon Schubert 
3394b6a78b7SSimon Schubert 	  t0 = tp + matrix_scratch;
3404b6a78b7SSimon Schubert 	  ASSERT (M.n <= (n - p - 1)/2);
3414b6a78b7SSimon Schubert 	  ASSERT (M.n + p <= (p + n - 1) / 2);
3424b6a78b7SSimon Schubert 
3434b6a78b7SSimon Schubert 	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
3444b6a78b7SSimon Schubert 	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);
3454b6a78b7SSimon Schubert 
3464b6a78b7SSimon Schubert 	  /* By the same analysis as for mpn_hgcd_matrix_mul */
3474b6a78b7SSimon Schubert 	  ASSERT (M.n + un <= ualloc);
3484b6a78b7SSimon Schubert 
3494b6a78b7SSimon Schubert 	  /* FIXME: This copying could be avoided by some swapping of
3504b6a78b7SSimon Schubert 	   * pointers. May need more temporary storage, though. */
3514b6a78b7SSimon Schubert 	  MPN_COPY (t0, u0, un);
3524b6a78b7SSimon Schubert 
3534b6a78b7SSimon Schubert 	  /* Temporary storage ualloc */
3544b6a78b7SSimon Schubert 	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);
3554b6a78b7SSimon Schubert 
3564b6a78b7SSimon Schubert 	  ASSERT (un < ualloc);
3574b6a78b7SSimon Schubert 	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
3584b6a78b7SSimon Schubert 	}
3594b6a78b7SSimon Schubert       else
3604b6a78b7SSimon Schubert 	{
3614b6a78b7SSimon Schubert 	  /* mpn_hgcd has failed. Then either one of a or b is very
3624b6a78b7SSimon Schubert 	     small, or the difference is very small. Perform one
3634b6a78b7SSimon Schubert 	     subtraction followed by one division. */
3644b6a78b7SSimon Schubert 	  mp_size_t gn;
3654b6a78b7SSimon Schubert 	  mp_size_t updated_un = un;
3664b6a78b7SSimon Schubert 
3674b6a78b7SSimon Schubert 	  /* Temporary storage 2n + 1 */
3684b6a78b7SSimon Schubert 	  n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
3694b6a78b7SSimon Schubert 				      u0, u1, &updated_un, tp, tp + n);
3704b6a78b7SSimon Schubert 	  if (n == 0)
3714b6a78b7SSimon Schubert 	    {
3724b6a78b7SSimon Schubert 	      TMP_FREE;
3734b6a78b7SSimon Schubert 	      return gn;
3744b6a78b7SSimon Schubert 	    }
3754b6a78b7SSimon Schubert 
3764b6a78b7SSimon Schubert 	  un = updated_un;
3774b6a78b7SSimon Schubert 	  ASSERT (un < ualloc);
3784b6a78b7SSimon Schubert 	}
3794b6a78b7SSimon Schubert     }
3804b6a78b7SSimon Schubert 
3818b5d8148SSascha Wildner   if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
3824b6a78b7SSimon Schubert     {
3838b5d8148SSascha Wildner       /* Must return the smallest cofactor, +u1 or -u0 */
3848b5d8148SSascha Wildner       int c;
3858b5d8148SSascha Wildner 
3868b5d8148SSascha Wildner       MPN_COPY (gp, ap, n);
3878b5d8148SSascha Wildner 
3888b5d8148SSascha Wildner       MPN_CMP (c, u0, u1, un);
389*d2d4b659SJohn Marino       /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
390*d2d4b659SJohn Marino 	 this case we choose the cofactor + 1, corresponding to G = A
391*d2d4b659SJohn Marino 	 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
392*d2d4b659SJohn Marino       ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
3938b5d8148SSascha Wildner       if (c < 0)
3948b5d8148SSascha Wildner 	{
3958b5d8148SSascha Wildner 	  MPN_NORMALIZE (u0, un);
3964b6a78b7SSimon Schubert 	  MPN_COPY (up, u0, un);
3974b6a78b7SSimon Schubert 	  *usizep = -un;
3984b6a78b7SSimon Schubert 	}
3998b5d8148SSascha Wildner       else
4004b6a78b7SSimon Schubert 	{
4014b6a78b7SSimon Schubert 	  MPN_NORMALIZE_NOT_ZERO (u1, un);
4024b6a78b7SSimon Schubert 	  MPN_COPY (up, u1, un);
4034b6a78b7SSimon Schubert 	  *usizep = un;
4048b5d8148SSascha Wildner 	}
4054b6a78b7SSimon Schubert 
4064b6a78b7SSimon Schubert       TMP_FREE;
4074b6a78b7SSimon Schubert       return n;
4084b6a78b7SSimon Schubert     }
4094b6a78b7SSimon Schubert   else if (mpn_zero_p (u0, un))
4104b6a78b7SSimon Schubert     {
4114b6a78b7SSimon Schubert       mp_size_t gn;
4124b6a78b7SSimon Schubert       ASSERT (un == 1);
4134b6a78b7SSimon Schubert       ASSERT (u1[0] == 1);
4144b6a78b7SSimon Schubert 
4154b6a78b7SSimon Schubert       /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
4164b6a78b7SSimon Schubert       gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);
4174b6a78b7SSimon Schubert 
4184b6a78b7SSimon Schubert       TMP_FREE;
4194b6a78b7SSimon Schubert       return gn;
4204b6a78b7SSimon Schubert     }
4214b6a78b7SSimon Schubert   else
4224b6a78b7SSimon Schubert     {
4234b6a78b7SSimon Schubert       /* We have A = ... a + ... b
4244b6a78b7SSimon Schubert 		 B =  u0 a +  u1 b
4254b6a78b7SSimon Schubert 
4264b6a78b7SSimon Schubert 		 a = u1  A + ... B
4274b6a78b7SSimon Schubert 		 b = -u0 A + ... B
4284b6a78b7SSimon Schubert 
4294b6a78b7SSimon Schubert 	 with bounds
4304b6a78b7SSimon Schubert 
4314b6a78b7SSimon Schubert 	   |u0|, |u1| <= B / min(a, b)
4324b6a78b7SSimon Schubert 
4334b6a78b7SSimon Schubert 	 Compute g = u a + v b = (u u1 - v u0) A + (...) B
4344b6a78b7SSimon Schubert 	 Here, u, v are bounded by
4354b6a78b7SSimon Schubert 
4364b6a78b7SSimon Schubert 	 |u| <= b,
4374b6a78b7SSimon Schubert 	 |v| <= a
4384b6a78b7SSimon Schubert       */
4394b6a78b7SSimon Schubert 
4404b6a78b7SSimon Schubert       mp_size_t u0n;
4414b6a78b7SSimon Schubert       mp_size_t u1n;
4424b6a78b7SSimon Schubert       mp_size_t lehmer_un;
4434b6a78b7SSimon Schubert       mp_size_t lehmer_vn;
4444b6a78b7SSimon Schubert       mp_size_t gn;
4454b6a78b7SSimon Schubert 
4464b6a78b7SSimon Schubert       mp_ptr lehmer_up;
4474b6a78b7SSimon Schubert       mp_ptr lehmer_vp;
4484b6a78b7SSimon Schubert       int negate;
4494b6a78b7SSimon Schubert 
4504b6a78b7SSimon Schubert       lehmer_up = tp; tp += n;
4514b6a78b7SSimon Schubert 
4524b6a78b7SSimon Schubert       /* Call mpn_gcdext_lehmer_n with copies of a and b. */
4534b6a78b7SSimon Schubert       MPN_COPY (tp, ap, n);
4544b6a78b7SSimon Schubert       MPN_COPY (tp + n, bp, n);
4554b6a78b7SSimon Schubert       gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
4564b6a78b7SSimon Schubert 
4574b6a78b7SSimon Schubert       u0n = un;
4584b6a78b7SSimon Schubert       MPN_NORMALIZE (u0, u0n);
4594b6a78b7SSimon Schubert       if (lehmer_un == 0)
4604b6a78b7SSimon Schubert 	{
4614b6a78b7SSimon Schubert 	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
4624b6a78b7SSimon Schubert 	  MPN_COPY (up, u0, u0n);
4634b6a78b7SSimon Schubert 	  *usizep = -u0n;
4644b6a78b7SSimon Schubert 
4654b6a78b7SSimon Schubert 	  TMP_FREE;
4664b6a78b7SSimon Schubert 	  return gn;
4674b6a78b7SSimon Schubert 	}
4684b6a78b7SSimon Schubert 
4694b6a78b7SSimon Schubert       lehmer_vp = tp;
4704b6a78b7SSimon Schubert       /* Compute v = (g - u a) / b */
4714b6a78b7SSimon Schubert       lehmer_vn = compute_v (lehmer_vp,
4724b6a78b7SSimon Schubert 			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);
4734b6a78b7SSimon Schubert 
4744b6a78b7SSimon Schubert       if (lehmer_un > 0)
4754b6a78b7SSimon Schubert 	negate = 0;
4764b6a78b7SSimon Schubert       else
4774b6a78b7SSimon Schubert 	{
4784b6a78b7SSimon Schubert 	  lehmer_un = -lehmer_un;
4794b6a78b7SSimon Schubert 	  negate = 1;
4804b6a78b7SSimon Schubert 	}
4814b6a78b7SSimon Schubert 
4824b6a78b7SSimon Schubert       u1n = un;
4834b6a78b7SSimon Schubert       MPN_NORMALIZE (u1, u1n);
4844b6a78b7SSimon Schubert 
4854b6a78b7SSimon Schubert       /* It's possible that u0 = 1, u1 = 0 */
4864b6a78b7SSimon Schubert       if (u1n == 0)
4874b6a78b7SSimon Schubert 	{
4884b6a78b7SSimon Schubert 	  ASSERT (un == 1);
4894b6a78b7SSimon Schubert 	  ASSERT (u0[0] == 1);
4904b6a78b7SSimon Schubert 
4914b6a78b7SSimon Schubert 	  /* u1 == 0 ==> u u1 + v u0 = v */
4924b6a78b7SSimon Schubert 	  MPN_COPY (up, lehmer_vp, lehmer_vn);
4934b6a78b7SSimon Schubert 	  *usizep = negate ? lehmer_vn : - lehmer_vn;
4944b6a78b7SSimon Schubert 
4954b6a78b7SSimon Schubert 	  TMP_FREE;
4964b6a78b7SSimon Schubert 	  return gn;
4974b6a78b7SSimon Schubert 	}
4984b6a78b7SSimon Schubert 
4994b6a78b7SSimon Schubert       ASSERT (lehmer_un + u1n <= ualloc);
5004b6a78b7SSimon Schubert       ASSERT (lehmer_vn + u0n <= ualloc);
5014b6a78b7SSimon Schubert 
5024b6a78b7SSimon Schubert       /* Now u0, u1, u are non-zero. We may still have v == 0 */
5034b6a78b7SSimon Schubert 
5044b6a78b7SSimon Schubert       /* Compute u u0 */
5054b6a78b7SSimon Schubert       if (lehmer_un <= u1n)
5064b6a78b7SSimon Schubert 	/* Should be the common case */
5074b6a78b7SSimon Schubert 	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
5084b6a78b7SSimon Schubert       else
5094b6a78b7SSimon Schubert 	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);
5104b6a78b7SSimon Schubert 
5114b6a78b7SSimon Schubert       un = u1n + lehmer_un;
5124b6a78b7SSimon Schubert       un -= (up[un - 1] == 0);
5134b6a78b7SSimon Schubert 
5144b6a78b7SSimon Schubert       if (lehmer_vn > 0)
5154b6a78b7SSimon Schubert 	{
5164b6a78b7SSimon Schubert 	  mp_limb_t cy;
5174b6a78b7SSimon Schubert 
5184b6a78b7SSimon Schubert 	  /* Overwrites old u1 value */
5194b6a78b7SSimon Schubert 	  if (lehmer_vn <= u0n)
5204b6a78b7SSimon Schubert 	    /* Should be the common case */
5214b6a78b7SSimon Schubert 	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
5224b6a78b7SSimon Schubert 	  else
5234b6a78b7SSimon Schubert 	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);
5244b6a78b7SSimon Schubert 
5254b6a78b7SSimon Schubert 	  u1n = u0n + lehmer_vn;
5264b6a78b7SSimon Schubert 	  u1n -= (u1[u1n - 1] == 0);
5274b6a78b7SSimon Schubert 
5284b6a78b7SSimon Schubert 	  if (u1n <= un)
5294b6a78b7SSimon Schubert 	    {
5304b6a78b7SSimon Schubert 	      cy = mpn_add (up, up, un, u1, u1n);
5314b6a78b7SSimon Schubert 	    }
5324b6a78b7SSimon Schubert 	  else
5334b6a78b7SSimon Schubert 	    {
5344b6a78b7SSimon Schubert 	      cy = mpn_add (up, u1, u1n, up, un);
5354b6a78b7SSimon Schubert 	      un = u1n;
5364b6a78b7SSimon Schubert 	    }
5374b6a78b7SSimon Schubert 	  up[un] = cy;
5384b6a78b7SSimon Schubert 	  un += (cy != 0);
5394b6a78b7SSimon Schubert 
5404b6a78b7SSimon Schubert 	  ASSERT (un < ualloc);
5414b6a78b7SSimon Schubert 	}
5424b6a78b7SSimon Schubert       *usizep = negate ? -un : un;
5434b6a78b7SSimon Schubert 
5444b6a78b7SSimon Schubert       TMP_FREE;
5454b6a78b7SSimon Schubert       return gn;
5464b6a78b7SSimon Schubert     }
5474b6a78b7SSimon Schubert }
548