xref: /dflybsd-src/contrib/gmp/mpn/generic/gcdext.c (revision 8b5d8148d3631a3bdb67aafad46263c0cc3a7529)
14b6a78b7SSimon Schubert /* mpn_gcdext -- Extended Greatest Common Divisor.
24b6a78b7SSimon Schubert 
3*8b5d8148SSascha Wildner Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
44b6a78b7SSimon Schubert Foundation, Inc.
54b6a78b7SSimon Schubert 
64b6a78b7SSimon Schubert This file is part of the GNU MP Library.
74b6a78b7SSimon Schubert 
84b6a78b7SSimon Schubert The GNU MP Library is free software; you can redistribute it and/or modify
94b6a78b7SSimon Schubert it under the terms of the GNU Lesser General Public License as published by
104b6a78b7SSimon Schubert the Free Software Foundation; either version 3 of the License, or (at your
114b6a78b7SSimon Schubert option) any later version.
124b6a78b7SSimon Schubert 
134b6a78b7SSimon Schubert The GNU MP Library is distributed in the hope that it will be useful, but
144b6a78b7SSimon Schubert WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
154b6a78b7SSimon Schubert or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
164b6a78b7SSimon Schubert License for more details.
174b6a78b7SSimon Schubert 
184b6a78b7SSimon Schubert You should have received a copy of the GNU Lesser General Public License
194b6a78b7SSimon Schubert along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
204b6a78b7SSimon Schubert 
214b6a78b7SSimon Schubert #include "gmp.h"
224b6a78b7SSimon Schubert #include "gmp-impl.h"
234b6a78b7SSimon Schubert #include "longlong.h"
244b6a78b7SSimon Schubert 
254b6a78b7SSimon Schubert /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
264b6a78b7SSimon Schubert    the size is returned (if inputs are non-normalized, result may be
274b6a78b7SSimon Schubert    non-normalized too). Temporary space needed is M->n + n.
284b6a78b7SSimon Schubert  */
294b6a78b7SSimon Schubert static size_t
304b6a78b7SSimon Schubert hgcd_mul_matrix_vector (struct hgcd_matrix *M,
314b6a78b7SSimon Schubert 			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
324b6a78b7SSimon Schubert {
334b6a78b7SSimon Schubert   mp_limb_t ah, bh;
344b6a78b7SSimon Schubert 
354b6a78b7SSimon Schubert   /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
364b6a78b7SSimon Schubert 
374b6a78b7SSimon Schubert      t  = u00 * a
384b6a78b7SSimon Schubert      r  = u10 * b
394b6a78b7SSimon Schubert      r += t;
404b6a78b7SSimon Schubert 
414b6a78b7SSimon Schubert      t  = u11 * b
424b6a78b7SSimon Schubert      b  = u01 * a
434b6a78b7SSimon Schubert      b += t;
444b6a78b7SSimon Schubert   */
454b6a78b7SSimon Schubert 
464b6a78b7SSimon Schubert   if (M->n >= n)
474b6a78b7SSimon Schubert     {
484b6a78b7SSimon Schubert       mpn_mul (tp, M->p[0][0], M->n, ap, n);
494b6a78b7SSimon Schubert       mpn_mul (rp, M->p[1][0], M->n, bp, n);
504b6a78b7SSimon Schubert     }
514b6a78b7SSimon Schubert   else
524b6a78b7SSimon Schubert     {
534b6a78b7SSimon Schubert       mpn_mul (tp, ap, n, M->p[0][0], M->n);
544b6a78b7SSimon Schubert       mpn_mul (rp, bp, n, M->p[1][0], M->n);
554b6a78b7SSimon Schubert     }
564b6a78b7SSimon Schubert 
574b6a78b7SSimon Schubert   ah = mpn_add_n (rp, rp, tp, n + M->n);
584b6a78b7SSimon Schubert 
594b6a78b7SSimon Schubert   if (M->n >= n)
604b6a78b7SSimon Schubert     {
614b6a78b7SSimon Schubert       mpn_mul (tp, M->p[1][1], M->n, bp, n);
624b6a78b7SSimon Schubert       mpn_mul (bp, M->p[0][1], M->n, ap, n);
634b6a78b7SSimon Schubert     }
644b6a78b7SSimon Schubert   else
654b6a78b7SSimon Schubert     {
664b6a78b7SSimon Schubert       mpn_mul (tp, bp, n, M->p[1][1], M->n);
674b6a78b7SSimon Schubert       mpn_mul (bp, ap, n, M->p[0][1], M->n);
684b6a78b7SSimon Schubert     }
694b6a78b7SSimon Schubert   bh = mpn_add_n (bp, bp, tp, n + M->n);
704b6a78b7SSimon Schubert 
714b6a78b7SSimon Schubert   n += M->n;
724b6a78b7SSimon Schubert   if ( (ah | bh) > 0)
734b6a78b7SSimon Schubert     {
744b6a78b7SSimon Schubert       rp[n] = ah;
754b6a78b7SSimon Schubert       bp[n] = bh;
764b6a78b7SSimon Schubert       n++;
774b6a78b7SSimon Schubert     }
784b6a78b7SSimon Schubert   else
794b6a78b7SSimon Schubert     {
804b6a78b7SSimon Schubert       /* Normalize */
814b6a78b7SSimon Schubert       while ( (rp[n-1] | bp[n-1]) == 0)
824b6a78b7SSimon Schubert 	n--;
834b6a78b7SSimon Schubert     }
844b6a78b7SSimon Schubert 
854b6a78b7SSimon Schubert   return n;
864b6a78b7SSimon Schubert }
874b6a78b7SSimon Schubert 
884b6a78b7SSimon Schubert #define COMPUTE_V_ITCH(n) (2*(n) + 1)
894b6a78b7SSimon Schubert 
904b6a78b7SSimon Schubert /* Computes |v| = |(g - u a)| / b, where u may be positive or
914b6a78b7SSimon Schubert    negative, and v is of the opposite sign. a, b are of size n, u and
924b6a78b7SSimon Schubert    v at most size n, and v must have space for n+1 limbs. */
934b6a78b7SSimon Schubert static mp_size_t
944b6a78b7SSimon Schubert compute_v (mp_ptr vp,
954b6a78b7SSimon Schubert 	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
964b6a78b7SSimon Schubert 	   mp_srcptr gp, mp_size_t gn,
974b6a78b7SSimon Schubert 	   mp_srcptr up, mp_size_t usize,
984b6a78b7SSimon Schubert 	   mp_ptr tp)
994b6a78b7SSimon Schubert {
1004b6a78b7SSimon Schubert   mp_size_t size;
1014b6a78b7SSimon Schubert   mp_size_t an;
1024b6a78b7SSimon Schubert   mp_size_t bn;
1034b6a78b7SSimon Schubert   mp_size_t vn;
1044b6a78b7SSimon Schubert 
1054b6a78b7SSimon Schubert   ASSERT (n > 0);
1064b6a78b7SSimon Schubert   ASSERT (gn > 0);
1074b6a78b7SSimon Schubert   ASSERT (usize != 0);
1084b6a78b7SSimon Schubert 
1094b6a78b7SSimon Schubert   size = ABS (usize);
1104b6a78b7SSimon Schubert   ASSERT (size <= n);
1114b6a78b7SSimon Schubert 
1124b6a78b7SSimon Schubert   an = n;
1134b6a78b7SSimon Schubert   MPN_NORMALIZE (ap, an);
1144b6a78b7SSimon Schubert 
1154b6a78b7SSimon Schubert   if (an >= size)
1164b6a78b7SSimon Schubert     mpn_mul (tp, ap, an, up, size);
1174b6a78b7SSimon Schubert   else
1184b6a78b7SSimon Schubert     mpn_mul (tp, up, size, ap, an);
1194b6a78b7SSimon Schubert 
1204b6a78b7SSimon Schubert   size += an;
1214b6a78b7SSimon Schubert 
1224b6a78b7SSimon Schubert   ASSERT (gn <= size);
1234b6a78b7SSimon Schubert 
1244b6a78b7SSimon Schubert   if (usize > 0)
1254b6a78b7SSimon Schubert     {
1264b6a78b7SSimon Schubert       /* |v| = -v = (u a - g) / b */
1274b6a78b7SSimon Schubert 
1284b6a78b7SSimon Schubert       ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
1294b6a78b7SSimon Schubert       MPN_NORMALIZE (tp, size);
1304b6a78b7SSimon Schubert       if (size == 0)
1314b6a78b7SSimon Schubert 	return 0;
1324b6a78b7SSimon Schubert     }
1334b6a78b7SSimon Schubert   else
1344b6a78b7SSimon Schubert     { /* usize < 0 */
1354b6a78b7SSimon Schubert       /* |v| = v = (c - u a) / b = (c + |u| a) / b */
1364b6a78b7SSimon Schubert       mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
1374b6a78b7SSimon Schubert       if (cy)
1384b6a78b7SSimon Schubert 	tp[size++] = cy;
1394b6a78b7SSimon Schubert     }
1404b6a78b7SSimon Schubert 
1414b6a78b7SSimon Schubert   /* Now divide t / b. There must be no remainder */
1424b6a78b7SSimon Schubert   bn = n;
1434b6a78b7SSimon Schubert   MPN_NORMALIZE (bp, bn);
1444b6a78b7SSimon Schubert   ASSERT (size >= bn);
1454b6a78b7SSimon Schubert 
1464b6a78b7SSimon Schubert   vn = size + 1 - bn;
1474b6a78b7SSimon Schubert   ASSERT (vn <= n + 1);
1484b6a78b7SSimon Schubert 
1494b6a78b7SSimon Schubert   /* FIXME: Use divexact. Or do the entire calculation mod 2^{n *
1504b6a78b7SSimon Schubert      GMP_NUMB_BITS}. */
1514b6a78b7SSimon Schubert   mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn);
1524b6a78b7SSimon Schubert   vn -= (vp[vn-1] == 0);
1534b6a78b7SSimon Schubert 
1544b6a78b7SSimon Schubert   /* Remainder must be zero */
1554b6a78b7SSimon Schubert #if WANT_ASSERT
1564b6a78b7SSimon Schubert   {
1574b6a78b7SSimon Schubert     mp_size_t i;
1584b6a78b7SSimon Schubert     for (i = 0; i < bn; i++)
1594b6a78b7SSimon Schubert       {
1604b6a78b7SSimon Schubert 	ASSERT (tp[i] == 0);
1614b6a78b7SSimon Schubert       }
1624b6a78b7SSimon Schubert   }
1634b6a78b7SSimon Schubert #endif
1644b6a78b7SSimon Schubert   return vn;
1654b6a78b7SSimon Schubert }
1664b6a78b7SSimon Schubert 
1674b6a78b7SSimon Schubert /* Temporary storage:
1684b6a78b7SSimon Schubert 
1694b6a78b7SSimon Schubert    Initial division: Quotient of at most an - n + 1 <= an limbs.
1704b6a78b7SSimon Schubert 
1714b6a78b7SSimon Schubert    Storage for u0 and u1: 2(n+1).
1724b6a78b7SSimon Schubert 
1734b6a78b7SSimon Schubert    Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
1744b6a78b7SSimon Schubert 
1754b6a78b7SSimon Schubert    Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
1764b6a78b7SSimon Schubert 
1774b6a78b7SSimon Schubert    When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
1784b6a78b7SSimon Schubert 
1794b6a78b7SSimon Schubert    When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
1804b6a78b7SSimon Schubert 
1814b6a78b7SSimon Schubert    For the lehmer call after the loop, Let T denote
1824b6a78b7SSimon Schubert    GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
1834b6a78b7SSimon Schubert    u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
1844b6a78b7SSimon Schubert    + 1 for v and 2T + 1 scratch space. In all, 7T + 3 is sufficient.
1854b6a78b7SSimon Schubert 
1864b6a78b7SSimon Schubert */
1874b6a78b7SSimon Schubert 
1884b6a78b7SSimon Schubert /* Optimal choice of p seems difficult. In each iteration the division
1894b6a78b7SSimon Schubert  * of work between hgcd and the updates of u0 and u1 depends on the
1904b6a78b7SSimon Schubert  * current size of the u. It may be desirable to use a different
1914b6a78b7SSimon Schubert  * choice of p in each iteration. Also the input size seems to matter;
1924b6a78b7SSimon Schubert  * choosing p = n / 3 in the first iteration seems to improve
1934b6a78b7SSimon Schubert  * performance slightly for input size just above the threshold, but
1944b6a78b7SSimon Schubert  * degrade performance for larger inputs. */
1954b6a78b7SSimon Schubert #define CHOOSE_P_1(n) ((n) / 2)
1964b6a78b7SSimon Schubert #define CHOOSE_P_2(n) ((n) / 3)
1974b6a78b7SSimon Schubert 
1984b6a78b7SSimon Schubert mp_size_t
1994b6a78b7SSimon Schubert mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
2004b6a78b7SSimon Schubert 	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
2014b6a78b7SSimon Schubert {
2024b6a78b7SSimon Schubert   mp_size_t talloc;
2034b6a78b7SSimon Schubert   mp_size_t scratch;
2044b6a78b7SSimon Schubert   mp_size_t matrix_scratch;
2054b6a78b7SSimon Schubert   mp_size_t ualloc = n + 1;
2064b6a78b7SSimon Schubert 
2074b6a78b7SSimon Schubert   mp_size_t un;
2084b6a78b7SSimon Schubert   mp_ptr u0;
2094b6a78b7SSimon Schubert   mp_ptr u1;
2104b6a78b7SSimon Schubert 
2114b6a78b7SSimon Schubert   mp_ptr tp;
2124b6a78b7SSimon Schubert 
2134b6a78b7SSimon Schubert   TMP_DECL;
2144b6a78b7SSimon Schubert 
2154b6a78b7SSimon Schubert   ASSERT (an >= n);
2164b6a78b7SSimon Schubert   ASSERT (n > 0);
2174b6a78b7SSimon Schubert 
2184b6a78b7SSimon Schubert   TMP_MARK;
2194b6a78b7SSimon Schubert 
2204b6a78b7SSimon Schubert   /* FIXME: Check for small sizes first, before setting up temporary
2214b6a78b7SSimon Schubert      storage etc. */
2224b6a78b7SSimon Schubert   talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);
2234b6a78b7SSimon Schubert 
2244b6a78b7SSimon Schubert   /* For initial division */
2254b6a78b7SSimon Schubert   scratch = an - n + 1;
2264b6a78b7SSimon Schubert   if (scratch > talloc)
2274b6a78b7SSimon Schubert     talloc = scratch;
2284b6a78b7SSimon Schubert 
2294b6a78b7SSimon Schubert   if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
2304b6a78b7SSimon Schubert     {
2314b6a78b7SSimon Schubert       /* For hgcd loop. */
2324b6a78b7SSimon Schubert       mp_size_t hgcd_scratch;
2334b6a78b7SSimon Schubert       mp_size_t update_scratch;
2344b6a78b7SSimon Schubert       mp_size_t p1 = CHOOSE_P_1 (n);
2354b6a78b7SSimon Schubert       mp_size_t p2 = CHOOSE_P_2 (n);
2364b6a78b7SSimon Schubert       mp_size_t min_p = MIN(p1, p2);
2374b6a78b7SSimon Schubert       mp_size_t max_p = MAX(p1, p2);
2384b6a78b7SSimon Schubert       matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
2394b6a78b7SSimon Schubert       hgcd_scratch = mpn_hgcd_itch (n - min_p);
2404b6a78b7SSimon Schubert       update_scratch = max_p + n - 1;
2414b6a78b7SSimon Schubert 
2424b6a78b7SSimon Schubert       scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
2434b6a78b7SSimon Schubert       if (scratch > talloc)
2444b6a78b7SSimon Schubert 	talloc = scratch;
2454b6a78b7SSimon Schubert 
2464b6a78b7SSimon Schubert       /* Final mpn_gcdext_lehmer_n call. Need space for u and for
2474b6a78b7SSimon Schubert 	 copies of a and b. */
2484b6a78b7SSimon Schubert       scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
2494b6a78b7SSimon Schubert 	+ 3*GCDEXT_DC_THRESHOLD;
2504b6a78b7SSimon Schubert 
2514b6a78b7SSimon Schubert       if (scratch > talloc)
2524b6a78b7SSimon Schubert 	talloc = scratch;
2534b6a78b7SSimon Schubert 
2544b6a78b7SSimon Schubert       /* Cofactors u0 and u1 */
2554b6a78b7SSimon Schubert       talloc += 2*(n+1);
2564b6a78b7SSimon Schubert     }
2574b6a78b7SSimon Schubert 
2584b6a78b7SSimon Schubert   tp = TMP_ALLOC_LIMBS(talloc);
2594b6a78b7SSimon Schubert 
2604b6a78b7SSimon Schubert   if (an > n)
2614b6a78b7SSimon Schubert     {
2624b6a78b7SSimon Schubert       mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);
2634b6a78b7SSimon Schubert 
2644b6a78b7SSimon Schubert       if (mpn_zero_p (ap, n))
2654b6a78b7SSimon Schubert 	{
2664b6a78b7SSimon Schubert 	  MPN_COPY (gp, bp, n);
2674b6a78b7SSimon Schubert 	  *usizep = 0;
2684b6a78b7SSimon Schubert 	  TMP_FREE;
2694b6a78b7SSimon Schubert 	  return n;
2704b6a78b7SSimon Schubert 	}
2714b6a78b7SSimon Schubert     }
2724b6a78b7SSimon Schubert 
2734b6a78b7SSimon Schubert   if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
2744b6a78b7SSimon Schubert     {
2754b6a78b7SSimon Schubert       mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);
2764b6a78b7SSimon Schubert 
2774b6a78b7SSimon Schubert       TMP_FREE;
2784b6a78b7SSimon Schubert       return gn;
2794b6a78b7SSimon Schubert     }
2804b6a78b7SSimon Schubert 
2814b6a78b7SSimon Schubert   MPN_ZERO (tp, 2*ualloc);
2824b6a78b7SSimon Schubert   u0 = tp; tp += ualloc;
2834b6a78b7SSimon Schubert   u1 = tp; tp += ualloc;
2844b6a78b7SSimon Schubert 
2854b6a78b7SSimon Schubert   {
2864b6a78b7SSimon Schubert     /* For the first hgcd call, there are no u updates, and it makes
2874b6a78b7SSimon Schubert        some sense to use a different choice for p. */
2884b6a78b7SSimon Schubert 
2894b6a78b7SSimon Schubert     /* FIXME: We could trim use of temporary storage, since u0 and u1
2904b6a78b7SSimon Schubert        are not used yet. For the hgcd call, we could swap in the u0
2914b6a78b7SSimon Schubert        and u1 pointers for the relevant matrix elements. */
2924b6a78b7SSimon Schubert 
2934b6a78b7SSimon Schubert     struct hgcd_matrix M;
2944b6a78b7SSimon Schubert     mp_size_t p = CHOOSE_P_1 (n);
2954b6a78b7SSimon Schubert     mp_size_t nn;
2964b6a78b7SSimon Schubert 
2974b6a78b7SSimon Schubert     mpn_hgcd_matrix_init (&M, n - p, tp);
2984b6a78b7SSimon Schubert     nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
2994b6a78b7SSimon Schubert     if (nn > 0)
3004b6a78b7SSimon Schubert       {
3014b6a78b7SSimon Schubert 	ASSERT (M.n <= (n - p - 1)/2);
3024b6a78b7SSimon Schubert 	ASSERT (M.n + p <= (p + n - 1) / 2);
3034b6a78b7SSimon Schubert 
3044b6a78b7SSimon Schubert 	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
3054b6a78b7SSimon Schubert 	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
3064b6a78b7SSimon Schubert 
3074b6a78b7SSimon Schubert 	MPN_COPY (u0, M.p[1][0], M.n);
3084b6a78b7SSimon Schubert 	MPN_COPY (u1, M.p[1][1], M.n);
3094b6a78b7SSimon Schubert 	un = M.n;
3104b6a78b7SSimon Schubert 	while ( (u0[un-1] | u1[un-1] ) == 0)
3114b6a78b7SSimon Schubert 	  un--;
3124b6a78b7SSimon Schubert       }
3134b6a78b7SSimon Schubert     else
3144b6a78b7SSimon Schubert       {
3154b6a78b7SSimon Schubert 	/* mpn_hgcd has failed. Then either one of a or b is very
3164b6a78b7SSimon Schubert 	   small, or the difference is very small. Perform one
3174b6a78b7SSimon Schubert 	   subtraction followed by one division. */
3184b6a78b7SSimon Schubert 	mp_size_t gn;
3194b6a78b7SSimon Schubert 	mp_size_t updated_un = 1;
3204b6a78b7SSimon Schubert 
3214b6a78b7SSimon Schubert 	u1[0] = 1;
3224b6a78b7SSimon Schubert 
3234b6a78b7SSimon Schubert 	/* Temporary storage 2n + 1 */
3244b6a78b7SSimon Schubert 	n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
3254b6a78b7SSimon Schubert 				    u0, u1, &updated_un, tp, tp + n);
3264b6a78b7SSimon Schubert 	if (n == 0)
3274b6a78b7SSimon Schubert 	  {
3284b6a78b7SSimon Schubert 	    TMP_FREE;
3294b6a78b7SSimon Schubert 	    return gn;
3304b6a78b7SSimon Schubert 	  }
3314b6a78b7SSimon Schubert 
3324b6a78b7SSimon Schubert 	un = updated_un;
3334b6a78b7SSimon Schubert 	ASSERT (un < ualloc);
3344b6a78b7SSimon Schubert       }
3354b6a78b7SSimon Schubert   }
3364b6a78b7SSimon Schubert 
3374b6a78b7SSimon Schubert   while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
3384b6a78b7SSimon Schubert     {
3394b6a78b7SSimon Schubert       struct hgcd_matrix M;
3404b6a78b7SSimon Schubert       mp_size_t p = CHOOSE_P_2 (n);
3414b6a78b7SSimon Schubert       mp_size_t nn;
3424b6a78b7SSimon Schubert 
3434b6a78b7SSimon Schubert       mpn_hgcd_matrix_init (&M, n - p, tp);
3444b6a78b7SSimon Schubert       nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
3454b6a78b7SSimon Schubert       if (nn > 0)
3464b6a78b7SSimon Schubert 	{
3474b6a78b7SSimon Schubert 	  mp_ptr t0;
3484b6a78b7SSimon Schubert 
3494b6a78b7SSimon Schubert 	  t0 = tp + matrix_scratch;
3504b6a78b7SSimon Schubert 	  ASSERT (M.n <= (n - p - 1)/2);
3514b6a78b7SSimon Schubert 	  ASSERT (M.n + p <= (p + n - 1) / 2);
3524b6a78b7SSimon Schubert 
3534b6a78b7SSimon Schubert 	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
3544b6a78b7SSimon Schubert 	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);
3554b6a78b7SSimon Schubert 
3564b6a78b7SSimon Schubert 	  /* By the same analysis as for mpn_hgcd_matrix_mul */
3574b6a78b7SSimon Schubert 	  ASSERT (M.n + un <= ualloc);
3584b6a78b7SSimon Schubert 
3594b6a78b7SSimon Schubert 	  /* FIXME: This copying could be avoided by some swapping of
3604b6a78b7SSimon Schubert 	   * pointers. May need more temporary storage, though. */
3614b6a78b7SSimon Schubert 	  MPN_COPY (t0, u0, un);
3624b6a78b7SSimon Schubert 
3634b6a78b7SSimon Schubert 	  /* Temporary storage ualloc */
3644b6a78b7SSimon Schubert 	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);
3654b6a78b7SSimon Schubert 
3664b6a78b7SSimon Schubert 	  ASSERT (un < ualloc);
3674b6a78b7SSimon Schubert 	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
3684b6a78b7SSimon Schubert 	}
3694b6a78b7SSimon Schubert       else
3704b6a78b7SSimon Schubert 	{
3714b6a78b7SSimon Schubert 	  /* mpn_hgcd has failed. Then either one of a or b is very
3724b6a78b7SSimon Schubert 	     small, or the difference is very small. Perform one
3734b6a78b7SSimon Schubert 	     subtraction followed by one division. */
3744b6a78b7SSimon Schubert 	  mp_size_t gn;
3754b6a78b7SSimon Schubert 	  mp_size_t updated_un = un;
3764b6a78b7SSimon Schubert 
3774b6a78b7SSimon Schubert 	  /* Temporary storage 2n + 1 */
3784b6a78b7SSimon Schubert 	  n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
3794b6a78b7SSimon Schubert 				      u0, u1, &updated_un, tp, tp + n);
3804b6a78b7SSimon Schubert 	  if (n == 0)
3814b6a78b7SSimon Schubert 	    {
3824b6a78b7SSimon Schubert 	      TMP_FREE;
3834b6a78b7SSimon Schubert 	      return gn;
3844b6a78b7SSimon Schubert 	    }
3854b6a78b7SSimon Schubert 
3864b6a78b7SSimon Schubert 	  un = updated_un;
3874b6a78b7SSimon Schubert 	  ASSERT (un < ualloc);
3884b6a78b7SSimon Schubert 	}
3894b6a78b7SSimon Schubert     }
3904b6a78b7SSimon Schubert 
391*8b5d8148SSascha Wildner   if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
3924b6a78b7SSimon Schubert     {
393*8b5d8148SSascha Wildner       /* Must return the smallest cofactor, +u1 or -u0 */
394*8b5d8148SSascha Wildner       int c;
395*8b5d8148SSascha Wildner 
396*8b5d8148SSascha Wildner       MPN_COPY (gp, ap, n);
397*8b5d8148SSascha Wildner 
398*8b5d8148SSascha Wildner       MPN_CMP (c, u0, u1, un);
399*8b5d8148SSascha Wildner       ASSERT (c != 0);
400*8b5d8148SSascha Wildner       if (c < 0)
401*8b5d8148SSascha Wildner 	{
402*8b5d8148SSascha Wildner 	  MPN_NORMALIZE (u0, un);
4034b6a78b7SSimon Schubert 	  MPN_COPY (up, u0, un);
4044b6a78b7SSimon Schubert 	  *usizep = -un;
4054b6a78b7SSimon Schubert 	}
406*8b5d8148SSascha Wildner       else
4074b6a78b7SSimon Schubert 	{
4084b6a78b7SSimon Schubert 	  MPN_NORMALIZE_NOT_ZERO (u1, un);
4094b6a78b7SSimon Schubert 	  MPN_COPY (up, u1, un);
4104b6a78b7SSimon Schubert 	  *usizep = un;
411*8b5d8148SSascha Wildner 	}
4124b6a78b7SSimon Schubert 
4134b6a78b7SSimon Schubert       TMP_FREE;
4144b6a78b7SSimon Schubert       return n;
4154b6a78b7SSimon Schubert     }
4164b6a78b7SSimon Schubert   else if (mpn_zero_p (u0, un))
4174b6a78b7SSimon Schubert     {
4184b6a78b7SSimon Schubert       mp_size_t gn;
4194b6a78b7SSimon Schubert       ASSERT (un == 1);
4204b6a78b7SSimon Schubert       ASSERT (u1[0] == 1);
4214b6a78b7SSimon Schubert 
4224b6a78b7SSimon Schubert       /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
4234b6a78b7SSimon Schubert       gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);
4244b6a78b7SSimon Schubert 
4254b6a78b7SSimon Schubert       TMP_FREE;
4264b6a78b7SSimon Schubert       return gn;
4274b6a78b7SSimon Schubert     }
4284b6a78b7SSimon Schubert   else
4294b6a78b7SSimon Schubert     {
4304b6a78b7SSimon Schubert       /* We have A = ... a + ... b
4314b6a78b7SSimon Schubert 		 B =  u0 a +  u1 b
4324b6a78b7SSimon Schubert 
4334b6a78b7SSimon Schubert 		 a = u1  A + ... B
4344b6a78b7SSimon Schubert 		 b = -u0 A + ... B
4354b6a78b7SSimon Schubert 
4364b6a78b7SSimon Schubert 	 with bounds
4374b6a78b7SSimon Schubert 
4384b6a78b7SSimon Schubert 	   |u0|, |u1| <= B / min(a, b)
4394b6a78b7SSimon Schubert 
4404b6a78b7SSimon Schubert 	 Compute g = u a + v b = (u u1 - v u0) A + (...) B
4414b6a78b7SSimon Schubert 	 Here, u, v are bounded by
4424b6a78b7SSimon Schubert 
4434b6a78b7SSimon Schubert 	 |u| <= b,
4444b6a78b7SSimon Schubert 	 |v| <= a
4454b6a78b7SSimon Schubert       */
4464b6a78b7SSimon Schubert 
4474b6a78b7SSimon Schubert       mp_size_t u0n;
4484b6a78b7SSimon Schubert       mp_size_t u1n;
4494b6a78b7SSimon Schubert       mp_size_t lehmer_un;
4504b6a78b7SSimon Schubert       mp_size_t lehmer_vn;
4514b6a78b7SSimon Schubert       mp_size_t gn;
4524b6a78b7SSimon Schubert 
4534b6a78b7SSimon Schubert       mp_ptr lehmer_up;
4544b6a78b7SSimon Schubert       mp_ptr lehmer_vp;
4554b6a78b7SSimon Schubert       int negate;
4564b6a78b7SSimon Schubert 
4574b6a78b7SSimon Schubert       lehmer_up = tp; tp += n;
4584b6a78b7SSimon Schubert 
4594b6a78b7SSimon Schubert       /* Call mpn_gcdext_lehmer_n with copies of a and b. */
4604b6a78b7SSimon Schubert       MPN_COPY (tp, ap, n);
4614b6a78b7SSimon Schubert       MPN_COPY (tp + n, bp, n);
4624b6a78b7SSimon Schubert       gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
4634b6a78b7SSimon Schubert 
4644b6a78b7SSimon Schubert       u0n = un;
4654b6a78b7SSimon Schubert       MPN_NORMALIZE (u0, u0n);
4664b6a78b7SSimon Schubert       if (lehmer_un == 0)
4674b6a78b7SSimon Schubert 	{
4684b6a78b7SSimon Schubert 	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
4694b6a78b7SSimon Schubert 	  MPN_COPY (up, u0, u0n);
4704b6a78b7SSimon Schubert 	  *usizep = -u0n;
4714b6a78b7SSimon Schubert 
4724b6a78b7SSimon Schubert 	  TMP_FREE;
4734b6a78b7SSimon Schubert 	  return gn;
4744b6a78b7SSimon Schubert 	}
4754b6a78b7SSimon Schubert 
4764b6a78b7SSimon Schubert       lehmer_vp = tp;
4774b6a78b7SSimon Schubert       /* Compute v = (g - u a) / b */
4784b6a78b7SSimon Schubert       lehmer_vn = compute_v (lehmer_vp,
4794b6a78b7SSimon Schubert 			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);
4804b6a78b7SSimon Schubert 
4814b6a78b7SSimon Schubert       if (lehmer_un > 0)
4824b6a78b7SSimon Schubert 	negate = 0;
4834b6a78b7SSimon Schubert       else
4844b6a78b7SSimon Schubert 	{
4854b6a78b7SSimon Schubert 	  lehmer_un = -lehmer_un;
4864b6a78b7SSimon Schubert 	  negate = 1;
4874b6a78b7SSimon Schubert 	}
4884b6a78b7SSimon Schubert 
4894b6a78b7SSimon Schubert       u1n = un;
4904b6a78b7SSimon Schubert       MPN_NORMALIZE (u1, u1n);
4914b6a78b7SSimon Schubert 
4924b6a78b7SSimon Schubert       /* It's possible that u0 = 1, u1 = 0 */
4934b6a78b7SSimon Schubert       if (u1n == 0)
4944b6a78b7SSimon Schubert 	{
4954b6a78b7SSimon Schubert 	  ASSERT (un == 1);
4964b6a78b7SSimon Schubert 	  ASSERT (u0[0] == 1);
4974b6a78b7SSimon Schubert 
4984b6a78b7SSimon Schubert 	  /* u1 == 0 ==> u u1 + v u0 = v */
4994b6a78b7SSimon Schubert 	  MPN_COPY (up, lehmer_vp, lehmer_vn);
5004b6a78b7SSimon Schubert 	  *usizep = negate ? lehmer_vn : - lehmer_vn;
5014b6a78b7SSimon Schubert 
5024b6a78b7SSimon Schubert 	  TMP_FREE;
5034b6a78b7SSimon Schubert 	  return gn;
5044b6a78b7SSimon Schubert 	}
5054b6a78b7SSimon Schubert 
5064b6a78b7SSimon Schubert       ASSERT (lehmer_un + u1n <= ualloc);
5074b6a78b7SSimon Schubert       ASSERT (lehmer_vn + u0n <= ualloc);
5084b6a78b7SSimon Schubert 
5094b6a78b7SSimon Schubert       /* Now u0, u1, u are non-zero. We may still have v == 0 */
5104b6a78b7SSimon Schubert 
5114b6a78b7SSimon Schubert       /* Compute u u0 */
5124b6a78b7SSimon Schubert       if (lehmer_un <= u1n)
5134b6a78b7SSimon Schubert 	/* Should be the common case */
5144b6a78b7SSimon Schubert 	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
5154b6a78b7SSimon Schubert       else
5164b6a78b7SSimon Schubert 	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);
5174b6a78b7SSimon Schubert 
5184b6a78b7SSimon Schubert       un = u1n + lehmer_un;
5194b6a78b7SSimon Schubert       un -= (up[un - 1] == 0);
5204b6a78b7SSimon Schubert 
5214b6a78b7SSimon Schubert       if (lehmer_vn > 0)
5224b6a78b7SSimon Schubert 	{
5234b6a78b7SSimon Schubert 	  mp_limb_t cy;
5244b6a78b7SSimon Schubert 
5254b6a78b7SSimon Schubert 	  /* Overwrites old u1 value */
5264b6a78b7SSimon Schubert 	  if (lehmer_vn <= u0n)
5274b6a78b7SSimon Schubert 	    /* Should be the common case */
5284b6a78b7SSimon Schubert 	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
5294b6a78b7SSimon Schubert 	  else
5304b6a78b7SSimon Schubert 	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);
5314b6a78b7SSimon Schubert 
5324b6a78b7SSimon Schubert 	  u1n = u0n + lehmer_vn;
5334b6a78b7SSimon Schubert 	  u1n -= (u1[u1n - 1] == 0);
5344b6a78b7SSimon Schubert 
5354b6a78b7SSimon Schubert 	  if (u1n <= un)
5364b6a78b7SSimon Schubert 	    {
5374b6a78b7SSimon Schubert 	      cy = mpn_add (up, up, un, u1, u1n);
5384b6a78b7SSimon Schubert 	    }
5394b6a78b7SSimon Schubert 	  else
5404b6a78b7SSimon Schubert 	    {
5414b6a78b7SSimon Schubert 	      cy = mpn_add (up, u1, u1n, up, un);
5424b6a78b7SSimon Schubert 	      un = u1n;
5434b6a78b7SSimon Schubert 	    }
5444b6a78b7SSimon Schubert 	  up[un] = cy;
5454b6a78b7SSimon Schubert 	  un += (cy != 0);
5464b6a78b7SSimon Schubert 
5474b6a78b7SSimon Schubert 	  ASSERT (un < ualloc);
5484b6a78b7SSimon Schubert 	}
5494b6a78b7SSimon Schubert       *usizep = negate ? -un : un;
5504b6a78b7SSimon Schubert 
5514b6a78b7SSimon Schubert       TMP_FREE;
5524b6a78b7SSimon Schubert       return gn;
5534b6a78b7SSimon Schubert     }
5544b6a78b7SSimon Schubert }
555