14b6a78b7SSimon Schubert /* mpn_gcdext -- Extended Greatest Common Divisor.
24b6a78b7SSimon Schubert
38b5d8148SSascha Wildner Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
44b6a78b7SSimon Schubert Foundation, Inc.
54b6a78b7SSimon Schubert
64b6a78b7SSimon Schubert This file is part of the GNU MP Library.
74b6a78b7SSimon Schubert
84b6a78b7SSimon Schubert The GNU MP Library is free software; you can redistribute it and/or modify
94b6a78b7SSimon Schubert it under the terms of the GNU Lesser General Public License as published by
104b6a78b7SSimon Schubert the Free Software Foundation; either version 3 of the License, or (at your
114b6a78b7SSimon Schubert option) any later version.
124b6a78b7SSimon Schubert
134b6a78b7SSimon Schubert The GNU MP Library is distributed in the hope that it will be useful, but
144b6a78b7SSimon Schubert WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
154b6a78b7SSimon Schubert or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
164b6a78b7SSimon Schubert License for more details.
174b6a78b7SSimon Schubert
184b6a78b7SSimon Schubert You should have received a copy of the GNU Lesser General Public License
194b6a78b7SSimon Schubert along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
204b6a78b7SSimon Schubert
214b6a78b7SSimon Schubert #include "gmp.h"
224b6a78b7SSimon Schubert #include "gmp-impl.h"
234b6a78b7SSimon Schubert #include "longlong.h"
244b6a78b7SSimon Schubert
254b6a78b7SSimon Schubert /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
264b6a78b7SSimon Schubert the size is returned (if inputs are non-normalized, result may be
274b6a78b7SSimon Schubert non-normalized too). Temporary space needed is M->n + n.
284b6a78b7SSimon Schubert */
294b6a78b7SSimon Schubert static size_t
hgcd_mul_matrix_vector(struct hgcd_matrix * M,mp_ptr rp,mp_srcptr ap,mp_ptr bp,mp_size_t n,mp_ptr tp)304b6a78b7SSimon Schubert hgcd_mul_matrix_vector (struct hgcd_matrix *M,
314b6a78b7SSimon Schubert mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
324b6a78b7SSimon Schubert {
334b6a78b7SSimon Schubert mp_limb_t ah, bh;
344b6a78b7SSimon Schubert
354b6a78b7SSimon Schubert /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
364b6a78b7SSimon Schubert
374b6a78b7SSimon Schubert t = u00 * a
384b6a78b7SSimon Schubert r = u10 * b
394b6a78b7SSimon Schubert r += t;
404b6a78b7SSimon Schubert
414b6a78b7SSimon Schubert t = u11 * b
424b6a78b7SSimon Schubert b = u01 * a
434b6a78b7SSimon Schubert b += t;
444b6a78b7SSimon Schubert */
454b6a78b7SSimon Schubert
464b6a78b7SSimon Schubert if (M->n >= n)
474b6a78b7SSimon Schubert {
484b6a78b7SSimon Schubert mpn_mul (tp, M->p[0][0], M->n, ap, n);
494b6a78b7SSimon Schubert mpn_mul (rp, M->p[1][0], M->n, bp, n);
504b6a78b7SSimon Schubert }
514b6a78b7SSimon Schubert else
524b6a78b7SSimon Schubert {
534b6a78b7SSimon Schubert mpn_mul (tp, ap, n, M->p[0][0], M->n);
544b6a78b7SSimon Schubert mpn_mul (rp, bp, n, M->p[1][0], M->n);
554b6a78b7SSimon Schubert }
564b6a78b7SSimon Schubert
574b6a78b7SSimon Schubert ah = mpn_add_n (rp, rp, tp, n + M->n);
584b6a78b7SSimon Schubert
594b6a78b7SSimon Schubert if (M->n >= n)
604b6a78b7SSimon Schubert {
614b6a78b7SSimon Schubert mpn_mul (tp, M->p[1][1], M->n, bp, n);
624b6a78b7SSimon Schubert mpn_mul (bp, M->p[0][1], M->n, ap, n);
634b6a78b7SSimon Schubert }
644b6a78b7SSimon Schubert else
654b6a78b7SSimon Schubert {
664b6a78b7SSimon Schubert mpn_mul (tp, bp, n, M->p[1][1], M->n);
674b6a78b7SSimon Schubert mpn_mul (bp, ap, n, M->p[0][1], M->n);
684b6a78b7SSimon Schubert }
694b6a78b7SSimon Schubert bh = mpn_add_n (bp, bp, tp, n + M->n);
704b6a78b7SSimon Schubert
714b6a78b7SSimon Schubert n += M->n;
724b6a78b7SSimon Schubert if ( (ah | bh) > 0)
734b6a78b7SSimon Schubert {
744b6a78b7SSimon Schubert rp[n] = ah;
754b6a78b7SSimon Schubert bp[n] = bh;
764b6a78b7SSimon Schubert n++;
774b6a78b7SSimon Schubert }
784b6a78b7SSimon Schubert else
794b6a78b7SSimon Schubert {
804b6a78b7SSimon Schubert /* Normalize */
814b6a78b7SSimon Schubert while ( (rp[n-1] | bp[n-1]) == 0)
824b6a78b7SSimon Schubert n--;
834b6a78b7SSimon Schubert }
844b6a78b7SSimon Schubert
854b6a78b7SSimon Schubert return n;
864b6a78b7SSimon Schubert }
874b6a78b7SSimon Schubert
884b6a78b7SSimon Schubert #define COMPUTE_V_ITCH(n) (2*(n) + 1)
894b6a78b7SSimon Schubert
904b6a78b7SSimon Schubert /* Computes |v| = |(g - u a)| / b, where u may be positive or
914b6a78b7SSimon Schubert negative, and v is of the opposite sign. a, b are of size n, u and
924b6a78b7SSimon Schubert v at most size n, and v must have space for n+1 limbs. */
934b6a78b7SSimon Schubert static mp_size_t
compute_v(mp_ptr vp,mp_srcptr ap,mp_srcptr bp,mp_size_t n,mp_srcptr gp,mp_size_t gn,mp_srcptr up,mp_size_t usize,mp_ptr tp)944b6a78b7SSimon Schubert compute_v (mp_ptr vp,
954b6a78b7SSimon Schubert mp_srcptr ap, mp_srcptr bp, mp_size_t n,
964b6a78b7SSimon Schubert mp_srcptr gp, mp_size_t gn,
974b6a78b7SSimon Schubert mp_srcptr up, mp_size_t usize,
984b6a78b7SSimon Schubert mp_ptr tp)
994b6a78b7SSimon Schubert {
1004b6a78b7SSimon Schubert mp_size_t size;
1014b6a78b7SSimon Schubert mp_size_t an;
1024b6a78b7SSimon Schubert mp_size_t bn;
1034b6a78b7SSimon Schubert mp_size_t vn;
1044b6a78b7SSimon Schubert
1054b6a78b7SSimon Schubert ASSERT (n > 0);
1064b6a78b7SSimon Schubert ASSERT (gn > 0);
1074b6a78b7SSimon Schubert ASSERT (usize != 0);
1084b6a78b7SSimon Schubert
1094b6a78b7SSimon Schubert size = ABS (usize);
1104b6a78b7SSimon Schubert ASSERT (size <= n);
1114b6a78b7SSimon Schubert
1124b6a78b7SSimon Schubert an = n;
1134b6a78b7SSimon Schubert MPN_NORMALIZE (ap, an);
1144b6a78b7SSimon Schubert
1154b6a78b7SSimon Schubert if (an >= size)
1164b6a78b7SSimon Schubert mpn_mul (tp, ap, an, up, size);
1174b6a78b7SSimon Schubert else
1184b6a78b7SSimon Schubert mpn_mul (tp, up, size, ap, an);
1194b6a78b7SSimon Schubert
1204b6a78b7SSimon Schubert size += an;
12154028e53SJohn Marino size -= tp[size - 1] == 0;
1224b6a78b7SSimon Schubert
1234b6a78b7SSimon Schubert ASSERT (gn <= size);
1244b6a78b7SSimon Schubert
1254b6a78b7SSimon Schubert if (usize > 0)
1264b6a78b7SSimon Schubert {
1274b6a78b7SSimon Schubert /* |v| = -v = (u a - g) / b */
1284b6a78b7SSimon Schubert
1294b6a78b7SSimon Schubert ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
1304b6a78b7SSimon Schubert MPN_NORMALIZE (tp, size);
1314b6a78b7SSimon Schubert if (size == 0)
1324b6a78b7SSimon Schubert return 0;
1334b6a78b7SSimon Schubert }
1344b6a78b7SSimon Schubert else
1354b6a78b7SSimon Schubert { /* usize < 0 */
1364b6a78b7SSimon Schubert /* |v| = v = (c - u a) / b = (c + |u| a) / b */
1374b6a78b7SSimon Schubert mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
1384b6a78b7SSimon Schubert if (cy)
1394b6a78b7SSimon Schubert tp[size++] = cy;
1404b6a78b7SSimon Schubert }
1414b6a78b7SSimon Schubert
1424b6a78b7SSimon Schubert /* Now divide t / b. There must be no remainder */
1434b6a78b7SSimon Schubert bn = n;
1444b6a78b7SSimon Schubert MPN_NORMALIZE (bp, bn);
1454b6a78b7SSimon Schubert ASSERT (size >= bn);
1464b6a78b7SSimon Schubert
1474b6a78b7SSimon Schubert vn = size + 1 - bn;
1484b6a78b7SSimon Schubert ASSERT (vn <= n + 1);
1494b6a78b7SSimon Schubert
15054028e53SJohn Marino mpn_divexact (vp, tp, size, bp, bn);
1514b6a78b7SSimon Schubert vn -= (vp[vn-1] == 0);
1524b6a78b7SSimon Schubert
1534b6a78b7SSimon Schubert return vn;
1544b6a78b7SSimon Schubert }
1554b6a78b7SSimon Schubert
1564b6a78b7SSimon Schubert /* Temporary storage:
1574b6a78b7SSimon Schubert
1584b6a78b7SSimon Schubert Initial division: Quotient of at most an - n + 1 <= an limbs.
1594b6a78b7SSimon Schubert
1604b6a78b7SSimon Schubert Storage for u0 and u1: 2(n+1).
1614b6a78b7SSimon Schubert
1624b6a78b7SSimon Schubert Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
1634b6a78b7SSimon Schubert
1644b6a78b7SSimon Schubert Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
1654b6a78b7SSimon Schubert
1664b6a78b7SSimon Schubert When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
1674b6a78b7SSimon Schubert
1684b6a78b7SSimon Schubert When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
1694b6a78b7SSimon Schubert
1704b6a78b7SSimon Schubert For the lehmer call after the loop, Let T denote
1714b6a78b7SSimon Schubert GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
1724b6a78b7SSimon Schubert u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
17354028e53SJohn Marino for u, T+1 for v and 2T + 1 scratch space. In all, 7T + 3 is
17454028e53SJohn Marino sufficient for both operations.
1754b6a78b7SSimon Schubert
1764b6a78b7SSimon Schubert */
1774b6a78b7SSimon Schubert
1784b6a78b7SSimon Schubert /* Optimal choice of p seems difficult. In each iteration the division
1794b6a78b7SSimon Schubert * of work between hgcd and the updates of u0 and u1 depends on the
1804b6a78b7SSimon Schubert * current size of the u. It may be desirable to use a different
1814b6a78b7SSimon Schubert * choice of p in each iteration. Also the input size seems to matter;
1824b6a78b7SSimon Schubert * choosing p = n / 3 in the first iteration seems to improve
1834b6a78b7SSimon Schubert * performance slightly for input size just above the threshold, but
1844b6a78b7SSimon Schubert * degrade performance for larger inputs. */
1854b6a78b7SSimon Schubert #define CHOOSE_P_1(n) ((n) / 2)
1864b6a78b7SSimon Schubert #define CHOOSE_P_2(n) ((n) / 3)
1874b6a78b7SSimon Schubert
1884b6a78b7SSimon Schubert mp_size_t
mpn_gcdext(mp_ptr gp,mp_ptr up,mp_size_t * usizep,mp_ptr ap,mp_size_t an,mp_ptr bp,mp_size_t n)1894b6a78b7SSimon Schubert mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
1904b6a78b7SSimon Schubert mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
1914b6a78b7SSimon Schubert {
1924b6a78b7SSimon Schubert mp_size_t talloc;
1934b6a78b7SSimon Schubert mp_size_t scratch;
1944b6a78b7SSimon Schubert mp_size_t matrix_scratch;
1954b6a78b7SSimon Schubert mp_size_t ualloc = n + 1;
1964b6a78b7SSimon Schubert
1974b6a78b7SSimon Schubert mp_size_t un;
1984b6a78b7SSimon Schubert mp_ptr u0;
1994b6a78b7SSimon Schubert mp_ptr u1;
2004b6a78b7SSimon Schubert
2014b6a78b7SSimon Schubert mp_ptr tp;
2024b6a78b7SSimon Schubert
2034b6a78b7SSimon Schubert TMP_DECL;
2044b6a78b7SSimon Schubert
2054b6a78b7SSimon Schubert ASSERT (an >= n);
2064b6a78b7SSimon Schubert ASSERT (n > 0);
2074b6a78b7SSimon Schubert
2084b6a78b7SSimon Schubert TMP_MARK;
2094b6a78b7SSimon Schubert
2104b6a78b7SSimon Schubert /* FIXME: Check for small sizes first, before setting up temporary
2114b6a78b7SSimon Schubert storage etc. */
2124b6a78b7SSimon Schubert talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);
2134b6a78b7SSimon Schubert
2144b6a78b7SSimon Schubert /* For initial division */
2154b6a78b7SSimon Schubert scratch = an - n + 1;
2164b6a78b7SSimon Schubert if (scratch > talloc)
2174b6a78b7SSimon Schubert talloc = scratch;
2184b6a78b7SSimon Schubert
2194b6a78b7SSimon Schubert if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
2204b6a78b7SSimon Schubert {
2214b6a78b7SSimon Schubert /* For hgcd loop. */
2224b6a78b7SSimon Schubert mp_size_t hgcd_scratch;
2234b6a78b7SSimon Schubert mp_size_t update_scratch;
2244b6a78b7SSimon Schubert mp_size_t p1 = CHOOSE_P_1 (n);
2254b6a78b7SSimon Schubert mp_size_t p2 = CHOOSE_P_2 (n);
2264b6a78b7SSimon Schubert mp_size_t min_p = MIN(p1, p2);
2274b6a78b7SSimon Schubert mp_size_t max_p = MAX(p1, p2);
2284b6a78b7SSimon Schubert matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
2294b6a78b7SSimon Schubert hgcd_scratch = mpn_hgcd_itch (n - min_p);
2304b6a78b7SSimon Schubert update_scratch = max_p + n - 1;
2314b6a78b7SSimon Schubert
2324b6a78b7SSimon Schubert scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
2334b6a78b7SSimon Schubert if (scratch > talloc)
2344b6a78b7SSimon Schubert talloc = scratch;
2354b6a78b7SSimon Schubert
2364b6a78b7SSimon Schubert /* Final mpn_gcdext_lehmer_n call. Need space for u and for
2374b6a78b7SSimon Schubert copies of a and b. */
2384b6a78b7SSimon Schubert scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
2394b6a78b7SSimon Schubert + 3*GCDEXT_DC_THRESHOLD;
2404b6a78b7SSimon Schubert
2414b6a78b7SSimon Schubert if (scratch > talloc)
2424b6a78b7SSimon Schubert talloc = scratch;
2434b6a78b7SSimon Schubert
2444b6a78b7SSimon Schubert /* Cofactors u0 and u1 */
2454b6a78b7SSimon Schubert talloc += 2*(n+1);
2464b6a78b7SSimon Schubert }
2474b6a78b7SSimon Schubert
2484b6a78b7SSimon Schubert tp = TMP_ALLOC_LIMBS(talloc);
2494b6a78b7SSimon Schubert
2504b6a78b7SSimon Schubert if (an > n)
2514b6a78b7SSimon Schubert {
2524b6a78b7SSimon Schubert mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);
2534b6a78b7SSimon Schubert
2544b6a78b7SSimon Schubert if (mpn_zero_p (ap, n))
2554b6a78b7SSimon Schubert {
2564b6a78b7SSimon Schubert MPN_COPY (gp, bp, n);
2574b6a78b7SSimon Schubert *usizep = 0;
2584b6a78b7SSimon Schubert TMP_FREE;
2594b6a78b7SSimon Schubert return n;
2604b6a78b7SSimon Schubert }
2614b6a78b7SSimon Schubert }
2624b6a78b7SSimon Schubert
2634b6a78b7SSimon Schubert if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
2644b6a78b7SSimon Schubert {
2654b6a78b7SSimon Schubert mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);
2664b6a78b7SSimon Schubert
2674b6a78b7SSimon Schubert TMP_FREE;
2684b6a78b7SSimon Schubert return gn;
2694b6a78b7SSimon Schubert }
2704b6a78b7SSimon Schubert
2714b6a78b7SSimon Schubert MPN_ZERO (tp, 2*ualloc);
2724b6a78b7SSimon Schubert u0 = tp; tp += ualloc;
2734b6a78b7SSimon Schubert u1 = tp; tp += ualloc;
2744b6a78b7SSimon Schubert
2754b6a78b7SSimon Schubert {
2764b6a78b7SSimon Schubert /* For the first hgcd call, there are no u updates, and it makes
2774b6a78b7SSimon Schubert some sense to use a different choice for p. */
2784b6a78b7SSimon Schubert
2794b6a78b7SSimon Schubert /* FIXME: We could trim use of temporary storage, since u0 and u1
2804b6a78b7SSimon Schubert are not used yet. For the hgcd call, we could swap in the u0
2814b6a78b7SSimon Schubert and u1 pointers for the relevant matrix elements. */
2824b6a78b7SSimon Schubert
2834b6a78b7SSimon Schubert struct hgcd_matrix M;
2844b6a78b7SSimon Schubert mp_size_t p = CHOOSE_P_1 (n);
2854b6a78b7SSimon Schubert mp_size_t nn;
2864b6a78b7SSimon Schubert
2874b6a78b7SSimon Schubert mpn_hgcd_matrix_init (&M, n - p, tp);
2884b6a78b7SSimon Schubert nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
2894b6a78b7SSimon Schubert if (nn > 0)
2904b6a78b7SSimon Schubert {
2914b6a78b7SSimon Schubert ASSERT (M.n <= (n - p - 1)/2);
2924b6a78b7SSimon Schubert ASSERT (M.n + p <= (p + n - 1) / 2);
2934b6a78b7SSimon Schubert
2944b6a78b7SSimon Schubert /* Temporary storage 2 (p + M->n) <= p + n - 1 */
2954b6a78b7SSimon Schubert n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
2964b6a78b7SSimon Schubert
2974b6a78b7SSimon Schubert MPN_COPY (u0, M.p[1][0], M.n);
2984b6a78b7SSimon Schubert MPN_COPY (u1, M.p[1][1], M.n);
2994b6a78b7SSimon Schubert un = M.n;
3004b6a78b7SSimon Schubert while ( (u0[un-1] | u1[un-1] ) == 0)
3014b6a78b7SSimon Schubert un--;
3024b6a78b7SSimon Schubert }
3034b6a78b7SSimon Schubert else
3044b6a78b7SSimon Schubert {
3054b6a78b7SSimon Schubert /* mpn_hgcd has failed. Then either one of a or b is very
3064b6a78b7SSimon Schubert small, or the difference is very small. Perform one
3074b6a78b7SSimon Schubert subtraction followed by one division. */
3084b6a78b7SSimon Schubert mp_size_t gn;
3094b6a78b7SSimon Schubert mp_size_t updated_un = 1;
3104b6a78b7SSimon Schubert
3114b6a78b7SSimon Schubert u1[0] = 1;
3124b6a78b7SSimon Schubert
3134b6a78b7SSimon Schubert /* Temporary storage 2n + 1 */
3144b6a78b7SSimon Schubert n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
3154b6a78b7SSimon Schubert u0, u1, &updated_un, tp, tp + n);
3164b6a78b7SSimon Schubert if (n == 0)
3174b6a78b7SSimon Schubert {
3184b6a78b7SSimon Schubert TMP_FREE;
3194b6a78b7SSimon Schubert return gn;
3204b6a78b7SSimon Schubert }
3214b6a78b7SSimon Schubert
3224b6a78b7SSimon Schubert un = updated_un;
3234b6a78b7SSimon Schubert ASSERT (un < ualloc);
3244b6a78b7SSimon Schubert }
3254b6a78b7SSimon Schubert }
3264b6a78b7SSimon Schubert
3274b6a78b7SSimon Schubert while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
3284b6a78b7SSimon Schubert {
3294b6a78b7SSimon Schubert struct hgcd_matrix M;
3304b6a78b7SSimon Schubert mp_size_t p = CHOOSE_P_2 (n);
3314b6a78b7SSimon Schubert mp_size_t nn;
3324b6a78b7SSimon Schubert
3334b6a78b7SSimon Schubert mpn_hgcd_matrix_init (&M, n - p, tp);
3344b6a78b7SSimon Schubert nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
3354b6a78b7SSimon Schubert if (nn > 0)
3364b6a78b7SSimon Schubert {
3374b6a78b7SSimon Schubert mp_ptr t0;
3384b6a78b7SSimon Schubert
3394b6a78b7SSimon Schubert t0 = tp + matrix_scratch;
3404b6a78b7SSimon Schubert ASSERT (M.n <= (n - p - 1)/2);
3414b6a78b7SSimon Schubert ASSERT (M.n + p <= (p + n - 1) / 2);
3424b6a78b7SSimon Schubert
3434b6a78b7SSimon Schubert /* Temporary storage 2 (p + M->n) <= p + n - 1 */
3444b6a78b7SSimon Schubert n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);
3454b6a78b7SSimon Schubert
3464b6a78b7SSimon Schubert /* By the same analysis as for mpn_hgcd_matrix_mul */
3474b6a78b7SSimon Schubert ASSERT (M.n + un <= ualloc);
3484b6a78b7SSimon Schubert
3494b6a78b7SSimon Schubert /* FIXME: This copying could be avoided by some swapping of
3504b6a78b7SSimon Schubert * pointers. May need more temporary storage, though. */
3514b6a78b7SSimon Schubert MPN_COPY (t0, u0, un);
3524b6a78b7SSimon Schubert
3534b6a78b7SSimon Schubert /* Temporary storage ualloc */
3544b6a78b7SSimon Schubert un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);
3554b6a78b7SSimon Schubert
3564b6a78b7SSimon Schubert ASSERT (un < ualloc);
3574b6a78b7SSimon Schubert ASSERT ( (u0[un-1] | u1[un-1]) > 0);
3584b6a78b7SSimon Schubert }
3594b6a78b7SSimon Schubert else
3604b6a78b7SSimon Schubert {
3614b6a78b7SSimon Schubert /* mpn_hgcd has failed. Then either one of a or b is very
3624b6a78b7SSimon Schubert small, or the difference is very small. Perform one
3634b6a78b7SSimon Schubert subtraction followed by one division. */
3644b6a78b7SSimon Schubert mp_size_t gn;
3654b6a78b7SSimon Schubert mp_size_t updated_un = un;
3664b6a78b7SSimon Schubert
3674b6a78b7SSimon Schubert /* Temporary storage 2n + 1 */
3684b6a78b7SSimon Schubert n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
3694b6a78b7SSimon Schubert u0, u1, &updated_un, tp, tp + n);
3704b6a78b7SSimon Schubert if (n == 0)
3714b6a78b7SSimon Schubert {
3724b6a78b7SSimon Schubert TMP_FREE;
3734b6a78b7SSimon Schubert return gn;
3744b6a78b7SSimon Schubert }
3754b6a78b7SSimon Schubert
3764b6a78b7SSimon Schubert un = updated_un;
3774b6a78b7SSimon Schubert ASSERT (un < ualloc);
3784b6a78b7SSimon Schubert }
3794b6a78b7SSimon Schubert }
3804b6a78b7SSimon Schubert
3818b5d8148SSascha Wildner if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
3824b6a78b7SSimon Schubert {
3838b5d8148SSascha Wildner /* Must return the smallest cofactor, +u1 or -u0 */
3848b5d8148SSascha Wildner int c;
3858b5d8148SSascha Wildner
3868b5d8148SSascha Wildner MPN_COPY (gp, ap, n);
3878b5d8148SSascha Wildner
3888b5d8148SSascha Wildner MPN_CMP (c, u0, u1, un);
389*d2d4b659SJohn Marino /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
390*d2d4b659SJohn Marino this case we choose the cofactor + 1, corresponding to G = A
391*d2d4b659SJohn Marino - k B, rather than -1, corresponding to G = - A + (k+1) B. */
392*d2d4b659SJohn Marino ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
3938b5d8148SSascha Wildner if (c < 0)
3948b5d8148SSascha Wildner {
3958b5d8148SSascha Wildner MPN_NORMALIZE (u0, un);
3964b6a78b7SSimon Schubert MPN_COPY (up, u0, un);
3974b6a78b7SSimon Schubert *usizep = -un;
3984b6a78b7SSimon Schubert }
3998b5d8148SSascha Wildner else
4004b6a78b7SSimon Schubert {
4014b6a78b7SSimon Schubert MPN_NORMALIZE_NOT_ZERO (u1, un);
4024b6a78b7SSimon Schubert MPN_COPY (up, u1, un);
4034b6a78b7SSimon Schubert *usizep = un;
4048b5d8148SSascha Wildner }
4054b6a78b7SSimon Schubert
4064b6a78b7SSimon Schubert TMP_FREE;
4074b6a78b7SSimon Schubert return n;
4084b6a78b7SSimon Schubert }
4094b6a78b7SSimon Schubert else if (mpn_zero_p (u0, un))
4104b6a78b7SSimon Schubert {
4114b6a78b7SSimon Schubert mp_size_t gn;
4124b6a78b7SSimon Schubert ASSERT (un == 1);
4134b6a78b7SSimon Schubert ASSERT (u1[0] == 1);
4144b6a78b7SSimon Schubert
4154b6a78b7SSimon Schubert /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
4164b6a78b7SSimon Schubert gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);
4174b6a78b7SSimon Schubert
4184b6a78b7SSimon Schubert TMP_FREE;
4194b6a78b7SSimon Schubert return gn;
4204b6a78b7SSimon Schubert }
4214b6a78b7SSimon Schubert else
4224b6a78b7SSimon Schubert {
4234b6a78b7SSimon Schubert /* We have A = ... a + ... b
4244b6a78b7SSimon Schubert B = u0 a + u1 b
4254b6a78b7SSimon Schubert
4264b6a78b7SSimon Schubert a = u1 A + ... B
4274b6a78b7SSimon Schubert b = -u0 A + ... B
4284b6a78b7SSimon Schubert
4294b6a78b7SSimon Schubert with bounds
4304b6a78b7SSimon Schubert
4314b6a78b7SSimon Schubert |u0|, |u1| <= B / min(a, b)
4324b6a78b7SSimon Schubert
4334b6a78b7SSimon Schubert Compute g = u a + v b = (u u1 - v u0) A + (...) B
4344b6a78b7SSimon Schubert Here, u, v are bounded by
4354b6a78b7SSimon Schubert
4364b6a78b7SSimon Schubert |u| <= b,
4374b6a78b7SSimon Schubert |v| <= a
4384b6a78b7SSimon Schubert */
4394b6a78b7SSimon Schubert
4404b6a78b7SSimon Schubert mp_size_t u0n;
4414b6a78b7SSimon Schubert mp_size_t u1n;
4424b6a78b7SSimon Schubert mp_size_t lehmer_un;
4434b6a78b7SSimon Schubert mp_size_t lehmer_vn;
4444b6a78b7SSimon Schubert mp_size_t gn;
4454b6a78b7SSimon Schubert
4464b6a78b7SSimon Schubert mp_ptr lehmer_up;
4474b6a78b7SSimon Schubert mp_ptr lehmer_vp;
4484b6a78b7SSimon Schubert int negate;
4494b6a78b7SSimon Schubert
4504b6a78b7SSimon Schubert lehmer_up = tp; tp += n;
4514b6a78b7SSimon Schubert
4524b6a78b7SSimon Schubert /* Call mpn_gcdext_lehmer_n with copies of a and b. */
4534b6a78b7SSimon Schubert MPN_COPY (tp, ap, n);
4544b6a78b7SSimon Schubert MPN_COPY (tp + n, bp, n);
4554b6a78b7SSimon Schubert gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
4564b6a78b7SSimon Schubert
4574b6a78b7SSimon Schubert u0n = un;
4584b6a78b7SSimon Schubert MPN_NORMALIZE (u0, u0n);
4594b6a78b7SSimon Schubert if (lehmer_un == 0)
4604b6a78b7SSimon Schubert {
4614b6a78b7SSimon Schubert /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
4624b6a78b7SSimon Schubert MPN_COPY (up, u0, u0n);
4634b6a78b7SSimon Schubert *usizep = -u0n;
4644b6a78b7SSimon Schubert
4654b6a78b7SSimon Schubert TMP_FREE;
4664b6a78b7SSimon Schubert return gn;
4674b6a78b7SSimon Schubert }
4684b6a78b7SSimon Schubert
4694b6a78b7SSimon Schubert lehmer_vp = tp;
4704b6a78b7SSimon Schubert /* Compute v = (g - u a) / b */
4714b6a78b7SSimon Schubert lehmer_vn = compute_v (lehmer_vp,
4724b6a78b7SSimon Schubert ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);
4734b6a78b7SSimon Schubert
4744b6a78b7SSimon Schubert if (lehmer_un > 0)
4754b6a78b7SSimon Schubert negate = 0;
4764b6a78b7SSimon Schubert else
4774b6a78b7SSimon Schubert {
4784b6a78b7SSimon Schubert lehmer_un = -lehmer_un;
4794b6a78b7SSimon Schubert negate = 1;
4804b6a78b7SSimon Schubert }
4814b6a78b7SSimon Schubert
4824b6a78b7SSimon Schubert u1n = un;
4834b6a78b7SSimon Schubert MPN_NORMALIZE (u1, u1n);
4844b6a78b7SSimon Schubert
4854b6a78b7SSimon Schubert /* It's possible that u0 = 1, u1 = 0 */
4864b6a78b7SSimon Schubert if (u1n == 0)
4874b6a78b7SSimon Schubert {
4884b6a78b7SSimon Schubert ASSERT (un == 1);
4894b6a78b7SSimon Schubert ASSERT (u0[0] == 1);
4904b6a78b7SSimon Schubert
4914b6a78b7SSimon Schubert /* u1 == 0 ==> u u1 + v u0 = v */
4924b6a78b7SSimon Schubert MPN_COPY (up, lehmer_vp, lehmer_vn);
4934b6a78b7SSimon Schubert *usizep = negate ? lehmer_vn : - lehmer_vn;
4944b6a78b7SSimon Schubert
4954b6a78b7SSimon Schubert TMP_FREE;
4964b6a78b7SSimon Schubert return gn;
4974b6a78b7SSimon Schubert }
4984b6a78b7SSimon Schubert
4994b6a78b7SSimon Schubert ASSERT (lehmer_un + u1n <= ualloc);
5004b6a78b7SSimon Schubert ASSERT (lehmer_vn + u0n <= ualloc);
5014b6a78b7SSimon Schubert
5024b6a78b7SSimon Schubert /* Now u0, u1, u are non-zero. We may still have v == 0 */
5034b6a78b7SSimon Schubert
5044b6a78b7SSimon Schubert /* Compute u u0 */
5054b6a78b7SSimon Schubert if (lehmer_un <= u1n)
5064b6a78b7SSimon Schubert /* Should be the common case */
5074b6a78b7SSimon Schubert mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
5084b6a78b7SSimon Schubert else
5094b6a78b7SSimon Schubert mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);
5104b6a78b7SSimon Schubert
5114b6a78b7SSimon Schubert un = u1n + lehmer_un;
5124b6a78b7SSimon Schubert un -= (up[un - 1] == 0);
5134b6a78b7SSimon Schubert
5144b6a78b7SSimon Schubert if (lehmer_vn > 0)
5154b6a78b7SSimon Schubert {
5164b6a78b7SSimon Schubert mp_limb_t cy;
5174b6a78b7SSimon Schubert
5184b6a78b7SSimon Schubert /* Overwrites old u1 value */
5194b6a78b7SSimon Schubert if (lehmer_vn <= u0n)
5204b6a78b7SSimon Schubert /* Should be the common case */
5214b6a78b7SSimon Schubert mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
5224b6a78b7SSimon Schubert else
5234b6a78b7SSimon Schubert mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);
5244b6a78b7SSimon Schubert
5254b6a78b7SSimon Schubert u1n = u0n + lehmer_vn;
5264b6a78b7SSimon Schubert u1n -= (u1[u1n - 1] == 0);
5274b6a78b7SSimon Schubert
5284b6a78b7SSimon Schubert if (u1n <= un)
5294b6a78b7SSimon Schubert {
5304b6a78b7SSimon Schubert cy = mpn_add (up, up, un, u1, u1n);
5314b6a78b7SSimon Schubert }
5324b6a78b7SSimon Schubert else
5334b6a78b7SSimon Schubert {
5344b6a78b7SSimon Schubert cy = mpn_add (up, u1, u1n, up, un);
5354b6a78b7SSimon Schubert un = u1n;
5364b6a78b7SSimon Schubert }
5374b6a78b7SSimon Schubert up[un] = cy;
5384b6a78b7SSimon Schubert un += (cy != 0);
5394b6a78b7SSimon Schubert
5404b6a78b7SSimon Schubert ASSERT (un < ualloc);
5414b6a78b7SSimon Schubert }
5424b6a78b7SSimon Schubert *usizep = negate ? -un : un;
5434b6a78b7SSimon Schubert
5444b6a78b7SSimon Schubert TMP_FREE;
5454b6a78b7SSimon Schubert return gn;
5464b6a78b7SSimon Schubert }
5474b6a78b7SSimon Schubert }
548