14b6a78b7SSimon Schubert /* mpn_gcdext -- Extended Greatest Common Divisor. 24b6a78b7SSimon Schubert 3*8b5d8148SSascha Wildner Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software 44b6a78b7SSimon Schubert Foundation, Inc. 54b6a78b7SSimon Schubert 64b6a78b7SSimon Schubert This file is part of the GNU MP Library. 74b6a78b7SSimon Schubert 84b6a78b7SSimon Schubert The GNU MP Library is free software; you can redistribute it and/or modify 94b6a78b7SSimon Schubert it under the terms of the GNU Lesser General Public License as published by 104b6a78b7SSimon Schubert the Free Software Foundation; either version 3 of the License, or (at your 114b6a78b7SSimon Schubert option) any later version. 124b6a78b7SSimon Schubert 134b6a78b7SSimon Schubert The GNU MP Library is distributed in the hope that it will be useful, but 144b6a78b7SSimon Schubert WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 154b6a78b7SSimon Schubert or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 164b6a78b7SSimon Schubert License for more details. 174b6a78b7SSimon Schubert 184b6a78b7SSimon Schubert You should have received a copy of the GNU Lesser General Public License 194b6a78b7SSimon Schubert along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 204b6a78b7SSimon Schubert 214b6a78b7SSimon Schubert #include "gmp.h" 224b6a78b7SSimon Schubert #include "gmp-impl.h" 234b6a78b7SSimon Schubert #include "longlong.h" 244b6a78b7SSimon Schubert 254b6a78b7SSimon Schubert /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and 264b6a78b7SSimon Schubert the size is returned (if inputs are non-normalized, result may be 274b6a78b7SSimon Schubert non-normalized too). Temporary space needed is M->n + n. 284b6a78b7SSimon Schubert */ 294b6a78b7SSimon Schubert static size_t 304b6a78b7SSimon Schubert hgcd_mul_matrix_vector (struct hgcd_matrix *M, 314b6a78b7SSimon Schubert mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp) 324b6a78b7SSimon Schubert { 334b6a78b7SSimon Schubert mp_limb_t ah, bh; 344b6a78b7SSimon Schubert 354b6a78b7SSimon Schubert /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as 364b6a78b7SSimon Schubert 374b6a78b7SSimon Schubert t = u00 * a 384b6a78b7SSimon Schubert r = u10 * b 394b6a78b7SSimon Schubert r += t; 404b6a78b7SSimon Schubert 414b6a78b7SSimon Schubert t = u11 * b 424b6a78b7SSimon Schubert b = u01 * a 434b6a78b7SSimon Schubert b += t; 444b6a78b7SSimon Schubert */ 454b6a78b7SSimon Schubert 464b6a78b7SSimon Schubert if (M->n >= n) 474b6a78b7SSimon Schubert { 484b6a78b7SSimon Schubert mpn_mul (tp, M->p[0][0], M->n, ap, n); 494b6a78b7SSimon Schubert mpn_mul (rp, M->p[1][0], M->n, bp, n); 504b6a78b7SSimon Schubert } 514b6a78b7SSimon Schubert else 524b6a78b7SSimon Schubert { 534b6a78b7SSimon Schubert mpn_mul (tp, ap, n, M->p[0][0], M->n); 544b6a78b7SSimon Schubert mpn_mul (rp, bp, n, M->p[1][0], M->n); 554b6a78b7SSimon Schubert } 564b6a78b7SSimon Schubert 574b6a78b7SSimon Schubert ah = mpn_add_n (rp, rp, tp, n + M->n); 584b6a78b7SSimon Schubert 594b6a78b7SSimon Schubert if (M->n >= n) 604b6a78b7SSimon Schubert { 614b6a78b7SSimon Schubert mpn_mul (tp, M->p[1][1], M->n, bp, n); 624b6a78b7SSimon Schubert mpn_mul (bp, M->p[0][1], M->n, ap, n); 634b6a78b7SSimon Schubert } 644b6a78b7SSimon Schubert else 654b6a78b7SSimon Schubert { 664b6a78b7SSimon Schubert mpn_mul (tp, bp, n, M->p[1][1], M->n); 674b6a78b7SSimon Schubert mpn_mul (bp, ap, n, M->p[0][1], M->n); 684b6a78b7SSimon Schubert } 694b6a78b7SSimon Schubert bh = mpn_add_n (bp, bp, tp, n + M->n); 704b6a78b7SSimon Schubert 714b6a78b7SSimon Schubert n += M->n; 724b6a78b7SSimon Schubert if ( (ah | bh) > 0) 734b6a78b7SSimon Schubert { 744b6a78b7SSimon Schubert rp[n] = ah; 754b6a78b7SSimon Schubert bp[n] = bh; 764b6a78b7SSimon Schubert n++; 774b6a78b7SSimon Schubert } 784b6a78b7SSimon Schubert else 794b6a78b7SSimon Schubert { 804b6a78b7SSimon Schubert /* Normalize */ 814b6a78b7SSimon Schubert while ( (rp[n-1] | bp[n-1]) == 0) 824b6a78b7SSimon Schubert n--; 834b6a78b7SSimon Schubert } 844b6a78b7SSimon Schubert 854b6a78b7SSimon Schubert return n; 864b6a78b7SSimon Schubert } 874b6a78b7SSimon Schubert 884b6a78b7SSimon Schubert #define COMPUTE_V_ITCH(n) (2*(n) + 1) 894b6a78b7SSimon Schubert 904b6a78b7SSimon Schubert /* Computes |v| = |(g - u a)| / b, where u may be positive or 914b6a78b7SSimon Schubert negative, and v is of the opposite sign. a, b are of size n, u and 924b6a78b7SSimon Schubert v at most size n, and v must have space for n+1 limbs. */ 934b6a78b7SSimon Schubert static mp_size_t 944b6a78b7SSimon Schubert compute_v (mp_ptr vp, 954b6a78b7SSimon Schubert mp_srcptr ap, mp_srcptr bp, mp_size_t n, 964b6a78b7SSimon Schubert mp_srcptr gp, mp_size_t gn, 974b6a78b7SSimon Schubert mp_srcptr up, mp_size_t usize, 984b6a78b7SSimon Schubert mp_ptr tp) 994b6a78b7SSimon Schubert { 1004b6a78b7SSimon Schubert mp_size_t size; 1014b6a78b7SSimon Schubert mp_size_t an; 1024b6a78b7SSimon Schubert mp_size_t bn; 1034b6a78b7SSimon Schubert mp_size_t vn; 1044b6a78b7SSimon Schubert 1054b6a78b7SSimon Schubert ASSERT (n > 0); 1064b6a78b7SSimon Schubert ASSERT (gn > 0); 1074b6a78b7SSimon Schubert ASSERT (usize != 0); 1084b6a78b7SSimon Schubert 1094b6a78b7SSimon Schubert size = ABS (usize); 1104b6a78b7SSimon Schubert ASSERT (size <= n); 1114b6a78b7SSimon Schubert 1124b6a78b7SSimon Schubert an = n; 1134b6a78b7SSimon Schubert MPN_NORMALIZE (ap, an); 1144b6a78b7SSimon Schubert 1154b6a78b7SSimon Schubert if (an >= size) 1164b6a78b7SSimon Schubert mpn_mul (tp, ap, an, up, size); 1174b6a78b7SSimon Schubert else 1184b6a78b7SSimon Schubert mpn_mul (tp, up, size, ap, an); 1194b6a78b7SSimon Schubert 1204b6a78b7SSimon Schubert size += an; 1214b6a78b7SSimon Schubert 1224b6a78b7SSimon Schubert ASSERT (gn <= size); 1234b6a78b7SSimon Schubert 1244b6a78b7SSimon Schubert if (usize > 0) 1254b6a78b7SSimon Schubert { 1264b6a78b7SSimon Schubert /* |v| = -v = (u a - g) / b */ 1274b6a78b7SSimon Schubert 1284b6a78b7SSimon Schubert ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn)); 1294b6a78b7SSimon Schubert MPN_NORMALIZE (tp, size); 1304b6a78b7SSimon Schubert if (size == 0) 1314b6a78b7SSimon Schubert return 0; 1324b6a78b7SSimon Schubert } 1334b6a78b7SSimon Schubert else 1344b6a78b7SSimon Schubert { /* usize < 0 */ 1354b6a78b7SSimon Schubert /* |v| = v = (c - u a) / b = (c + |u| a) / b */ 1364b6a78b7SSimon Schubert mp_limb_t cy = mpn_add (tp, tp, size, gp, gn); 1374b6a78b7SSimon Schubert if (cy) 1384b6a78b7SSimon Schubert tp[size++] = cy; 1394b6a78b7SSimon Schubert } 1404b6a78b7SSimon Schubert 1414b6a78b7SSimon Schubert /* Now divide t / b. There must be no remainder */ 1424b6a78b7SSimon Schubert bn = n; 1434b6a78b7SSimon Schubert MPN_NORMALIZE (bp, bn); 1444b6a78b7SSimon Schubert ASSERT (size >= bn); 1454b6a78b7SSimon Schubert 1464b6a78b7SSimon Schubert vn = size + 1 - bn; 1474b6a78b7SSimon Schubert ASSERT (vn <= n + 1); 1484b6a78b7SSimon Schubert 1494b6a78b7SSimon Schubert /* FIXME: Use divexact. Or do the entire calculation mod 2^{n * 1504b6a78b7SSimon Schubert GMP_NUMB_BITS}. */ 1514b6a78b7SSimon Schubert mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn); 1524b6a78b7SSimon Schubert vn -= (vp[vn-1] == 0); 1534b6a78b7SSimon Schubert 1544b6a78b7SSimon Schubert /* Remainder must be zero */ 1554b6a78b7SSimon Schubert #if WANT_ASSERT 1564b6a78b7SSimon Schubert { 1574b6a78b7SSimon Schubert mp_size_t i; 1584b6a78b7SSimon Schubert for (i = 0; i < bn; i++) 1594b6a78b7SSimon Schubert { 1604b6a78b7SSimon Schubert ASSERT (tp[i] == 0); 1614b6a78b7SSimon Schubert } 1624b6a78b7SSimon Schubert } 1634b6a78b7SSimon Schubert #endif 1644b6a78b7SSimon Schubert return vn; 1654b6a78b7SSimon Schubert } 1664b6a78b7SSimon Schubert 1674b6a78b7SSimon Schubert /* Temporary storage: 1684b6a78b7SSimon Schubert 1694b6a78b7SSimon Schubert Initial division: Quotient of at most an - n + 1 <= an limbs. 1704b6a78b7SSimon Schubert 1714b6a78b7SSimon Schubert Storage for u0 and u1: 2(n+1). 1724b6a78b7SSimon Schubert 1734b6a78b7SSimon Schubert Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4) 1744b6a78b7SSimon Schubert 1754b6a78b7SSimon Schubert Storage for hgcd, input (n + 1)/2: 9 n/4 plus some. 1764b6a78b7SSimon Schubert 1774b6a78b7SSimon Schubert When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors. 1784b6a78b7SSimon Schubert 1794b6a78b7SSimon Schubert When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less. 1804b6a78b7SSimon Schubert 1814b6a78b7SSimon Schubert For the lehmer call after the loop, Let T denote 1824b6a78b7SSimon Schubert GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for 1834b6a78b7SSimon Schubert u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T 1844b6a78b7SSimon Schubert + 1 for v and 2T + 1 scratch space. In all, 7T + 3 is sufficient. 1854b6a78b7SSimon Schubert 1864b6a78b7SSimon Schubert */ 1874b6a78b7SSimon Schubert 1884b6a78b7SSimon Schubert /* Optimal choice of p seems difficult. In each iteration the division 1894b6a78b7SSimon Schubert * of work between hgcd and the updates of u0 and u1 depends on the 1904b6a78b7SSimon Schubert * current size of the u. It may be desirable to use a different 1914b6a78b7SSimon Schubert * choice of p in each iteration. Also the input size seems to matter; 1924b6a78b7SSimon Schubert * choosing p = n / 3 in the first iteration seems to improve 1934b6a78b7SSimon Schubert * performance slightly for input size just above the threshold, but 1944b6a78b7SSimon Schubert * degrade performance for larger inputs. */ 1954b6a78b7SSimon Schubert #define CHOOSE_P_1(n) ((n) / 2) 1964b6a78b7SSimon Schubert #define CHOOSE_P_2(n) ((n) / 3) 1974b6a78b7SSimon Schubert 1984b6a78b7SSimon Schubert mp_size_t 1994b6a78b7SSimon Schubert mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, 2004b6a78b7SSimon Schubert mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n) 2014b6a78b7SSimon Schubert { 2024b6a78b7SSimon Schubert mp_size_t talloc; 2034b6a78b7SSimon Schubert mp_size_t scratch; 2044b6a78b7SSimon Schubert mp_size_t matrix_scratch; 2054b6a78b7SSimon Schubert mp_size_t ualloc = n + 1; 2064b6a78b7SSimon Schubert 2074b6a78b7SSimon Schubert mp_size_t un; 2084b6a78b7SSimon Schubert mp_ptr u0; 2094b6a78b7SSimon Schubert mp_ptr u1; 2104b6a78b7SSimon Schubert 2114b6a78b7SSimon Schubert mp_ptr tp; 2124b6a78b7SSimon Schubert 2134b6a78b7SSimon Schubert TMP_DECL; 2144b6a78b7SSimon Schubert 2154b6a78b7SSimon Schubert ASSERT (an >= n); 2164b6a78b7SSimon Schubert ASSERT (n > 0); 2174b6a78b7SSimon Schubert 2184b6a78b7SSimon Schubert TMP_MARK; 2194b6a78b7SSimon Schubert 2204b6a78b7SSimon Schubert /* FIXME: Check for small sizes first, before setting up temporary 2214b6a78b7SSimon Schubert storage etc. */ 2224b6a78b7SSimon Schubert talloc = MPN_GCDEXT_LEHMER_N_ITCH(n); 2234b6a78b7SSimon Schubert 2244b6a78b7SSimon Schubert /* For initial division */ 2254b6a78b7SSimon Schubert scratch = an - n + 1; 2264b6a78b7SSimon Schubert if (scratch > talloc) 2274b6a78b7SSimon Schubert talloc = scratch; 2284b6a78b7SSimon Schubert 2294b6a78b7SSimon Schubert if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) 2304b6a78b7SSimon Schubert { 2314b6a78b7SSimon Schubert /* For hgcd loop. */ 2324b6a78b7SSimon Schubert mp_size_t hgcd_scratch; 2334b6a78b7SSimon Schubert mp_size_t update_scratch; 2344b6a78b7SSimon Schubert mp_size_t p1 = CHOOSE_P_1 (n); 2354b6a78b7SSimon Schubert mp_size_t p2 = CHOOSE_P_2 (n); 2364b6a78b7SSimon Schubert mp_size_t min_p = MIN(p1, p2); 2374b6a78b7SSimon Schubert mp_size_t max_p = MAX(p1, p2); 2384b6a78b7SSimon Schubert matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p); 2394b6a78b7SSimon Schubert hgcd_scratch = mpn_hgcd_itch (n - min_p); 2404b6a78b7SSimon Schubert update_scratch = max_p + n - 1; 2414b6a78b7SSimon Schubert 2424b6a78b7SSimon Schubert scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); 2434b6a78b7SSimon Schubert if (scratch > talloc) 2444b6a78b7SSimon Schubert talloc = scratch; 2454b6a78b7SSimon Schubert 2464b6a78b7SSimon Schubert /* Final mpn_gcdext_lehmer_n call. Need space for u and for 2474b6a78b7SSimon Schubert copies of a and b. */ 2484b6a78b7SSimon Schubert scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD) 2494b6a78b7SSimon Schubert + 3*GCDEXT_DC_THRESHOLD; 2504b6a78b7SSimon Schubert 2514b6a78b7SSimon Schubert if (scratch > talloc) 2524b6a78b7SSimon Schubert talloc = scratch; 2534b6a78b7SSimon Schubert 2544b6a78b7SSimon Schubert /* Cofactors u0 and u1 */ 2554b6a78b7SSimon Schubert talloc += 2*(n+1); 2564b6a78b7SSimon Schubert } 2574b6a78b7SSimon Schubert 2584b6a78b7SSimon Schubert tp = TMP_ALLOC_LIMBS(talloc); 2594b6a78b7SSimon Schubert 2604b6a78b7SSimon Schubert if (an > n) 2614b6a78b7SSimon Schubert { 2624b6a78b7SSimon Schubert mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n); 2634b6a78b7SSimon Schubert 2644b6a78b7SSimon Schubert if (mpn_zero_p (ap, n)) 2654b6a78b7SSimon Schubert { 2664b6a78b7SSimon Schubert MPN_COPY (gp, bp, n); 2674b6a78b7SSimon Schubert *usizep = 0; 2684b6a78b7SSimon Schubert TMP_FREE; 2694b6a78b7SSimon Schubert return n; 2704b6a78b7SSimon Schubert } 2714b6a78b7SSimon Schubert } 2724b6a78b7SSimon Schubert 2734b6a78b7SSimon Schubert if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) 2744b6a78b7SSimon Schubert { 2754b6a78b7SSimon Schubert mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp); 2764b6a78b7SSimon Schubert 2774b6a78b7SSimon Schubert TMP_FREE; 2784b6a78b7SSimon Schubert return gn; 2794b6a78b7SSimon Schubert } 2804b6a78b7SSimon Schubert 2814b6a78b7SSimon Schubert MPN_ZERO (tp, 2*ualloc); 2824b6a78b7SSimon Schubert u0 = tp; tp += ualloc; 2834b6a78b7SSimon Schubert u1 = tp; tp += ualloc; 2844b6a78b7SSimon Schubert 2854b6a78b7SSimon Schubert { 2864b6a78b7SSimon Schubert /* For the first hgcd call, there are no u updates, and it makes 2874b6a78b7SSimon Schubert some sense to use a different choice for p. */ 2884b6a78b7SSimon Schubert 2894b6a78b7SSimon Schubert /* FIXME: We could trim use of temporary storage, since u0 and u1 2904b6a78b7SSimon Schubert are not used yet. For the hgcd call, we could swap in the u0 2914b6a78b7SSimon Schubert and u1 pointers for the relevant matrix elements. */ 2924b6a78b7SSimon Schubert 2934b6a78b7SSimon Schubert struct hgcd_matrix M; 2944b6a78b7SSimon Schubert mp_size_t p = CHOOSE_P_1 (n); 2954b6a78b7SSimon Schubert mp_size_t nn; 2964b6a78b7SSimon Schubert 2974b6a78b7SSimon Schubert mpn_hgcd_matrix_init (&M, n - p, tp); 2984b6a78b7SSimon Schubert nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); 2994b6a78b7SSimon Schubert if (nn > 0) 3004b6a78b7SSimon Schubert { 3014b6a78b7SSimon Schubert ASSERT (M.n <= (n - p - 1)/2); 3024b6a78b7SSimon Schubert ASSERT (M.n + p <= (p + n - 1) / 2); 3034b6a78b7SSimon Schubert 3044b6a78b7SSimon Schubert /* Temporary storage 2 (p + M->n) <= p + n - 1 */ 3054b6a78b7SSimon Schubert n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch); 3064b6a78b7SSimon Schubert 3074b6a78b7SSimon Schubert MPN_COPY (u0, M.p[1][0], M.n); 3084b6a78b7SSimon Schubert MPN_COPY (u1, M.p[1][1], M.n); 3094b6a78b7SSimon Schubert un = M.n; 3104b6a78b7SSimon Schubert while ( (u0[un-1] | u1[un-1] ) == 0) 3114b6a78b7SSimon Schubert un--; 3124b6a78b7SSimon Schubert } 3134b6a78b7SSimon Schubert else 3144b6a78b7SSimon Schubert { 3154b6a78b7SSimon Schubert /* mpn_hgcd has failed. Then either one of a or b is very 3164b6a78b7SSimon Schubert small, or the difference is very small. Perform one 3174b6a78b7SSimon Schubert subtraction followed by one division. */ 3184b6a78b7SSimon Schubert mp_size_t gn; 3194b6a78b7SSimon Schubert mp_size_t updated_un = 1; 3204b6a78b7SSimon Schubert 3214b6a78b7SSimon Schubert u1[0] = 1; 3224b6a78b7SSimon Schubert 3234b6a78b7SSimon Schubert /* Temporary storage 2n + 1 */ 3244b6a78b7SSimon Schubert n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n, 3254b6a78b7SSimon Schubert u0, u1, &updated_un, tp, tp + n); 3264b6a78b7SSimon Schubert if (n == 0) 3274b6a78b7SSimon Schubert { 3284b6a78b7SSimon Schubert TMP_FREE; 3294b6a78b7SSimon Schubert return gn; 3304b6a78b7SSimon Schubert } 3314b6a78b7SSimon Schubert 3324b6a78b7SSimon Schubert un = updated_un; 3334b6a78b7SSimon Schubert ASSERT (un < ualloc); 3344b6a78b7SSimon Schubert } 3354b6a78b7SSimon Schubert } 3364b6a78b7SSimon Schubert 3374b6a78b7SSimon Schubert while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) 3384b6a78b7SSimon Schubert { 3394b6a78b7SSimon Schubert struct hgcd_matrix M; 3404b6a78b7SSimon Schubert mp_size_t p = CHOOSE_P_2 (n); 3414b6a78b7SSimon Schubert mp_size_t nn; 3424b6a78b7SSimon Schubert 3434b6a78b7SSimon Schubert mpn_hgcd_matrix_init (&M, n - p, tp); 3444b6a78b7SSimon Schubert nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); 3454b6a78b7SSimon Schubert if (nn > 0) 3464b6a78b7SSimon Schubert { 3474b6a78b7SSimon Schubert mp_ptr t0; 3484b6a78b7SSimon Schubert 3494b6a78b7SSimon Schubert t0 = tp + matrix_scratch; 3504b6a78b7SSimon Schubert ASSERT (M.n <= (n - p - 1)/2); 3514b6a78b7SSimon Schubert ASSERT (M.n + p <= (p + n - 1) / 2); 3524b6a78b7SSimon Schubert 3534b6a78b7SSimon Schubert /* Temporary storage 2 (p + M->n) <= p + n - 1 */ 3544b6a78b7SSimon Schubert n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0); 3554b6a78b7SSimon Schubert 3564b6a78b7SSimon Schubert /* By the same analysis as for mpn_hgcd_matrix_mul */ 3574b6a78b7SSimon Schubert ASSERT (M.n + un <= ualloc); 3584b6a78b7SSimon Schubert 3594b6a78b7SSimon Schubert /* FIXME: This copying could be avoided by some swapping of 3604b6a78b7SSimon Schubert * pointers. May need more temporary storage, though. */ 3614b6a78b7SSimon Schubert MPN_COPY (t0, u0, un); 3624b6a78b7SSimon Schubert 3634b6a78b7SSimon Schubert /* Temporary storage ualloc */ 3644b6a78b7SSimon Schubert un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un); 3654b6a78b7SSimon Schubert 3664b6a78b7SSimon Schubert ASSERT (un < ualloc); 3674b6a78b7SSimon Schubert ASSERT ( (u0[un-1] | u1[un-1]) > 0); 3684b6a78b7SSimon Schubert } 3694b6a78b7SSimon Schubert else 3704b6a78b7SSimon Schubert { 3714b6a78b7SSimon Schubert /* mpn_hgcd has failed. Then either one of a or b is very 3724b6a78b7SSimon Schubert small, or the difference is very small. Perform one 3734b6a78b7SSimon Schubert subtraction followed by one division. */ 3744b6a78b7SSimon Schubert mp_size_t gn; 3754b6a78b7SSimon Schubert mp_size_t updated_un = un; 3764b6a78b7SSimon Schubert 3774b6a78b7SSimon Schubert /* Temporary storage 2n + 1 */ 3784b6a78b7SSimon Schubert n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n, 3794b6a78b7SSimon Schubert u0, u1, &updated_un, tp, tp + n); 3804b6a78b7SSimon Schubert if (n == 0) 3814b6a78b7SSimon Schubert { 3824b6a78b7SSimon Schubert TMP_FREE; 3834b6a78b7SSimon Schubert return gn; 3844b6a78b7SSimon Schubert } 3854b6a78b7SSimon Schubert 3864b6a78b7SSimon Schubert un = updated_un; 3874b6a78b7SSimon Schubert ASSERT (un < ualloc); 3884b6a78b7SSimon Schubert } 3894b6a78b7SSimon Schubert } 3904b6a78b7SSimon Schubert 391*8b5d8148SSascha Wildner if (UNLIKELY (mpn_cmp (ap, bp, n) == 0)) 3924b6a78b7SSimon Schubert { 393*8b5d8148SSascha Wildner /* Must return the smallest cofactor, +u1 or -u0 */ 394*8b5d8148SSascha Wildner int c; 395*8b5d8148SSascha Wildner 396*8b5d8148SSascha Wildner MPN_COPY (gp, ap, n); 397*8b5d8148SSascha Wildner 398*8b5d8148SSascha Wildner MPN_CMP (c, u0, u1, un); 399*8b5d8148SSascha Wildner ASSERT (c != 0); 400*8b5d8148SSascha Wildner if (c < 0) 401*8b5d8148SSascha Wildner { 402*8b5d8148SSascha Wildner MPN_NORMALIZE (u0, un); 4034b6a78b7SSimon Schubert MPN_COPY (up, u0, un); 4044b6a78b7SSimon Schubert *usizep = -un; 4054b6a78b7SSimon Schubert } 406*8b5d8148SSascha Wildner else 4074b6a78b7SSimon Schubert { 4084b6a78b7SSimon Schubert MPN_NORMALIZE_NOT_ZERO (u1, un); 4094b6a78b7SSimon Schubert MPN_COPY (up, u1, un); 4104b6a78b7SSimon Schubert *usizep = un; 411*8b5d8148SSascha Wildner } 4124b6a78b7SSimon Schubert 4134b6a78b7SSimon Schubert TMP_FREE; 4144b6a78b7SSimon Schubert return n; 4154b6a78b7SSimon Schubert } 4164b6a78b7SSimon Schubert else if (mpn_zero_p (u0, un)) 4174b6a78b7SSimon Schubert { 4184b6a78b7SSimon Schubert mp_size_t gn; 4194b6a78b7SSimon Schubert ASSERT (un == 1); 4204b6a78b7SSimon Schubert ASSERT (u1[0] == 1); 4214b6a78b7SSimon Schubert 4224b6a78b7SSimon Schubert /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */ 4234b6a78b7SSimon Schubert gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp); 4244b6a78b7SSimon Schubert 4254b6a78b7SSimon Schubert TMP_FREE; 4264b6a78b7SSimon Schubert return gn; 4274b6a78b7SSimon Schubert } 4284b6a78b7SSimon Schubert else 4294b6a78b7SSimon Schubert { 4304b6a78b7SSimon Schubert /* We have A = ... a + ... b 4314b6a78b7SSimon Schubert B = u0 a + u1 b 4324b6a78b7SSimon Schubert 4334b6a78b7SSimon Schubert a = u1 A + ... B 4344b6a78b7SSimon Schubert b = -u0 A + ... B 4354b6a78b7SSimon Schubert 4364b6a78b7SSimon Schubert with bounds 4374b6a78b7SSimon Schubert 4384b6a78b7SSimon Schubert |u0|, |u1| <= B / min(a, b) 4394b6a78b7SSimon Schubert 4404b6a78b7SSimon Schubert Compute g = u a + v b = (u u1 - v u0) A + (...) B 4414b6a78b7SSimon Schubert Here, u, v are bounded by 4424b6a78b7SSimon Schubert 4434b6a78b7SSimon Schubert |u| <= b, 4444b6a78b7SSimon Schubert |v| <= a 4454b6a78b7SSimon Schubert */ 4464b6a78b7SSimon Schubert 4474b6a78b7SSimon Schubert mp_size_t u0n; 4484b6a78b7SSimon Schubert mp_size_t u1n; 4494b6a78b7SSimon Schubert mp_size_t lehmer_un; 4504b6a78b7SSimon Schubert mp_size_t lehmer_vn; 4514b6a78b7SSimon Schubert mp_size_t gn; 4524b6a78b7SSimon Schubert 4534b6a78b7SSimon Schubert mp_ptr lehmer_up; 4544b6a78b7SSimon Schubert mp_ptr lehmer_vp; 4554b6a78b7SSimon Schubert int negate; 4564b6a78b7SSimon Schubert 4574b6a78b7SSimon Schubert lehmer_up = tp; tp += n; 4584b6a78b7SSimon Schubert 4594b6a78b7SSimon Schubert /* Call mpn_gcdext_lehmer_n with copies of a and b. */ 4604b6a78b7SSimon Schubert MPN_COPY (tp, ap, n); 4614b6a78b7SSimon Schubert MPN_COPY (tp + n, bp, n); 4624b6a78b7SSimon Schubert gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n); 4634b6a78b7SSimon Schubert 4644b6a78b7SSimon Schubert u0n = un; 4654b6a78b7SSimon Schubert MPN_NORMALIZE (u0, u0n); 4664b6a78b7SSimon Schubert if (lehmer_un == 0) 4674b6a78b7SSimon Schubert { 4684b6a78b7SSimon Schubert /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */ 4694b6a78b7SSimon Schubert MPN_COPY (up, u0, u0n); 4704b6a78b7SSimon Schubert *usizep = -u0n; 4714b6a78b7SSimon Schubert 4724b6a78b7SSimon Schubert TMP_FREE; 4734b6a78b7SSimon Schubert return gn; 4744b6a78b7SSimon Schubert } 4754b6a78b7SSimon Schubert 4764b6a78b7SSimon Schubert lehmer_vp = tp; 4774b6a78b7SSimon Schubert /* Compute v = (g - u a) / b */ 4784b6a78b7SSimon Schubert lehmer_vn = compute_v (lehmer_vp, 4794b6a78b7SSimon Schubert ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1); 4804b6a78b7SSimon Schubert 4814b6a78b7SSimon Schubert if (lehmer_un > 0) 4824b6a78b7SSimon Schubert negate = 0; 4834b6a78b7SSimon Schubert else 4844b6a78b7SSimon Schubert { 4854b6a78b7SSimon Schubert lehmer_un = -lehmer_un; 4864b6a78b7SSimon Schubert negate = 1; 4874b6a78b7SSimon Schubert } 4884b6a78b7SSimon Schubert 4894b6a78b7SSimon Schubert u1n = un; 4904b6a78b7SSimon Schubert MPN_NORMALIZE (u1, u1n); 4914b6a78b7SSimon Schubert 4924b6a78b7SSimon Schubert /* It's possible that u0 = 1, u1 = 0 */ 4934b6a78b7SSimon Schubert if (u1n == 0) 4944b6a78b7SSimon Schubert { 4954b6a78b7SSimon Schubert ASSERT (un == 1); 4964b6a78b7SSimon Schubert ASSERT (u0[0] == 1); 4974b6a78b7SSimon Schubert 4984b6a78b7SSimon Schubert /* u1 == 0 ==> u u1 + v u0 = v */ 4994b6a78b7SSimon Schubert MPN_COPY (up, lehmer_vp, lehmer_vn); 5004b6a78b7SSimon Schubert *usizep = negate ? lehmer_vn : - lehmer_vn; 5014b6a78b7SSimon Schubert 5024b6a78b7SSimon Schubert TMP_FREE; 5034b6a78b7SSimon Schubert return gn; 5044b6a78b7SSimon Schubert } 5054b6a78b7SSimon Schubert 5064b6a78b7SSimon Schubert ASSERT (lehmer_un + u1n <= ualloc); 5074b6a78b7SSimon Schubert ASSERT (lehmer_vn + u0n <= ualloc); 5084b6a78b7SSimon Schubert 5094b6a78b7SSimon Schubert /* Now u0, u1, u are non-zero. We may still have v == 0 */ 5104b6a78b7SSimon Schubert 5114b6a78b7SSimon Schubert /* Compute u u0 */ 5124b6a78b7SSimon Schubert if (lehmer_un <= u1n) 5134b6a78b7SSimon Schubert /* Should be the common case */ 5144b6a78b7SSimon Schubert mpn_mul (up, u1, u1n, lehmer_up, lehmer_un); 5154b6a78b7SSimon Schubert else 5164b6a78b7SSimon Schubert mpn_mul (up, lehmer_up, lehmer_un, u1, u1n); 5174b6a78b7SSimon Schubert 5184b6a78b7SSimon Schubert un = u1n + lehmer_un; 5194b6a78b7SSimon Schubert un -= (up[un - 1] == 0); 5204b6a78b7SSimon Schubert 5214b6a78b7SSimon Schubert if (lehmer_vn > 0) 5224b6a78b7SSimon Schubert { 5234b6a78b7SSimon Schubert mp_limb_t cy; 5244b6a78b7SSimon Schubert 5254b6a78b7SSimon Schubert /* Overwrites old u1 value */ 5264b6a78b7SSimon Schubert if (lehmer_vn <= u0n) 5274b6a78b7SSimon Schubert /* Should be the common case */ 5284b6a78b7SSimon Schubert mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn); 5294b6a78b7SSimon Schubert else 5304b6a78b7SSimon Schubert mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n); 5314b6a78b7SSimon Schubert 5324b6a78b7SSimon Schubert u1n = u0n + lehmer_vn; 5334b6a78b7SSimon Schubert u1n -= (u1[u1n - 1] == 0); 5344b6a78b7SSimon Schubert 5354b6a78b7SSimon Schubert if (u1n <= un) 5364b6a78b7SSimon Schubert { 5374b6a78b7SSimon Schubert cy = mpn_add (up, up, un, u1, u1n); 5384b6a78b7SSimon Schubert } 5394b6a78b7SSimon Schubert else 5404b6a78b7SSimon Schubert { 5414b6a78b7SSimon Schubert cy = mpn_add (up, u1, u1n, up, un); 5424b6a78b7SSimon Schubert un = u1n; 5434b6a78b7SSimon Schubert } 5444b6a78b7SSimon Schubert up[un] = cy; 5454b6a78b7SSimon Schubert un += (cy != 0); 5464b6a78b7SSimon Schubert 5474b6a78b7SSimon Schubert ASSERT (un < ualloc); 5484b6a78b7SSimon Schubert } 5494b6a78b7SSimon Schubert *usizep = negate ? -un : un; 5504b6a78b7SSimon Schubert 5514b6a78b7SSimon Schubert TMP_FREE; 5524b6a78b7SSimon Schubert return gn; 5534b6a78b7SSimon Schubert } 5544b6a78b7SSimon Schubert } 555