/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a;b) M.  Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, the result may
   be non-normalized too).  Temporary space needed is M->n + n.  */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ((ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ((rp[n-1] | bp[n-1]) == 0)
	n--;
    }

  return n;
}
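
/* Worked example for hgcd_mul_matrix_vector above (exposition only,
   not library code): with M = (u00 u01; u10 u11) = (1 1; 2 3) and the
   input vector (a; b) = (5; 7), the schedule in the comment produces
   r = u00*a + u10*b = 1*5 + 2*7 = 19 and b = u01*a + u11*b
   = 1*5 + 3*7 = 26, i.e. (r;b) = (a;b) M as claimed.  */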

#define COMPUTE_V_ITCH(n) (2*(n) + 1)

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign.  a, b are of size n, u and
   v at most size n, and v must have space for n+1 limbs.  */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
	tp[size++] = cy;
    }

  /* Now divide t / b.  There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  /* FIXME: Use divexact.  Or do the entire calculation mod 2^{n *
     GMP_NUMB_BITS}. */
  mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  /* Remainder must be zero */
#if WANT_ASSERT
  {
    mp_size_t i;
    for (i = 0; i < bn; i++)
      {
	ASSERT (tp[i] == 0);
      }
  }
#endif
  return vn;
}
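
/* Worked example for compute_v above (exposition only): with a = 12,
   b = 8, g = 4 and cofactor u = 1 (usize > 0), the first branch forms
   u*a - g = 12 - 4 = 8, and the division by b gives |v| = 1.  Since v
   has the opposite sign of u, this is v = -1, consistent with
   4 = 1*12 + (-1)*8.  The division is exact, as the assertions check.  */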

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and
   2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD.  For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space.  Next, for compute_v, we need
   T + 1 for v and 2T + 1 scratch space.  In all, 7T + 3 is sufficient.  */

/* Optimal choice of p seems difficult.  In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u.  It may be desirable to use a different
 * choice of p in each iteration.  Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input sizes just above the threshold, but
 * degrade performance for larger inputs.  */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
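
/* For instance, with n = 3000 the first call runs hgcd on the high
   n - n/2 = 1500 limbs, while subsequent iterations use p = n/3 and
   run hgcd on the high 2000 limbs, trading hgcd work against the cost
   of updating the (by then nonzero) cofactors u0 and u1.  */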

mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call.  Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
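
  /* Cofactor bookkeeping for the divide-and-conquer phase: u0 and u1
     are maintained so that B = u0 a + u1 b, where A and B denote the
     values of a and b at this point (see the invariant spelled out
     before the final cofactor combination below).  */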

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet.  For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ((u0[un-1] | u1[un-1]) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed.  Then either one of a or b is very
	   small, or the difference is very small.  Perform one
	   subtraction followed by one division. */
	mp_size_t gn;
	mp_size_t updated_un = 1;

	u1[0] = 1;

	/* Temporary storage 2n + 1 */
	n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				    u0, u1, &updated_un, tp, tp + n);
	if (n == 0)
	  {
	    TMP_FREE;
	    return gn;
	  }

	un = updated_un;
	ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers.  May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ((u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed.  Then either one of a or b is very
	     small, or the difference is very small.  Perform one
	     subtraction followed by one division. */
	  mp_size_t gn;
	  mp_size_t updated_un = un;

	  /* Temporary storage 2n + 1 */
	  n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				      u0, u1, &updated_un, tp, tp + n);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return gn;
	    }

	  un = updated_un;
	  ASSERT (un < ualloc);
	}
    }

  if (mpn_zero_p (ap, n))
    {
      MPN_COPY (gp, bp, n);
      MPN_NORMALIZE_NOT_ZERO (u0, un);
      MPN_COPY (up, u0, un);
      *usizep = -un;

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (bp, n))
    {
      MPN_COPY (gp, ap, n);
      MPN_NORMALIZE_NOT_ZERO (u1, un);
      MPN_COPY (up, u1, un);
      *usizep = un;

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have

	     A = ... a + ... b
	     B =  u0 a +  u1 b

	     a =  u1  A + ... B
	     b = -u0  A + ... B

	 with bounds

	     |u0|, |u1| <= B / min(a, b)

	 Compute g = u a + v b = (u u1 - v u0) A + (...) B
	 Here, u, v are bounded by

	     |u| <= b,
	     |v| <= a
      */
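
      /* Note (added for clarity): substituting a = u1 A + ... B and
	 b = -u0 A + ... B into g = u a + v b shows that the
	 A-coefficient is u u1 - v u0.  Since u and v have opposite
	 signs, its magnitude is |u| u1 + |v| u0, which is why the
	 code below multiplies and adds magnitudes and leaves the
	 sign to the `negate' flag.  */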

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
	{
	  /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
	{
	  ASSERT (un == 1);
	  ASSERT (u0[0] == 1);

	  /* u1 == 0 ==> u u1 + v u0 = v */
	  MPN_COPY (up, lehmer_vp, lehmer_vn);
	  *usizep = negate ? lehmer_vn : -lehmer_vn;

	  TMP_FREE;
	  return gn;
	}

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero.  We may still have v == 0 */

      /* Compute u u1 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
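
#if 0
/* Minimal, hypothetical usage sketch (deliberately not compiled): it
   exercises only what this file itself guarantees, namely an >= n > 0,
   that both operands are clobbered, that the return value is the limb
   size of g, and that *usizep carries the cofactor size with its sign.
   The names and expected values below are illustrative assumptions,
   not part of the library.  */
static void
gcdext_demo (void)
{
  mp_limb_t a[1] = { 240 };
  mp_limb_t b[1] = { 46 };
  mp_limb_t g[1];
  mp_limb_t u[2];		/* cofactor may need n + 1 limbs */
  mp_size_t usize;
  mp_size_t gn = mpn_gcdext (g, u, &usize, a, 1, b, 1);

  /* gcd(240, 46) = 2, so we expect g[0] == 2 and gn == 1.  One valid
     cofactor pair is 2 = (-9)*240 + 47*46, which would be reported as
     u[0] == 9 with usize == -1.  */
  (void) gn;
}
#endif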