/* mpn_mod_1_1p (ap, n, b, cps)
   Divide (ap,n) by b.  Return the single-limb remainder.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
   Based on a suggestion by Peter L. Montgomery.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008, 2009, 2010, 2011 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef MOD_1_1P_METHOD
# define MOD_1_1P_METHOD 1    /* need to make sure this is 2 for asm testing */
#endif

/* Define some longlong.h-style macros, but for wider operations.
 * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
 * carry out, in the form of a mask.  (An illustrative check of this
 * contract follows the definitions below.) */

#if defined (__GNUC__)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add %6, %k2\n\t"                                          \
             "adc %4, %k1\n\t"                                          \
             "sbb %k0, %k0"                                             \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1" ((USItype)(a1)), "g" ((USItype)(b1)),                  \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add %6, %q2\n\t"                                          \
             "adc %4, %q1\n\t"                                          \
             "sbb %q0, %q0"                                             \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)),                \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc %r5, %6, %2\n\t"                                    \
             "addxcc %r3, %4, %1\n\t"                                   \
             "subx %%g0, %%g0, %0"                                      \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)                \
         __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc %r5, %6, %2\n\t"                                    \
             "addccc %r7, %8, %%g0\n\t"                                 \
             "addccc %r3, %4, %1\n\t"                                   \
             "clr %0\n\t"                                               \
             "movcs %%xcc, -1, %0"                                      \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),               \
             "rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
         __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add%I6c %2, %5, %6\n\t"                                   \
             "adde %1, %3, %4\n\t"                                      \
             "subfe %0, %0, %0\n\t"                                     \
             "nor %0, %0, %0"                                           \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "algr %2, %6\n\t"                                          \
             "alcgr %1, %4\n\t"                                         \
             "lghi %0, 0\n\t"                                           \
             "alcgr %0, %0\n\t"                                         \
             "lcgr %0, %0"                                              \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)),                  \
             "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "adds %2, %5, %6\n\t"                                      \
             "adcs %1, %3, %4\n\t"                                      \
             "movcc %0, #0\n\t"                                         \
             "movcs %0, #-1"                                            \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (m) = - (__c1 + (__s1 < __c0));                                     \
  } while (0)
#endif
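/* Illustrative check of the add_mssaaaa contract -- not part of the library.
   A minimal sketch, under a hypothetical guard WANT_MOD_1_1_ILLUSTRATIONS
   that no build defines: (s1,s0) receives the two-limb sum (a1,a0) + (b1,b0),
   and m receives the carry out of the high limb as a mask, 0 or ~0.  */
#ifdef WANT_MOD_1_1_ILLUSTRATIONS
static void
check_add_mssaaaa (void)
{
  mp_limb_t m, s1, s0;

  /* (~0,~0) + (0,1) wraps around to (0,0) with a carry out of the high
     limb, so the mask must be all ones.  */
  add_mssaaaa (m, s1, s0, ~CNST_LIMB(0), ~CNST_LIMB(0),
               CNST_LIMB(0), CNST_LIMB(1));
  ASSERT_ALWAYS (s1 == 0 && s0 == 0 && m == ~CNST_LIMB(0));

  /* (1,2) + (3,4) = (4,6) with no carry, so the mask must be zero.  */
  add_mssaaaa (m, s1, s0, CNST_LIMB(1), CNST_LIMB(2),
               CNST_LIMB(3), CNST_LIMB(4));
  ASSERT_ALWAYS (s1 == 4 && s0 == 6 && m == 0);
}
#endif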
#if MOD_1_1P_METHOD == 1
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  B1modb = -b;
  if (LIKELY (cnt != 0))
    B1modb *= ((bi >> (GMP_LIMB_BITS - cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
  cps[2] = B1modb >> cnt;

  /* In the normalized case, this can be simplified to
   *
   *   B2modb = - b * bi;
   *   ASSERT (B2modb <= b);	// NB: equality iff b = B/2
   */
  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
  cps[3] = B2modb >> cnt;
}

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
{
  mp_limb_t rh, rl, bi, ph, pl, r;
  mp_limb_t B1modb, B2modb;
  mp_size_t i;
  int cnt;
  mp_limb_t mask;

  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */

  B1modb = bmodb[2];
  B2modb = bmodb[3];

  rl = ap[n - 1];
  umul_ppmm (ph, pl, rl, B1modb);
  add_ssaaaa (rh, rl, ph, pl, 0, ap[n - 2]);

  for (i = n - 3; i >= 0; i -= 1)
    {
      /* rr = ap[i]                       < B
	    + LO(rr) * (B mod b)          <= (B-1)(b-1)
	    + HI(rr) * (B^2 mod b)        <= (B-1)(b-1)
      */
      umul_ppmm (ph, pl, rl, B1modb);
      add_ssaaaa (ph, pl, ph, pl, 0, ap[i]);

      umul_ppmm (rh, rl, rh, B2modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
    }

  cnt = bmodb[1];
  bi = bmodb[0];

  if (LIKELY (cnt != 0))
    rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));

  mask = -(mp_limb_t) (rh >= b);
  rh -= mask & b;

  udiv_rnnd_preinv (r, rh, rl << cnt, b, bi);

  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 1 */
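/* Illustrative reference -- not part of the library.  A minimal sketch,
   under the hypothetical guard WANT_MOD_1_1_ILLUSTRATIONS that no build
   defines, of what mpn_mod_1_1p computes: the remainder of {ap,n} modulo
   the (unshifted) divisor b, done the straightforward way with schoolbook
   division.  Both methods in this file replace this per-limb division with
   multiplications by the precomputed B mod b and B^2 mod b, leaving a
   single preinverted division at the end.  */
#ifdef WANT_MOD_1_1_ILLUSTRATIONS
static mp_limb_t
mod_1_1p_reference (mp_srcptr ap, mp_size_t n, mp_limb_t b)
{
  mp_limb_t q, r, prev, cur;
  mp_size_t i;
  int cnt;

  ASSERT (n >= 1);
  ASSERT (b > 0);

  count_leading_zeros (cnt, b);
  b <<= cnt;			/* normalize the divisor */

  /* Divide {ap,n} * 2^cnt by the normalized divisor, one limb at a time,
     most significant limb first; the quotient q is discarded.  The shift
     multiplies both operands by 2^cnt, so the remainder comes out shifted
     by cnt as well.  */
  r = 0;
  prev = 0;
  for (i = n - 1; i >= 0; i--)
    {
      cur = ap[i];
      udiv_qrnnd (q, r, r,
		  (prev << cnt) | (cnt != 0 ? cur >> (GMP_LIMB_BITS - cnt) : 0),
		  b);
      prev = cur;
    }
  udiv_qrnnd (q, r, r, prev << cnt, b);

  return r >> cnt;
}
#endif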
#if MOD_1_1P_METHOD == 2
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  if (LIKELY (cnt != 0))
    {
      mp_limb_t B1modb = -b;
      B1modb *= ((bi >> (GMP_LIMB_BITS - cnt)) | (CNST_LIMB(1) << cnt));
      ASSERT (B1modb <= b);	/* NB: not fully reduced mod b */
      cps[2] = B1modb >> cnt;
    }
  B2modb = - b * bi;
  ASSERT (B2modb <= b);		/* NB: equality iff b = B/2 */
  cps[3] = B2modb;
}

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
{
  int cnt;
  mp_limb_t bi, B1modb;
  mp_limb_t r0, r1;
  mp_limb_t r;

  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */

  r0 = ap[n-2];
  r1 = ap[n-1];

  if (n > 2)
    {
      mp_limb_t B2modb, B2mb;
      mp_limb_t p0, p1;
      mp_limb_t r2;
      mp_size_t j;

      B2modb = bmodb[3];
      B2mb = B2modb - b;

      umul_ppmm (p1, p0, r1, B2modb);
      add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);

      for (j = n-4; j >= 0; j--)
	{
	  mp_limb_t cy;
	  /* mp_limb_t t = r0 + B2mb; */
	  umul_ppmm (p1, p0, r1, B2modb);

	  ADDC_LIMB (cy, r0, r0, r2 & B2modb);
	  /* Alternative, for cmov: if (cy) r0 = t; */
	  r0 -= (-cy) & b;
	  add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
	}

      r1 -= (r2 & b);
    }

  cnt = bmodb[1];

  if (LIKELY (cnt != 0))
    {
      mp_limb_t t;
      mp_limb_t B1modb = bmodb[2];

      umul_ppmm (r1, t, r1, B1modb);
      r0 += t;
      r1 += (r0 < t);

      /* Normalize */
      r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
      r0 <<= cnt;

      /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows
	 that.  */
    }
  else
    {
      mp_limb_t mask = -(mp_limb_t) (r1 >= b);
      r1 -= mask & b;
    }

  bi = bmodb[0];

  udiv_rnnd_preinv (r, r1, r0, b, bi);
  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 2 */
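/* Illustrative usage -- not part of the library.  A minimal sketch, under
   the hypothetical guard WANT_MOD_1_1_ILLUSTRATIONS that no build defines,
   of how the two entry points above fit together; this mirrors roughly how
   mpn_mod_1 dispatches to them.  Note that mpn_mod_1_1p expects the divisor
   already shifted into normalized position; cps[1] holds the shift count.  */
#ifdef WANT_MOD_1_1_ILLUSTRATIONS
static mp_limb_t
mod_1_1p_example (mp_srcptr ap, mp_size_t n, mp_limb_t b)
{
  mp_limb_t pre[4];

  ASSERT (n >= 2);
  ASSERT (b > 0);

  mpn_mod_1_1p_cps (pre, b);	/* one-time precomputation for divisor b */
  return mpn_mod_1_1p (ap, n, b << pre[1], pre);  /* reduce {ap,n} mod b */
}
#endif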