/* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder.

Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2003, 2010 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#include "mpn/sparc64/sparc64.h"


/*                 64-bit divisor   32-bit divisor
                    cycles/limb      cycles/limb
                     (approx)         (approx)
   Ultrasparc 2i:      160              120
*/


/* 32-bit divisors are treated in special case code.  This requires 4 mulx
   per limb instead of 8 in the general case.

   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
   addressing, to get the two halves of each limb read in the correct
   order.  This is kept in an adj variable.  Doing that measures about 6
   c/l faster than just writing HALF_ENDIAN_ADJ(i) in the loop.  The
   latter shouldn't be 6 cycles worth of work, but perhaps it doesn't
   schedule well (on gcc 3.2.1 at least).

   A simple udivx/umulx loop for the 32-bit case was attempted for small
   sizes, but at size==2 it was only about the same speed and at size==3
   it was slower.  */
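/* The following is a minimal sketch, excluded from compilation, of the
   pre-inverted division technique used below through
   udiv_qrnnd_half_preinv (and analogously, for 64-bit divisors, through
   udiv_qrnnd_preinv).  It is for exposition only: the names are local to
   the sketch, and the assumption dinv == floor ((2^64 - 1) / d) - 2^32 is
   ours, not necessarily the exact form produced by invert_half_limb in
   mpn/sparc64/sparc64.h.  Given a normalized 32-bit divisor d (high bit
   set) and n1 < d, two multiplies plus a couple of single-word adjustment
   steps replace a hardware divide, which is the point of precomputing the
   inverse once per call rather than dividing once per limb.  */
#if 0
static unsigned
udiv_qrnnd_half_sketch (unsigned *qp, unsigned n1, unsigned n0,
                        unsigned d, unsigned dinv)
{
  /* candidate quotient: 1 plus the high half of dinv*n1 + {n1,n0} */
  mp_limb_t  two_q = (mp_limb_t) n1 * dinv + (((mp_limb_t) n1 << 32) | n0);
  unsigned   q1 = (unsigned) (two_q >> 32) + 1;
  unsigned   q0 = (unsigned) two_q;
  unsigned   r = n0 - q1 * d;   /* candidate remainder, mod 2^32 */

  if (r > q0)       /* q1 was one too big, step down */
    {
      q1--;
      r += d;
    }
  if (r >= d)       /* rare: q1 one too small, step up */
    {
      q1++;
      r -= d;
    }
  *qp = q1;
  return r;
}
#endif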
static mp_limb_t
mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
{
  int        norm, norm_rshift;
  mp_limb_t  src_high_limb;
  mp_size_t  i;

  ASSERT (size_limbs >= 0);
  ASSERT (d_limb != 0);

  if (UNLIKELY (size_limbs == 0))
    return 0;

  src_high_limb = src_limbptr[size_limbs-1];

  /* udivx is good for size==1, and there's no need to bother checking
     limb < divisor; if that case is likely then the caller should check
     before calling.  */
  if (UNLIKELY (size_limbs == 1))
    return src_high_limb % d_limb;

  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
    {
      unsigned   *src, n1, n0, r, dummy_q, nshift, norm_rmask;
      mp_size_t  size, adj;
      mp_limb_t  dinv_limb;

      size = 2 * size_limbs;    /* halfwords */
      src = (unsigned *) src_limbptr;

      /* prospective initial remainder, if < d */
      r = src_high_limb >> 32;

      /* If the length of the source is uniformly distributed, then
         there's a 50% chance of the high 32 bits of the top limb being
         zero, and skipping them saves a division.  */
      if (r == 0)
        {
          r = (unsigned) src_high_limb;
          size--;
          ASSERT (size > 0);  /* because size is even before the decrement */
        }

      /* Skip a division if high < divisor.  Having the test here before
         normalizing will still skip as often as possible.  */
      if (r < d_limb)
        {
          size--;
          ASSERT (size > 0);  /* because size==1 was handled above */
        }
      else
        r = 0;

      count_leading_zeros_32 (norm, d_limb);
      norm -= 32;
      d_limb <<= norm;

      /* norm_rmask zeroes the shifted-in contribution when norm == 0,
         where norm_rshift would be a full-width, nominally out-of-range
         shift count */
      norm_rshift = 32 - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
      i = size-1;
      adj = HALF_ENDIAN_ADJ (i);
      n1 = src [i + adj];
      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);

      invert_half_limb (dinv_limb, d_limb);
      adj = -adj;

      for (i--; i >= 0; i--)
        {
          n0 = src [i + adj];
          adj = -adj;
          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
          udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
          n1 = n0;
        }

      /* same as the loop, but without n0 */
      nshift = n1 << norm;
      udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);

      ASSERT ((r & ((1 << norm) - 1)) == 0);
      return r >> norm;
    }
  else
    {
      mp_srcptr  src;
      mp_size_t  size;
      mp_limb_t  n1, n0, r, dinv, dummy_q, nshift, norm_rmask;

      src = src_limbptr;
      size = size_limbs;
      r = src_high_limb;  /* initial remainder */

      /* Skip a division if high < divisor.  Having the test here before
         normalizing will still skip as often as possible.  */
      if (r < d_limb)
        {
          size--;
          ASSERT (size > 0);  /* because size==1 was handled above */
        }
      else
        r = 0;

      count_leading_zeros (norm, d_limb);
      d_limb <<= norm;

      /* a 32-bit mask suffices here: d_limb > 0xFFFFFFFF means norm <= 31,
         so the right shift leaves at most 31 bits */
      norm_rshift = GMP_LIMB_BITS - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);

      src += size;
      n1 = *--src;
      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);

      invert_limb (dinv, d_limb);

      for (i = size-2; i >= 0; i--)
        {
          n0 = *--src;
          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
          udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
          n1 = n0;
        }

      /* same as the loop, but without n0 */
      nshift = n1 << norm;
      udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);

      ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0);
      return r >> norm;
    }
}

mp_limb_t
mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
{
  ASSERT (n >= 0);
  ASSERT (b != 0);

  /* Should this be handled at all?  Rely on callers?  Note n==0 is
     currently required by mpz/fdiv_r_ui.c and possibly other places.  */
  if (n == 0)
    return 0;

  if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0))
    {
      /* b is already normalized, so no shift is needed */
      if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD))
        {
          return mpn_mod_1_anynorm (ap, n, b);
        }
      else
        {
          mp_limb_t pre[4];
          mpn_mod_1_1p_cps (pre, b);
          return mpn_mod_1_1p (ap, n, b, pre);
        }
    }
  else
    {
      if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD))
        {
          return mpn_mod_1_anynorm (ap, n, b);
        }
      else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD))
        {
          /* pre[1] is the normalization shift count computed by the _cps
             setup, so b << pre[1] passes the normalized divisor */
          mp_limb_t pre[4];
          mpn_mod_1_1p_cps (pre, b);
          return mpn_mod_1_1p (ap, n, b << pre[1], pre);
        }
      else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD)
               || UNLIKELY (b > GMP_NUMB_MASK / 4))
        {
          mp_limb_t pre[5];
          mpn_mod_1s_2p_cps (pre, b);
          return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
        }
      else
        {
          mp_limb_t pre[7];
          mpn_mod_1s_4p_cps (pre, b);
          return mpn_mod_1s_4p (ap, n, b << pre[1], pre);
        }
    }
}
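/* A usage sketch, excluded from compilation: mpn operands are arrays of
   limbs stored least significant limb first, so with 64-bit limbs the
   array {3, 2} represents 2*2^64 + 3.  Since 2^64 mod 7 == 2, that value
   is congruent to 2*2 + 3 == 7 == 0 mod 7.  The function name here is
   ours, purely for illustration.  */
#if 0
static void
mpn_mod_1_usage_sketch (void)
{
  static const mp_limb_t  np[2] = { CNST_LIMB(3), CNST_LIMB(2) };
  mp_limb_t  r = mpn_mod_1 (np, (mp_size_t) 2, CNST_LIMB(7));
  ASSERT_ALWAYS (r == 0);
}
#endif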