/* UltraSPARC 64 mpn_modexact_1c_odd -- mpn by limb exact style remainder.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#include "mpn/sparc64/sparc64.h"


/*                  64-bit divisor   32-bit divisor
                     cycles/limb      cycles/limb
                       (approx)         (approx)
   Ultrasparc 2i:         ?                ?
*/


/* This implementation reduces the number of multiplies done, knowing that
   on ultrasparc 1 and 2 the mulx instruction stalls the whole chip.

   The key idea is to use the fact that the low limb of q*d equals l, this
   being the whole purpose of the q calculated.  It means there's no need
   to calculate the lowest 32x32->64 part of q*d; instead it can be
   inferred from l and the other three 32x32->64 parts.  See sparc64.h for
   details.

   When d is 32-bits, the same applies, but in this case there's only one
   other 32x32->64 part (ie. HIGH(q)*d).

   The net effect is that for a 64-bit divisor each limb takes 4 mulx, and
   for a 32-bit divisor each takes 2 mulx.

   Enhancements:

   No doubt this could be done in assembler, if that helped the scheduling,
   or perhaps guaranteed good code irrespective of the compiler.

   Alternatives:

   It might be possible to use floating point.  The loop is dominated by
   multiply latency, so it's not clear floats would improve that.  One
   possibility would be to take two limbs at a time, with a 128 bit
   inverse, if there are enough registers, which could effectively use
   float throughput to reduce total latency across two limbs.  */
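
/* Illustration only, compiled out below: a plain C sketch of the
   "lowequal" reconstruction described above, assuming HIGH32/LOW32 split a
   limb into its 32-bit halves as in sparc64.h.  Given q, the halves dh,dl
   of d, and the already-known low limb l = q*d mod 2^64, the high limb of
   q*d follows from just the three remaining 32x32->64 products.  The
   helper name is hypothetical and is only meant to show the arithmetic;
   the macros actually used further down are umul_ppmm_lowequal and
   umul_ppmm_half_lowequal from sparc64.h.  */
#if 0
static mp_limb_t
lowequal_high_sketch (mp_limb_t q, mp_limb_t dh, mp_limb_t dl, mp_limb_t l)
{
  mp_limb_t  t2  = HIGH32 (q) * dl;            /* cross product qh*dl */
  mp_limb_t  t3  = LOW32 (q) * dh;             /* cross product ql*dh */
  mp_limb_t  mid = LOW32 (t2) + LOW32 (t3);    /* middle column, low halves */

  /* The low product ql*dl is never formed; its high 32 bits are implied by
     l.  The only extra information needed is whether adding them into
     LOW32(mid) wrapped past 2^32, which happened exactly when HIGH32(l)
     ends up smaller than LOW32(mid).  */
  mp_limb_t  carry = HIGH32 (mid) + (HIGH32 (l) < LOW32 (mid));

  return HIGH32 (q) * dh + HIGH32 (t2) + HIGH32 (t3) + carry;
}
#endif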

#define ASSERT_RETVAL(r)                \
  ASSERT (orig_c < d ? r < d : r <= d)

mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c)
{
  mp_limb_t  c = orig_c;
  mp_limb_t  s, l, q, h, inverse;

  ASSERT (size >= 1);
  ASSERT (d & 1);
  ASSERT_MPN (src, size);
  ASSERT_LIMB (d);
  ASSERT_LIMB (c);

  /* udivx is faster than 10 or 12 mulx's for one limb via an inverse */
  if (size == 1)
    {
      s = src[0];
      if (s > c)
        {
          l = s-c;
          h = l % d;
          if (h != 0)
            h = d - h;
        }
      else
        {
          l = c-s;
          h = l % d;
        }
      return h;
    }

  binvert_limb (inverse, d);

  if (d <= 0xFFFFFFFF)
    {
      s = *src++;
      size--;
      do
        {
          SUBC_LIMB (c, l, s, c);
          s = *src++;
          q = l * inverse;
          umul_ppmm_half_lowequal (h, q, d, l);
          c += h;
          size--;
        }
      while (size != 0);

      if (s <= d)
        {
          /* With high s <= d the final step can be a subtract and addback.
             If c==0 then the addback will restore to l>=0.  If c==d then
             we get l==d when s==0, but that's ok per the function
             definition.  */

          l = c - s;
          l += (l > c ? d : 0);

          ASSERT_RETVAL (l);
          return l;
        }
      else
        {
          /* Can't skip a divide, just do the loop code once more. */
          SUBC_LIMB (c, l, s, c);
          q = l * inverse;
          umul_ppmm_half_lowequal (h, q, d, l);
          c += h;

          ASSERT_RETVAL (c);
          return c;
        }
    }
  else
    {
      mp_limb_t  dl = LOW32 (d);
      mp_limb_t  dh = HIGH32 (d);

      s = *src++;
      size--;
      do
        {
          SUBC_LIMB (c, l, s, c);
          s = *src++;
          q = l * inverse;
          umul_ppmm_lowequal (h, q, d, dh, dl, l);
          c += h;
          size--;
        }
      while (size != 0);

      if (s <= d)
        {
          /* With high s <= d the final step can be a subtract and addback.
             If c==0 then the addback will restore to l>=0.  If c==d then
             we get l==d when s==0, but that's ok per the function
             definition.  */

          l = c - s;
          l += (l > c ? d : 0);

          ASSERT_RETVAL (l);
          return l;
        }
      else
        {
          /* Can't skip a divide, just do the loop code once more. */
          SUBC_LIMB (c, l, s, c);
          q = l * inverse;
          umul_ppmm_lowequal (h, q, d, dh, dl, l);
          c += h;

          ASSERT_RETVAL (c);
          return c;
        }
    }
}
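
/* Usage sketch, compiled out and not part of the library: a typical caller
   treats a zero return (with c==0) as "d divides {src,size}", since the
   function computes an exact-division style remainder with respect to the
   odd divisor d.  The helper name below is hypothetical.  */
#if 0
static int
divisible_by_odd_limb_sketch (mp_srcptr src, mp_size_t size, mp_limb_t d)
{
  ASSERT (d & 1);
  return mpn_modexact_1c_odd (src, size, d, CNST_LIMB(0)) == 0;
}
#endif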