dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder

dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C EV4:    47
C EV5:    30
C EV6:    15


C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
C                                mp_limb_t c)
C
C This code follows the "alternate" code in mpn/generic/mode1o.c,
C eliminating cbit+climb from the dependent chain.  This leaves,
C
C          ev4   ev5   ev6
C           1     3     1    subq   y = x - h
C          23    13     7    mulq   q = y * inverse
C          23    14     7    umulh  h = high (q * d)
C          --    --    --
C          47    30    15
C
C In each case, the load latency, loop control, and extra carry bit handling
C hide under the multiply latencies.  Those latencies are long enough that
C we don't need to worry about alignment or pairing to squeeze out
C performance.
C
C For the first limb, some of the loop code is broken out and scheduled back
C since it can be done earlier.
C
C   - The first ldq src[0] is near the start of the routine, for maximum
C     time from memory.
C
C   - The subq y=x-climb can be done without waiting for the inverse.
C
C   - The mulq y*inverse is replicated after the final subq for the inverse,
C     instead of branching to the mulq in the main loop.  On ev4 a branch
C     there would cost cycles, but we can hide them under the mulq latency.
C
C For the last limb, high<divisor is tested and if that's true a subtract
C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
C data-dependent branch, but we're waiting for umulh so any penalty should
C hide there.  The multiplies saved would be worth the cost anyway.
C
C Enhancements:
C
C For size==1, a plain division (done bitwise say) might be faster than
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
C ev5.  A call to gcc's __remqu support routine might be a possibility.
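
C The following is a rough C sketch of the method, as a reference for the
C assembler below.  It is not the actual mpn/generic/mode1o.c source: the
C limb size is fixed at 64 bits, unsigned __int128 stands in for umulh,
C and the function name modexact_1c_odd is illustrative only.
C binvert_limb_table is GMP's 128-entry table of 8-bit binary inverses.
C
C	#include <stdint.h>
C	#include <stddef.h>
C
C	extern const unsigned char binvert_limb_table[128]; /* gmp-impl.h */
C
C	uint64_t
C	modexact_1c_odd (const uint64_t *src, size_t size,
C	                 uint64_t d, uint64_t c)
C	{
C	  uint64_t  inv, s, x, y, q, climb, cbit;
C	  size_t    i;
C
C	  /* inverse of d mod 2^64: 8-bit table lookup, then three Newton
C	     steps inv = 2*inv - inv*inv*d, doubling the good bits */
C	  inv = binvert_limb_table[(d >> 1) & 0x7F];
C	  inv = 2*inv - inv*inv*d;	/* 16 bits */
C	  inv = 2*inv - inv*inv*d;	/* 32 bits */
C	  inv = 2*inv - inv*inv*d;	/* 64 bits */
C
C	  climb = c;
C	  cbit = 0;
C
C	  for (i = 0; i + 1 < size; i++)	/* all but the high limb */
C	    {
C	      s = src[i];
C	      x = s - cbit;
C	      y = x - climb;
C	      cbit = (s < cbit) + (x < climb);	/* borrows out, 0 to 2 */
C	      q = y * inv;		/* so that q*d == y mod 2^64 */
C	      climb = (uint64_t) (((unsigned __int128) q * d) >> 64);
C	    }
C
C	  s = src[size-1];
C	  if (size >= 2 && s < d)
C	    {
C	      /* high<divisor: the L(skip) subtract and addback */
C	      c = climb + cbit;
C	      return (c >= s ? c - s : c - s + d);
C	    }
C
C	  /* otherwise one more ordinary step, the L(one) path */
C	  x = s - cbit;
C	  y = x - climb;
C	  cbit = (s < cbit) + (x < climb);
C	  q = y * inv;
C	  climb = (uint64_t) (((unsigned __int128) q * d) >> 64);
C	  return climb + cbit;
C	}
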

ASM_START()
PROLOGUE(mpn_modexact_1c_odd,gp)

	C r16	src
	C r17	size
	C r18	d
	C r19	c

	LEA(r0, binvert_limb_table)
	srl	r18, 1, r20		C d >> 1

	and	r20, 127, r20		C idx = d>>1 & 0x7F

	addq	r0, r20, r21		C table + idx

ifelse(bwx_available_p,1,
`	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
',`	ldq_u	r20, 0(r21)		C table[idx] qword
	extbl	r20, r21, r20		C table[idx], inverse 8 bits
')

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	ldq	r2, 0(r16)		C x = s = src[0]
	lda	r17, -1(r17)		C size--
	clr	r0			C initial cbit=0

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits

	mulq	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mulq	r7, r18, r7		C i*i*d
	subq	r2, r19, r3		C y = x - climb

	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits

ASSERT(r7,	C should have d*inv==1 mod 2^64
`	mulq	r18, r20, r7
	cmpeq	r7, 1, r7')

	mulq	r3, r20, r4		C first q = y * inv

	beq	r17, L(one)		C if size==1
	br	L(entry)


L(top):
	C r0	cbit
	C r16	src, incrementing
	C r17	size, decrementing
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[i]
	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(entry):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2
	lda	r16, 8(r16)		C src++
	lda	r17, -1(r17)		C size--

	umulh	r4, r18, r19		C climb = high (q * d)
	bne	r17, L(top)		C while 2 or more limbs left


	C r0	cbit
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[size-1], high limb

	cmpult	r1, r18, r2		C test high<divisor
	bne	r2, L(skip)		C skip if so

	C can't skip a division, repeat loop code

	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(one):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2

	umulh	r4, r18, r19		C climb = high (q * d)

	addq	r19, r0, r0		C return climb + cbit
	ret	r31, (r26), 1


	ALIGN(8)
L(skip):
	C with high<divisor, the final step can be just (cbit+climb)-s and
	C an addback of d if that underflows

	addq	r19, r0, r19		C c = climb + cbit

	subq	r19, r1, r2		C c - s
	cmpult	r19, r1, r3		C c < s

	addq	r2, r18, r0		C return c-s + divisor

	cmoveq	r3, r2, r0		C return c-s if no underflow
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()
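
C Usage note: with c=0 the return value is zero exactly when d divides
C {src,size}; mpn_divisible_p relies on this, via mpn_modexact_1_odd, for
C odd one-limb divisors.  A hypothetical caller, under the same 64-bit
C limb assumption as the sketch above:
C
C	int
C	divisible_by_odd_limb (const uint64_t *p, size_t n, uint64_t d)
C	{
C	  return mpn_modexact_1c_odd (p, n, d, 0) == 0;
C	}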