dnl  x86-32 mpn_mod_1_1p, requiring cmov.

dnl  Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
dnl
dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                          cycles/limb
C P5                            ?
C P6 model 0-8,10-12            ?
C P6 model 9  (Banias)          ?
C P6 model 13 (Dothan)          ?
C P4 model 0  (Willamette)      ?
C P4 model 1  (?)               ?
C P4 model 2  (Northwood)       ?
C P4 model 3  (Prescott)        ?
C P4 model 4  (Nocona)          ?
C AMD K6                        ?
C AMD K7                        7
C AMD K8                        ?
37 38define(`B2mb', `%ebx') 39define(`r0', `%esi') 40define(`r2', `%ebp') 41define(`t0', `%edi') 42define(`ap', `%ecx') C Also shift count 43 44C Stack frame 45C pre 36(%esp) 46C b 32(%esp) 47C n 28(%esp) 48C ap 24(%esp) 49C return 20(%esp) 50C %ebp 16(%esp) 51C %edi 12(%esp) 52C %esi 8(%esp) 53C %ebx 4(%esp) 54C B2mod (%esp) 55 56define(`B2modb', `(%esp)') 57define(`n', `28(%esp)') 58define(`b', `32(%esp)') 59define(`pre', `36(%esp)') 60 61C mp_limb_t 62C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4]) 63C 64C The pre array contains bi, cnt, B1modb, B2modb 65C Note: This implementation needs B1modb only when cnt > 0 66 67ASM_START() 68 TEXT 69 ALIGN(8) 70PROLOGUE(mpn_mod_1_1p) 71 push %ebp 72 push %edi 73 push %esi 74 push %ebx 75 mov 32(%esp), %ebp C pre[] 76 77 mov 12(%ebp), %eax C B2modb 78 push %eax C Put it on stack 79 80 mov n, %edx 81 mov 24(%esp), ap 82 83 lea (ap, %edx, 4), ap 84 mov -4(ap), %eax 85 cmp $3, %edx 86 jnc L(first) 87 mov -8(ap), r0 88 jmp L(reduce_two) 89 90L(first): 91 C First iteration, no r2 92 mull B2modb 93 mov -12(ap), r0 94 add %eax, r0 95 mov -8(ap), %eax 96 adc %edx, %eax 97 sbb r2, r2 98 sub $3, n 99 lea -16(ap), ap 100 jz L(reduce_three) 101 102 mov B2modb, B2mb 103 sub b, B2mb 104 lea (B2mb, r0), t0 105 jmp L(mid) 106 107 ALIGN(16) 108L(top): C Loopmixed to 7 c/l on k7 109 add %eax, r0 110 lea (B2mb, r0), t0 111 mov r2, %eax 112 adc %edx, %eax 113 sbb r2, r2 114L(mid): mull B2modb 115 and B2modb, r2 116 add r0, r2 117 decl n 118 mov (ap), r0 119 cmovc( t0, r2) 120 lea -4(ap), ap 121 jnz L(top) 122 123 add %eax, r0 124 mov r2, %eax 125 adc %edx, %eax 126 sbb r2, r2 127 128L(reduce_three): 129 C Eliminate r2 130 and b, r2 131 sub r2, %eax 132 133L(reduce_two): 134 mov pre, %ebp 135 movb 4(%ebp), %cl 136 test %cl, %cl 137 jz L(normalized) 138 139 C Unnormalized, use B1modb to reduce to size < B b 140 mull 8(%ebp) 141 xor t0, t0 142 add %eax, r0 143 adc %edx, t0 144 mov t0, %eax 145 146 C Left-shift to 
normalize 147 shld %cl, r0, %eax C Always use shld? 148 149 shl %cl, r0 150 jmp L(udiv) 151 152L(normalized): 153 mov %eax, t0 154 sub b, t0 155 cmovnc( t0, %eax) 156 157L(udiv): 158 lea 1(%eax), t0 159 mull (%ebp) 160 mov b, %ebx C Needed in register for lea 161 add r0, %eax 162 adc t0, %edx 163 imul %ebx, %edx 164 sub %edx, r0 165 cmp r0, %eax 166 lea (%ebx, r0), %eax 167 cmovnc( r0, %eax) 168 cmp %ebx, %eax 169 jnc L(fix) 170L(ok): shr %cl, %eax 171 172 add $4, %esp 173 pop %ebx 174 pop %esi 175 pop %edi 176 pop %ebp 177 178 ret 179L(fix): sub %ebx, %eax 180 jmp L(ok) 181EPILOGUE() 182 183PROLOGUE(mpn_mod_1_1p_cps) 184 push %ebp 185 mov 12(%esp), %ebp 186 push %esi 187 bsr %ebp, %ecx 188 push %ebx 189 xor $31, %ecx 190 mov 16(%esp), %esi 191 sal %cl, %ebp 192 mov %ebp, %edx 193 not %edx 194 mov $-1, %eax 195 div %ebp C On K7, invert_limb would be a few cycles faster. 196 mov %eax, (%esi) C store bi 197 mov %ecx, 4(%esi) C store cnt 198 neg %ebp 199 mov $1, %edx 200 shld %cl, %eax, %edx 201 imul %ebp, %edx 202 shr %cl, %edx 203 imul %ebp, %eax 204 mov %edx, 8(%esi) C store B1modb 205 mov %eax, 12(%esi) C store B2modb 206 pop %ebx 207 pop %esi 208 pop %ebp 209 ret 210EPILOGUE() 211