dnl  x86-32 mpn_mod_1s_4p, requiring cmov.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                            cycles/limb
C P5:
C P6 model 0-8,10-12:
C P6 model 9  (Banias):
C P6 model 13 (Dothan):       6.0
C P4 model 0  (Willamette):
C P4 model 1  (?):
C P4 model 2  (Northwood):   15.5
C P4 model 3  (Prescott):
C P4 model 4  (Nocona):
C K6:
C K7:                         4.75
C K8:

C The inner loop was manually written; it ought to be loopmixed.
C Presumably, we could get to 4 c/l for K7.

C The cps function was compiler generated.  It can clearly be optimized.
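
C Algorithm outline (reference sketch, not GMP source): the function
C computes {up,n} mod b using a table of B^k mod b, k = 1..5 (B = 2^32),
C precomputed by mpn_mod_1s_4p_cps below, so that each loop iteration
C folds four limbs into a two-limb redundant sum with independent
C multiplies.  The C model below only illustrates the idea: the function
C names are made up, it assumes b < 2^29 so the plain 64-bit accumulator
C cannot overflow, and it uses the '%' operator where the asm uses the
C normalization and inverse tricks sketched at the end of this file.
C
C	#include <stdint.h>
C
C	/* pow[k] = B^(k+1) mod b, roughly what the cps routine
C	   provides (modulo shifting details).  */
C	static void
C	mod_1s_4p_cps_sketch (uint32_t pow[5], uint32_t b)
C	{
C	  uint64_t p = (1ULL << 32) % b;	/* B mod b */
C	  for (int k = 0; k < 5; k++)
C	    {
C	      pow[k] = (uint32_t) p;
C	      p = (p << 32) % b;		/* B^(k+2) mod b */
C	    }
C	}
C
C	static uint32_t
C	mod_1s_4p_sketch (const uint32_t *up, long n, uint32_t b,
C	                  const uint32_t pow[5])
C	{
C	  uint64_t acc = 0;			/* 2-limb redundant sum */
C	  long i = n;				/* n >= 1, mpn convention */
C
C	  /* Entry code: fold the top n mod 4 limbs (cf. L(b1)..L(b3)).  */
C	  while (i & 3)
C	    acc = ((acc << 32) | up[--i]) % b;
C
C	  /* Main loop: four limbs per iteration (cf. L(top)).  */
C	  while (i > 0)
C	    {
C	      uint32_t lo = (uint32_t) acc, hi = (uint32_t) (acc >> 32);
C	      i -= 4;
C	      acc  = up[i];
C	      acc += (uint64_t) up[i + 1] * pow[0];	/* B   mod b */
C	      acc += (uint64_t) up[i + 2] * pow[1];	/* B^2 mod b */
C	      acc += (uint64_t) up[i + 3] * pow[2];	/* B^3 mod b */
C	      acc += (uint64_t) lo * pow[3];		/* B^4 mod b */
C	      acc += (uint64_t) hi * pow[4];		/* B^5 mod b */
C	    }
C	  return (uint32_t) (acc % b);			/* cf. L(end) */
C	}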

ASM_START()
	TEXT

	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	sub	$28, %esp
	mov	60(%esp), %edi		C cps
	mov	8(%edi), %eax		C B1modb
	mov	12(%edi), %edx		C B2modb
	mov	16(%edi), %ecx		C B3modb
	mov	20(%edi), %esi		C B4modb
	mov	24(%edi), %edi		C B5modb
	mov	%eax, 4(%esp)
	mov	%edx, 8(%esp)
	mov	%ecx, 12(%esp)
	mov	%esi, 16(%esp)
	mov	%edi, 20(%esp)
	mov	52(%esp), %eax		C n
	xor	%edi, %edi
	mov	48(%esp), %esi		C up
	lea	-12(%esi,%eax,4), %esi	C point at up[n-3]
	and	$3, %eax		C n mod 4 selects the entry point
	je	L(b0)
	cmp	$2, %eax
	jc	L(b1)
	je	L(b2)

L(b3):	mov	4(%esi), %eax
	mull	4(%esp)
	mov	(%esi), %ebp
	add	%eax, %ebp
	adc	%edx, %edi
	mov	8(%esi), %eax
	mull	8(%esp)
	lea	-12(%esi), %esi
	jmp	L(m0)

L(b0):	mov	(%esi), %eax
	mull	4(%esp)
	mov	-4(%esi), %ebp
	add	%eax, %ebp
	adc	%edx, %edi
	mov	4(%esi), %eax
	mull	8(%esp)
	add	%eax, %ebp
	adc	%edx, %edi
	mov	8(%esi), %eax
	mull	12(%esp)
	lea	-16(%esi), %esi
	jmp	L(m0)

L(b1):	mov	8(%esi), %ebp
	lea	-4(%esi), %esi
	jmp	L(m1)

L(b2):	mov	8(%esi), %eax
	mull	4(%esp)
	mov	4(%esi), %ebp
	lea	-8(%esi), %esi
	jmp	L(m0)

C The main loop folds four limbs per iteration into the two-limb
C redundant sum kept in ebp (low) and edi (high).
	ALIGN(16)
L(top):	mov	(%esi), %eax
	mull	4(%esp)			C up[i+1] * B1modb
	mov	-4(%esi), %ebx		C up[i]
	xor	%ecx, %ecx
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	4(%esi), %eax
	mull	8(%esp)			C up[i+2] * B2modb
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	8(%esi), %eax
	mull	12(%esp)		C up[i+3] * B3modb
	add	%eax, %ebx
	adc	%edx, %ecx
	lea	-16(%esi), %esi
	mov	16(%esp), %eax
	mul	%ebp			C old low limb * B4modb
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	20(%esp), %eax
	mul	%edi			C old high limb * B5modb
	mov	%ebx, %ebp
	mov	%ecx, %edi
L(m0):	add	%eax, %ebp
	adc	%edx, %edi
L(m1):	sub	$4, 52(%esp)
	ja	L(top)

C Fold the high limb, then reduce using the precomputed inverse (see the
C reduction sketch at the end of this file).
L(end):	mov	4(%esp), %eax
	mul	%edi			C high limb * B1modb
	mov	60(%esp), %edi
	add	%eax, %ebp
	adc	$0, %edx
	mov	4(%edi), %ecx		C cnt
	mov	%edx, %esi
	mov	%ebp, %eax
	sal	%cl, %esi
	mov	%ecx, %ebx
	neg	%ecx
	shr	%cl, %eax
	or	%esi, %eax
	lea	1(%eax), %esi
	mull	(%edi)			C multiply by the inverse
	mov	%ebx, %ecx
	mov	%eax, %ebx
	mov	%ebp, %eax
	sal	%cl, %eax
	add	%eax, %ebx
	adc	%esi, %edx
	imul	56(%esp), %edx
	mov	56(%esp), %esi		C b
	sub	%edx, %eax
	lea	(%eax,%esi), %edx
	cmp	%eax, %ebx
	cmovb(	%edx, %eax)		C add back b if the estimate overshot
	mov	%eax, %edx
	sub	%esi, %eax
	cmovb(	%edx, %eax)		C undo if the subtraction borrowed
	add	$28, %esp
	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	shr	%cl, %eax		C undo the normalization shift
	ret
EPILOGUE()

	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
	sub	$56, %esp
	mov	%esi, 44(%esp)
	mov	64(%esp), %esi		C b
	mov	%edi, 48(%esp)
	mov	%ebx, 40(%esp)
	mov	$-1, %ebx
	mov	%ebp, 52(%esp)
	bsr	%esi, %eax
	xor	$31, %eax		C eax = count_leading_zeros(b)
	mov	%eax, %ecx
	mov	%eax, 24(%esp)		C cnt
	mov	%ebx, %eax
	sal	%cl, %esi		C b <<= cnt (normalize)
	mov	%esi, %ecx
	mov	%esi, %edi
	mov	%esi, %ebp
	neg	%ecx
	not	%edi
	mov	%ecx, 20(%esp)
	mov	$32, %ecx
	sub	24(%esp), %ecx
	mov	%edi, %edx
	mov	%edi, 16(%esp)
	mov	20(%esp), %edi
	div	%esi			C eax = floor((B^2-1)/b) - B
	mov	%eax, %ebx		C the inverse
	shr	%cl, %eax
	movzbl	24(%esp), %ecx
	mov	%eax, 12(%esp)
	mov	$1, %eax
	sal	%cl, %eax
	or	%eax, 12(%esp)
	imul	12(%esp), %edi
	mov	%edi, %eax
	mov	%edi, 20(%esp)		C B1modb
	mul	%ebx
	mov	%eax, %ecx
	lea	1(%edx,%edi), %eax
	neg	%eax
	imul	%eax, %ebp
	lea	(%ebp,%esi), %eax
	cmp	%ebp, %ecx
	cmovb(	%eax, %ebp)		C ebp = B2modb
	mov	%ebp, %eax
	mul	%ebx
	lea	1(%ebp,%edx), %edi
	mov	%eax, %ecx
	neg	%edi
	mov	%edi, 8(%esp)
	imul	%esi, %edi
	mov	%edi, %eax
	add	%esi, %eax
	cmp	%edi, %ecx
	cmovae(	%edi, %eax)
	mov	%eax, 32(%esp)		C B3modb
	mov	32(%esp), %edi
	mul	%ebx
	mov	%eax, 36(%esp)
	lea	1(%edi,%edx), %eax
	negl	%eax
	imul	%esi, %eax
	mov	%eax, %ecx
	add	%esi, %ecx
	cmp	%eax, 36(%esp)
	cmovae(	%eax, %ecx)
	mov	%ecx, (%esp)		C B4modb
	mov	%ecx, %eax
	mul	%ebx
	mov	%eax, %edi
	mov	(%esp), %eax
	lea	1(%eax,%edx), %ecx
	mov	60(%esp), %edx		C cps
	neg	%ecx
	imul	%esi, %ecx
	mov	%ebx, (%edx)		C cps[0] = inverse
	add	%ecx, %esi
	cmp	%ecx, %edi
	cmovae(	%ecx, %esi)		C esi = B5modb
	mov	24(%esp), %ecx
	shrl	%cl, 20(%esp)
	mov	20(%esp), %edi
	mov	%esi, 4(%esp)
	mov	%ecx, 4(%edx)		C cps[1] = cnt
	movzbl	24(%esp), %ecx
	mov	%edi, 8(%edx)		C cps[2] = B1modb >> cnt
	shr	%cl, %ebp
	shr	%cl, %eax
	mov	%ebp, 12(%edx)		C cps[3] = B2modb >> cnt
	shrl	%cl, 32(%esp)
	mov	32(%esp), %edi
	shrl	%cl, 4(%esp)
	mov	%eax, 20(%edx)		C cps[5] = B4modb >> cnt
	mov	%edi, 16(%edx)		C cps[4] = B3modb >> cnt
	mov	4(%esp), %edi
	mov	%edi, 24(%edx)		C cps[6] = B5modb >> cnt
	mov	40(%esp), %ebx
	mov	44(%esp), %esi
	mov	48(%esp), %edi
	mov	52(%esp), %ebp
	add	$56, %esp
	ret
EPILOGUE()
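
C Final reduction (reference sketch, not GMP source): the cmov pairs in
C both routines above are the fixup steps of the usual remainder
C computation by an invariant integer (cf. udiv_qrnnd_preinv in
C gmp-impl.h).  Roughly, in C, assuming a normalized divisor d (high
C bit set), the precomputed inverse bi = floor((B^2-1)/d) - B that cps
C stores in cps[0], and a 2-limb input nh:nl with nh < d; the function
C name is made up:
C
C	#include <stdint.h>
C
C	static uint32_t
C	rem_2by1_preinv_sketch (uint32_t nh, uint32_t nl,
C	                        uint32_t d, uint32_t bi)
C	{
C	  /* 2-limb quotient estimate q1:q0 = nh*bi + (nh+1)*B + nl;
C	     any wraparound is harmless, since the remainder fixups
C	     below work mod B anyway.  */
C	  uint64_t q = (uint64_t) nh * bi;
C	  q += ((uint64_t) (nh + 1) << 32) + nl;
C	  uint32_t q1 = (uint32_t) (q >> 32);
C	  uint32_t q0 = (uint32_t) q;
C	  uint32_t r = nl - q1 * d;	C candidate remainder, mod B
C	  if (r > q0)			C first cmovb: estimate one too big
C	    r += d;
C	  if (r >= d)			C second cmovb: one subtraction left
C	    r -= d;
C	  return r;
C	}
C
C The mpn_mod_1s_4p epilogue applies this with the folded sum shifted
C left by cnt, then shifts the remainder back down (the final shr
C before ret); mpn_mod_1s_4p_cps uses the same correction pattern while
C computing the successive B^k mod b table entries.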