dnl  x86-32 mpn_mod_1s_4p, requiring cmov.

dnl  Contributed to the GNU project by Torbjorn Granlund.
dnl
dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                              cycles/limb
C P5                                ?
C P6 model 0-8,10-12                ?
C P6 model 9  (Banias)              ?
C P6 model 13 (Dothan)              6
C P4 model 0  (Willamette)          ?
C P4 model 1  (?)                   ?
C P4 model 2  (Northwood)          15.5
C P4 model 3  (Prescott)            ?
C P4 model 4  (Nocona)              ?
C AMD K6                            ?
C AMD K7                            4.75
C AMD K8                            ?
ASM_START()
	TEXT
	ALIGN(16)

C mpn_mod_1s_4p -- reduce the n-limb operand at up[] to a single limb,
C folding four limbs per loop iteration with the constants found in cps[].
C
C Stack arguments, at their offsets once the four pushes and the 28-byte
C frame below are in place:
C   48(%esp)  up   pointer to the least significant limb
C   52(%esp)  n    limb count, decremented in place as the loop counter
C   56(%esp)       third argument, used as the divisor in the final reduction
C                  NOTE(review): presumably already shifted left by cnt,
C                  since the result is shifted right by cnt before
C                  returning -- confirm against the callers.
C   60(%esp)  cps  seven-limb table in the layout stored by
C                  mpn_mod_1s_4p_cps below:
C                  bi, cnt, B1modb, B2modb, B3modb, B4modb, B5modb
C
C Frame: 4(%esp)..20(%esp) hold local copies of B1modb..B5modb.
C
C Register roles in the loop:
C   %esi       pointer into up[], stepping downwards
C   %edi:%ebp  running two-limb residue, high:low
C   %ecx:%ebx  two-limb accumulator for the next residue
C
C Result is returned in %eax.  %ebx, %esi, %edi and %ebp are saved and
C restored; %ecx and %edx are overwritten.
C
C NOTE(review): the n mod 4 == 0 entry reads four limbs unconditionally, so
C n >= 1 is presumably a precondition -- confirm.
C NOTE(review): the final reduction forms low >> (32 - cnt) with a negated
C shift count, which is only the intended value for cnt != 0; presumably
C callers guarantee that -- confirm.

PROLOGUE(mpn_mod_1s_4p)
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	sub	$28, %esp
	mov	60(%esp), %edi		C cps[]
	mov	8(%edi), %eax		C B1modb
	mov	12(%edi), %edx		C B2modb
	mov	16(%edi), %ecx		C B3modb
	mov	20(%edi), %esi		C B4modb
	mov	24(%edi), %edi		C B5modb
	mov	%eax, 4(%esp)
	mov	%edx, 8(%esp)
	mov	%ecx, 12(%esp)
	mov	%esi, 16(%esp)
	mov	%edi, 20(%esp)
	mov	52(%esp), %eax		C n
	xor	%edi, %edi		C high residue limb starts at zero
	mov	48(%esp), %esi		C up
	lea	-12(%esi,%eax,4), %esi	C point at up[n-3]
	and	$3, %eax		C dispatch on n mod 4
	je	L(b0)			C n mod 4 == 0
	cmp	$2, %eax
	jc	L(b1)			C n mod 4 == 1
	je	L(b2)			C n mod 4 == 2, flags still from the cmp

C n mod 4 == 3: residue = up[n-3] + up[n-2]*B1modb; the product
C up[n-1]*B2modb is left in %edx:%eax and added at L(m0).
L(b3):	mov	4(%esi), %eax
	mull	4(%esp)
	mov	(%esi), %ebp
	add	%eax, %ebp
	adc	%edx, %edi
	mov	8(%esi), %eax
	mull	8(%esp)
	lea	-12(%esi), %esi
	jmp	L(m0)

C n mod 4 == 0: residue = up[n-4] + up[n-3]*B1modb + up[n-2]*B2modb; the
C product up[n-1]*B3modb is left in %edx:%eax and added at L(m0).
L(b0):	mov	(%esi), %eax
	mull	4(%esp)
	mov	-4(%esi), %ebp
	add	%eax, %ebp
	adc	%edx, %edi
	mov	4(%esi), %eax
	mull	8(%esp)
	add	%eax, %ebp
	adc	%edx, %edi
	mov	8(%esi), %eax
	mull	12(%esp)
	lea	-16(%esi), %esi
	jmp	L(m0)

C n mod 4 == 1: residue = up[n-1], high limb stays zero.
L(b1):	mov	8(%esi), %ebp
	lea	-4(%esi), %esi
	jmp	L(m1)

C n mod 4 == 2: residue = up[n-1]:up[n-2].
L(b2):	mov	8(%esi), %edi
	mov	4(%esi), %ebp
	lea	-8(%esi), %esi
	jmp	L(m1)

C Main loop.  With -4(%esi) the lowest of the next four limbs, form
C   -4(%esi) + (%esi)*B1modb + 4(%esi)*B2modb + 8(%esi)*B3modb
C            + low residue * B4modb + high residue * B5modb
C as the new two-limb residue.  The last product is added at L(m0).
	ALIGN(16)
L(top):	mov	(%esi), %eax
	mull	4(%esp)
	mov	-4(%esi), %ebx
	xor	%ecx, %ecx
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	4(%esi), %eax
	mull	8(%esp)
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	8(%esi), %eax
	mull	12(%esp)
	add	%eax, %ebx
	adc	%edx, %ecx
	lea	-16(%esi), %esi		C step down four limbs
	mov	16(%esp), %eax
	mul	%ebp			C low residue limb * B4modb
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	20(%esp), %eax
	mul	%edi			C high residue limb * B5modb
	mov	%ebx, %ebp
	mov	%ecx, %edi
L(m0):	add	%eax, %ebp		C add the pending product to the residue
	adc	%edx, %edi
L(m1):	sub	$4, 52(%esp)		C n -= 4
	ja	L(top)			C loop while nonzero and no borrow

C Final reduction of the two-limb residue to one limb.
L(end):	mov	4(%esp), %eax
	mul	%edi			C high residue limb * B1modb
	mov	60(%esp), %edi		C %edi = cps
	add	%eax, %ebp
	adc	$0, %edx		C %edx:%ebp = folded two-limb value
	mov	4(%edi), %ecx		C cnt
	mov	%edx, %esi
	mov	%ebp, %eax
	sal	%cl, %esi		C high << cnt
	mov	%ecx, %ebx		C save cnt
	neg	%ecx
	shr	%cl, %eax		C low >> (32 - cnt), count taken mod 32
	or	%esi, %eax		C high limb of the value shifted by cnt
	lea	1(%eax), %esi		C high limb + 1
	mull	(%edi)			C %edx:%eax = high limb * bi
	mov	%ebx, %ecx		C restore cnt
	mov	%eax, %ebx		C low limb of that product
	mov	%ebp, %eax
	mov	56(%esp), %ebp		C third argument, the divisor
	sal	%cl, %eax		C low limb of the value shifted by cnt
	add	%eax, %ebx
	adc	%esi, %edx		C quotient estimate
	imul	%ebp, %edx
	sub	%edx, %eax		C candidate remainder
	lea	(%eax,%ebp), %edx	C candidate + divisor
	cmp	%eax, %ebx
	cmovc(	%edx, %eax)		C add the divisor back when %ebx < candidate
	mov	%eax, %edx
	sub	%ebp, %eax
	cmovc(	%edx, %eax)		C undo the subtraction if it borrowed
	add	$28, %esp
	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	shr	%cl, %eax		C shift the remainder right by cnt
	ret
EPILOGUE()

C mpn_mod_1s_4p_cps -- fill the seven-limb table read by mpn_mod_1s_4p.
C
C Stack arguments, at their offsets after the four pushes:
C   20(%esp)  pointer to the output table
C   24(%esp)  b, the divisor
C             NOTE(review): bsr leaves its destination undefined for a zero
C             source, so b is presumably required to be nonzero -- confirm.
C
C Table layout written here, by byte offset, with d = b << cnt:
C    0  bi      quotient of the two-limb value (~d):0xffffffff divided by d
C    4  cnt     31 minus the index of the highest set bit of b
C    8  B1modb
C   12  B2modb
C   16  B3modb
C   20  B4modb
C   24  B5modb
C Each of B1modb..B5modb is stored after a right shift by cnt.
C NOTE(review): the names suggest B^k mod b with B = 2^32 -- confirm.
C
C %ebx, %esi, %edi and %ebp are saved and restored; %eax, %ecx and %edx are
C overwritten.

	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
	mov	24(%esp), %ebx		C b
	bsr	%ebx, %ecx		C index of the highest set bit of b
	xor	$31, %ecx		C cnt = 31 - index
	sal	%cl, %ebx		C b << cnt
	mov	%ebx, %edx
	not	%edx			C dividend high limb = ~(b << cnt)
	mov	$-1, %eax		C dividend low limb = all ones
	div	%ebx			C %eax = bi
	xor	%edi, %edi
	sub	%ebx, %edi		C %edi = -(b << cnt)
	mov	$1, %esi
	mov	%eax, (%ebp)		C store bi
	mov	%ecx, 4(%ebp)		C store cnt
	shld	%cl, %eax, %esi		C %esi = (1 << cnt) | top cnt bits of bi
	imul	%edi, %esi		C times -(b << cnt), low 32 bits
	mov	%eax, %edi		C keep bi in %edi for the steps below
	mul	%esi			C %edx:%eax = bi * current value

C Each step below stores the current value shifted right by cnt, then derives
C the next value from the bi product: estimate, candidate, conditional add.
	add	%esi, %edx		C estimate = high(bi * value) + value
	shr	%cl, %esi
	mov	%esi, 8(%ebp)		C store B1modb

	not	%edx
	imul	%ebx, %edx		C candidate = ~estimate * d, low 32 bits
	lea	(%edx,%ebx), %esi	C candidate + d
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)		C plain candidate unless low(product) < it
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 12(%ebp)		C store B2modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 16(%ebp)		C store B3modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 20(%ebp)		C store B4modb

	not	%edx
	imul	%ebx, %edx
	add	%edx, %ebx		C last step: d is no longer needed, reuse %ebx
	cmp	%edx, %eax
	cmovnc(	%edx, %ebx)

	shr	%cl, %ebx
	mov	%ebx, 24(%ebp)		C store B5modb

	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	ret
EPILOGUE()