dnl  AMD64 mpn_mod_1s_4p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009, 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 3
C AMD K10	 3
C Intel P4	15.5
C Intel core2	 5
C Intel corei	 4
C Intel atom	23
C VIA nano	 4.75

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	FUNC_ENTRY(4)
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	%rdx, %r15
	mov	%rcx, %r14
	mov	16(%rcx), %r11		C B1modb
	mov	24(%rcx), %rbx		C B2modb
	mov	32(%rcx), %rbp		C B3modb
	mov	40(%rcx), %r13		C B4modb
	mov	48(%rcx), %r12		C B5modb
	xor	R32(%r8), R32(%r8)
	mov	R32(%rsi), R32(%rdx)
	and	$3, R32(%rdx)
	je	L(b0)
	cmp	$2, R32(%rdx)
	jc	L(b1)
	je	L(b2)

L(b3):	lea	-24(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	jmp	L(m0)

	ALIGN(8)
L(b0):	lea	-32(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r8
	mov	24(%rdi), %rax
	mul	%rbp
	jmp	L(m0)

	ALIGN(8)
L(b1):	lea	-8(%rdi,%rsi,8), %rdi
	mov	(%rdi), %r9
	jmp	L(m1)

	ALIGN(8)
L(b2):	lea	-16(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %r8
	mov	(%rdi), %r9
	jmp	L(m1)

	ALIGN(16)
L(top):	mov	-24(%rdi), %rax
	mov	-32(%rdi), %r10
	mul	%r11			C up[1] * B1modb
	add	%rax, %r10
	mov	-16(%rdi), %rax
	mov	$0, R32(%rcx)
	adc	%rdx, %rcx
	mul	%rbx			C up[2] * B2modb
	add	%rax, %r10
	mov	-8(%rdi), %rax
	adc	%rdx, %rcx
	sub	$32, %rdi
	mul	%rbp			C up[3] * B3modb
	add	%rax, %r10
	mov	%r13, %rax
	adc	%rdx, %rcx
	mul	%r9			C rl * B4modb
	add	%rax, %r10
	mov	%r12, %rax
	adc	%rdx, %rcx
	mul	%r8			C rh * B5modb
	mov	%r10, %r9
	mov	%rcx, %r8
L(m0):	add	%rax, %r9
	adc	%rdx, %r8
L(m1):	sub	$4, %rsi
	ja	L(top)

L(end):	mov	8(%r14), R32(%rsi)
	mov	%r8, %rax
	mul	%r11
	mov	%rax, %r8
	add	%r9, %r8
	adc	$0, %rdx
	xor	R32(%rcx), R32(%rcx)
	sub	R32(%rsi), R32(%rcx)
	mov	%r8, %rdi
	shr	R8(%rcx), %rdi
	mov	R32(%rsi), R32(%rcx)
	sal	R8(%rcx), %rdx
	or	%rdx, %rdi
	mov	%rdi, %rax
	mulq	(%r14)
	mov	%r15, %rbx
	mov	%rax, %r9
	sal	R8(%rcx), %r8
	inc	%rdi
	add	%r8, %r9
	adc	%rdi, %rdx
	imul	%rbx, %rdx
	sub	%rdx, %r8
	lea	(%r8,%rbx), %rax
	cmp	%r8, %r9
	cmovc	%rax, %r8
	mov	%r8, %rax
	sub	%rbx, %rax
	cmovc	%r8, %rax
	shr	R8(%rcx), %rax
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT()
	ret
EPILOGUE()

	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
	sal	R8(%rcx), %r12		C b << cnt
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
	CALL(	mpn_invert_limb)
	mov	%r12, %r8
	mov	%rax, %r11
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	neg	%r8
	mov	R32(%rbp), R32(%rcx)
	mov	$1, R32(%rsi)
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rsi
	neg	R32(%rcx)
	mov	%rax, %rbp
	shr	R8(%rcx), %rax
	or	%rax, %rsi
	mov	%rbp, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
	imul	%r8, %rsi
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 16(%rbx)		C store B1modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 24(%rbx)		C store B2modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 32(%rbx)		C store B3modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 40(%rbx)		C store B4modb

	not	%rdx
	imul	%r12, %rdx
	add	%rdx, %r12
	cmp	%rdx, %rax
	cmovnc	%rdx, %r12

	shr	R8(%rcx), %r12
	mov	%r12, 48(%rbx)		C store B5modb

	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()
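
C The following block is an editorial reference note, not part of the
C original file: a C-style sketch (illustrative only) of what the L(top)
C loop of mpn_mod_1s_4p computes per iteration.  It assumes 64-bit limbs
C and a hypothetical 128-bit accumulator type acc_t; up, rl, rh and
C B1modb..B5modb follow the comments in the loop above, the latter being
C (conceptually) the residues of B^1..B^5 modulo b that mpn_mod_1s_4p_cps
C stores at offsets 16..48 of the parameter block, after bi (offset 0)
C and cnt (offset 8).
C
C	acc_t acc;                       /* two-limb accumulator <rh,rl> */
C	acc  = (acc_t) up[0];            /* least significant limb of group */
C	acc += (acc_t) up[1] * B1modb;   /* up[1] * B1modb */
C	acc += (acc_t) up[2] * B2modb;   /* up[2] * B2modb */
C	acc += (acc_t) up[3] * B3modb;   /* up[3] * B3modb */
C	acc += (acc_t) rl * B4modb;      /* rl * B4modb */
C	acc += (acc_t) rh * B5modb;      /* rh * B5modb */
C	rl = (mp_limb_t) acc;
C	rh = (mp_limb_t) (acc >> 64);
C
C The code at L(end) then folds <rh,rl> into the final single-limb
C remainder using the precomputed inverse bi and the shift count cnt.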