dnl  AMD64 mpn_mod_1s_2p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                    cycles/limb
C AMD K8,K9        4
C AMD K10          4
C Intel P4        19
C Intel core2      8
C Intel NHM        6.5
C Intel SBR        4.5
C Intel atom      28
C VIA nano         8

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_mod_1s_2p)
        FUNC_ENTRY(4)
        push    %r14
        test    $1, R8(%rsi)
        mov     %rdx, %r14
        push    %r13
        mov     %rcx, %r13
        push    %r12
        push    %rbp
        push    %rbx
        mov     16(%rcx), %r10
        mov     24(%rcx), %rbx
        mov     32(%rcx), %rbp
        je      L(b0)
        dec     %rsi
        je      L(one)
        mov     -8(%rdi,%rsi,8), %rax
        mul     %r10
        mov     %rax, %r9
        mov     %rdx, %r8
        mov     (%rdi,%rsi,8), %rax
        add     -16(%rdi,%rsi,8), %r9
        adc     $0, %r8
        mul     %rbx
        add     %rax, %r9
        adc     %rdx, %r8
        jmp     L(11)

L(b0):  mov     -8(%rdi,%rsi,8), %r8
        mov     -16(%rdi,%rsi,8), %r9

L(11):  sub     $4, %rsi
        jb      L(ed2)
        lea     40(%rdi,%rsi,8), %rdi
        mov     -40(%rdi), %r11
        mov     -32(%rdi), %rax
        jmp     L(m0)

        ALIGN(16)
L(top): mov     -24(%rdi), %r9
        add     %rax, %r11
        mov     -16(%rdi), %rax
        adc     %rdx, %r12
        mul     %r10
        add     %rax, %r9
        mov     %r11, %rax
        mov     %rdx, %r8
        adc     $0, %r8
        mul     %rbx
        add     %rax, %r9
        mov     %r12, %rax
        adc     %rdx, %r8
        mul     %rbp
        sub     $2, %rsi
        jb      L(ed1)
        mov     -40(%rdi), %r11
        add     %rax, %r9
        mov     -32(%rdi), %rax
        adc     %rdx, %r8
L(m0):  mul     %r10
        add     %rax, %r11
        mov     %r9, %rax
        mov     %rdx, %r12
        adc     $0, %r12
        mul     %rbx
        add     %rax, %r11
        lea     -32(%rdi), %rdi         C ap -= 4
        mov     %r8, %rax
        adc     %rdx, %r12
        mul     %rbp
        sub     $2, %rsi
        jae     L(top)

L(ed0): mov     %r11, %r9
        mov     %r12, %r8
L(ed1): add     %rax, %r9
        adc     %rdx, %r8
L(ed2): mov     8(%r13), R32(%rdi)      C cnt
        mov     %r8, %rax
        mov     %r9, %r8
        mul     %r10
        add     %rax, %r8
        adc     $0, %rdx
L(1):   xor     R32(%rcx), R32(%rcx)
        mov     %r8, %r9
        sub     R32(%rdi), R32(%rcx)
        shr     R8(%rcx), %r9
        mov     R32(%rdi), R32(%rcx)
        sal     R8(%rcx), %rdx
        or      %rdx, %r9
        sal     R8(%rcx), %r8
        mov     %r9, %rax
        mulq    (%r13)
        mov     %rax, %rsi
        inc     %r9
        add     %r8, %rsi
        adc     %r9, %rdx
        imul    %r14, %rdx
        sub     %rdx, %r8
        lea     (%r8,%r14), %rax
        cmp     %r8, %rsi
        cmovc   %rax, %r8
        mov     %r8, %rax
        sub     %r14, %rax
        cmovc   %r8, %rax
        mov     R32(%rdi), R32(%rcx)
        shr     R8(%rcx), %rax
        pop     %rbx
        pop     %rbp
        pop     %r12
        pop     %r13
        pop     %r14
        FUNC_EXIT()
        ret
L(one):
        mov     (%rdi), %r8
        mov     8(%rcx), R32(%rdi)
        xor     %rdx, %rdx
        jmp     L(1)
EPILOGUE()

        ALIGN(16)
PROLOGUE(mpn_mod_1s_2p_cps)
        FUNC_ENTRY(2)
        push    %rbp
        bsr     %rsi, %rcx
        push    %rbx
        mov     %rdi, %rbx
        push    %r12
        xor     $63, R32(%rcx)
        mov     %rsi, %r12
        mov     R32(%rcx), R32(%rbp)    C preserve cnt over call
        sal     R8(%rcx), %r12          C b << cnt
IFSTD(` mov     %r12, %rdi      ')      C pass parameter
IFDOS(` mov     %r12, %rcx      ')      C pass parameter
IFDOS(` sub     $32, %rsp       ')
        ASSERT(nz, `test $15, %rsp')
        CALL(   mpn_invert_limb)
IFDOS(` add     $32, %rsp       ')
        mov     %r12, %r8
        mov     %rax, %r11
        mov     %rax, (%rbx)            C store bi
        mov     %rbp, 8(%rbx)           C store cnt
        neg     %r8
        mov     R32(%rbp), R32(%rcx)
        mov     $1, R32(%rsi)
ifdef(`SHLD_SLOW',`
        shl     R8(%rcx), %rsi
        neg     R32(%rcx)
        mov     %rax, %rbp
        shr     R8(%rcx), %rax
        or      %rax, %rsi
        mov     %rbp, %rax
        neg     R32(%rcx)
',`
        shld    R8(%rcx), %rax, %rsi    C FIXME: Slow on Atom and Nano
')
        imul    %r8, %rsi
        mul     %rsi

        add     %rsi, %rdx
        shr     R8(%rcx), %rsi
        mov     %rsi, 16(%rbx)          C store B1modb

        not     %rdx
        imul    %r12, %rdx
        lea     (%rdx,%r12), %rsi
        cmp     %rdx, %rax
        cmovnc  %rdx, %rsi
        mov     %r11, %rax
        mul     %rsi

        add     %rsi, %rdx
        shr     R8(%rcx), %rsi
        mov     %rsi, 24(%rbx)          C store B2modb

        not     %rdx
        imul    %r12, %rdx
        add     %rdx, %r12
        cmp     %rdx, %rax
        cmovnc  %rdx, %r12

        shr     R8(%rcx), %r12
        mov     %r12, 32(%rbx)          C store B3modb

        pop     %r12
        pop     %rbx
        pop     %rbp
        FUNC_EXIT()
        ret
EPILOGUE()
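
dnl  Reference note (editorial, not part of the original comments): the two
dnl  entry points above follow GMP's mod_1s_2p scheme.  mpn_mod_1s_2p_cps
dnl  precomputes cnt = count_leading_zeros(b), bi = invert_limb(b << cnt), and
dnl  the constants named B1modb, B2modb, B3modb (B mod b, B^2 mod b, B^3 mod b
dnl  with B = 2^64, stored at offsets 16, 24, 32); mpn_mod_1s_2p then folds the
dnl  input two limbs per step (the asm loop is unrolled to four limbs per pass)
dnl  and finishes with a shift-and-multiply-by-inverse reduction.  The C below
dnl  is a minimal, hedged sketch of that idea, not the GMP generic code: it
dnl  assumes a 64-bit limb, GCC/Clang __int128, and b < 2^62 so the 128-bit
dnl  accumulations cannot overflow, and it uses a plain remainder for the final
dnl  step instead of the preinverted division the assembly performs.
dnl
dnl      #include <stdint.h>
dnl      #include <stddef.h>
dnl
dnl      typedef unsigned __int128 u128;
dnl
dnl      /* Sketch of the precomputation: powers of 2^64 mod b.  */
dnl      static void
dnl      mod_1s_2p_cps_sketch (uint64_t cps[3], uint64_t b)
dnl      {
dnl        cps[0] = (uint64_t) (((u128) 1 << 64) % b);       /* B   mod b */
dnl        cps[1] = (uint64_t) ((u128) cps[0] * cps[0] % b);  /* B^2 mod b */
dnl        cps[2] = (uint64_t) ((u128) cps[1] * cps[0] % b);  /* B^3 mod b */
dnl      }
dnl
dnl      /* Sketch of the reduction: ap[0..n-1] mod b, n >= 1, b < 2^62.  */
dnl      static uint64_t
dnl      mod_1s_2p_sketch (const uint64_t *ap, size_t n, uint64_t b,
dnl                        const uint64_t cps[3])
dnl      {
dnl        uint64_t B1 = cps[0], B2 = cps[1], B3 = cps[2], rh, rl;
dnl        size_t i;
dnl
dnl        if (n & 1)
dnl          {
dnl            if (n == 1)
dnl              return ap[0] % b;
dnl            /* Fold the odd top limb so an even count remains.  */
dnl            u128 r = (u128) ap[n - 2] * B1 + ap[n - 3]
dnl                   + (u128) ap[n - 1] * B2;
dnl            rh = (uint64_t) (r >> 64);
dnl            rl = (uint64_t) r;
dnl            n--;
dnl          }
dnl        else
dnl          {
dnl            rh = ap[n - 1];
dnl            rl = ap[n - 2];
dnl          }
dnl
dnl        /* rh:rl carries the residue of the limbs consumed so far;
dnl           absorb two further limbs per iteration.  */
dnl        for (i = n - 2; i >= 2; i -= 2)
dnl          {
dnl            u128 r = (u128) ap[i - 1] * B1 + ap[i - 2]
dnl                   + (u128) rl * B2 + (u128) rh * B3;
dnl            rh = (uint64_t) (r >> 64);
dnl            rl = (uint64_t) r;
dnl          }
dnl
dnl        /* rh*2^64 + rl is congruent to rh*B1 + rl mod b; reduce that.  */
dnl        return (uint64_t) (((u128) rh * B1 + rl) % b);
dnl      }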