dnl  AMD64 mpn_mod_1s_2p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009, 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4
C AMD K10	 4
C Intel P4	19
C Intel core2	 8
C Intel NHM	 6.5
C Intel SBR	 4.5
C Intel atom	28
C VIA nano	 8

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1s_2p)
	FUNC_ENTRY(4)
	push	%r14
	test	$1, R8(%rsi)
	mov	%rdx, %r14
	push	%r13
	mov	%rcx, %r13
	push	%r12
	push	%rbp
	push	%rbx
	mov	16(%rcx), %r10
	mov	24(%rcx), %rbx
	mov	32(%rcx), %rbp
	je	L(b0)
	dec	%rsi
	je	L(one)
	mov	-8(%rdi,%rsi,8), %rax
	mul	%r10
	mov	%rax, %r9
	mov	%rdx, %r8
	mov	(%rdi,%rsi,8), %rax
	add	-16(%rdi,%rsi,8), %r9
	adc	$0, %r8
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r8
	jmp	L(11)

L(b0):	mov	-8(%rdi,%rsi,8), %r8
	mov	-16(%rdi,%rsi,8), %r9

L(11):	sub	$4, %rsi
	jb	L(ed2)
	lea	40(%rdi,%rsi,8), %rdi
	mov	-40(%rdi), %r11
	mov	-32(%rdi), %rax
	jmp	L(m0)

	ALIGN(16)
L(top):	mov	-24(%rdi), %r9
	add	%rax, %r11
	mov	-16(%rdi), %rax
	adc	%rdx, %r12
	mul	%r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%rdx, %r8
	adc	$0, %r8
	mul	%rbx
	add	%rax, %r9
	mov	%r12, %rax
	adc	%rdx, %r8
	mul	%rbp
	sub	$2, %rsi
	jb	L(ed1)
	mov	-40(%rdi), %r11
	add	%rax, %r9
	mov	-32(%rdi), %rax
	adc	%rdx, %r8
L(m0):	mul	%r10
	add	%rax, %r11
	mov	%r9, %rax
	mov	%rdx, %r12
	adc	$0, %r12
	mul	%rbx
	add	%rax, %r11
	lea	-32(%rdi), %rdi		C ap -= 4
	mov	%r8, %rax
	adc	%rdx, %r12
	mul	%rbp
	sub	$2, %rsi
	jae	L(top)

L(ed0):	mov	%r11, %r9
	mov	%r12, %r8
L(ed1):	add	%rax, %r9
	adc	%rdx, %r8
L(ed2):	mov	8(%r13), R32(%rdi)	C cnt
	mov	%r8, %rax
	mov	%r9, %r8
	mul	%r10
	add	%rax, %r8
	adc	$0, %rdx
L(1):	xor	R32(%rcx), R32(%rcx)
	mov	%r8, %r9
	sub	R32(%rdi), R32(%rcx)
	shr	R8(%rcx), %r9
	mov	R32(%rdi), R32(%rcx)
	sal	R8(%rcx), %rdx
	or	%rdx, %r9
	sal	R8(%rcx), %r8
	mov	%r9, %rax
	mulq	(%r13)
	mov	%rax, %rsi
	inc	%r9
	add	%r8, %rsi
	adc	%r9, %rdx
	imul	%r14, %rdx
	sub	%rdx, %r8
	lea	(%r8,%r14), %rax
	cmp	%r8, %rsi
	cmovc	%rax, %r8
	mov	%r8, %rax
	sub	%r14, %rax
	cmovc	%r8, %rax
	mov	R32(%rdi), R32(%rcx)
	shr	R8(%rcx), %rax
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	FUNC_EXIT()
	ret
L(one):
	mov	(%rdi), %r8
	mov	8(%rcx), R32(%rdi)
	xor	%rdx, %rdx
	jmp	L(1)
EPILOGUE()

	ALIGN(16)
PROLOGUE(mpn_mod_1s_2p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)
	mov	%rsi, %r12
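
C	At this point %rcx = cnt = count_leading_zeros(b) (from the bsr/xor
C	above) and %r12 = b.  The code below shifts b into normalized
C	position, computes bi = invert_limb(b << cnt) via the CALL, and then
C	derives B^k mod b for k = 1, 2, 3 (computed shifted left by cnt,
C	unshifted just before each store); cf. the generic C version of
C	mpn_mod_1s_2p_cps in mpn/generic/mod_1_2.c.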
	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
	sal	R8(%rcx), %r12		C b << cnt
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
	CALL(	mpn_invert_limb)
	mov	%r12, %r8
	mov	%rax, %r11
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	neg	%r8
	mov	R32(%rbp), R32(%rcx)
	mov	$1, R32(%rsi)
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rsi
	neg	R32(%rcx)
	mov	%rax, %rbp
	shr	R8(%rcx), %rax
	or	%rax, %rsi
	mov	%rbp, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
	imul	%r8, %rsi
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 16(%rbx)		C store B1modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 24(%rbx)		C store B2modb

	not	%rdx
	imul	%r12, %rdx
	add	%rdx, %r12
	cmp	%rdx, %rax
	cmovnc	%rdx, %r12

	shr	R8(%rcx), %r12
	mov	%r12, 32(%rbx)		C store B3modb

	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()
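
C	A usage sketch, as an illustration only (the authoritative internal
C	prototypes live in gmp-impl.h; cf. the dispatch in
C	mpn/generic/mod_1.c).  The divisor passed to mpn_mod_1s_2p is
C	pre-shifted by the cnt that the _cps routine stored:
C
C	  mp_limb_t cps[5], r;
C	  mpn_mod_1s_2p_cps (cps, b);	/* b must have a high zero bit */
C	  r = mpn_mod_1s_2p (ap, n, b << cps[1], cps);	/* r = {ap,n} mod b */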