dnl  AMD64 mpn_mod_1s_4p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	 3.0
C K10:		 3.0
C P4:		14.5
C P6 core2:	 5.0
C P6 corei7:	 4.3
C P6 atom:	25.0

C mpn_mod_1s_4p(mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t cps[])
C SysV AMD64:  rdi = ap, rsi = n, rdx = b (divisor), rcx = cps (precomputed
C constants written by mpn_mod_1s_4p_cps below).  Returns the remainder in rax.
C
C The main loop folds four limbs per iteration using the precomputed values
C B1..B5 mod b loaded from cps[2..6] (offsets 16..48), accumulating a two-limb
C sum in r9:r8; the entry points b0..b3 handle n mod 4 leading limbs.
C NOTE(review): the exact meaning of each cps[] slot is defined by the _cps
C routine below and by GMP's mod_1_4 reference code -- confirm against
C mpn/generic/mod_1_4.c before changing any offset.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	push	%r14
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	%rdx, -16(%rsp)		C spill b into the red zone (leaf function)
	mov	%rcx, %r14		C r14 = cps, kept across the whole routine
	mov	16(%rcx), %r11		C r11 = cps[2]
	mov	24(%rcx), %rbx		C rbx = cps[3]
	mov	32(%rcx), %rbp		C rbp = cps[4]
	mov	40(%rcx), %r13		C r13 = cps[5]
	mov	48(%rcx), %r12		C r12 = cps[6]
	xor	R32(%r8), R32(%r8)	C high accumulator limb = 0
	mov	R32(%rsi), R32(%rdx)
	and	$3, R32(%rdx)		C dispatch on n mod 4
	je	L(b0)
	cmp	$2, R32(%rdx)
	jc	L(b1)
	je	L(b2)

C n mod 4 == 3: combine the three top limbs into r9:r8 before entering the loop
L(b3):	lea	-24(%rdi,%rsi,8), %rdi	C rdi -> 24 bytes below the top limb
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	jmp	L(m0)

	ALIGN(8)
C n mod 4 == 0: fold the four top limbs
L(b0):	lea	-32(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r8
	mov	24(%rdi), %rax
	mul	%rbp
	jmp	L(m0)

	ALIGN(8)
C n mod 4 == 1: single top limb, nothing to multiply yet
L(b1):	lea	-8(%rdi,%rsi,8), %rdi
	mov	(%rdi), %r9
	jmp	L(m1)

	ALIGN(8)
C n mod 4 == 2: two top limbs
L(b2):	lea	-16(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	jmp	L(m0)

	ALIGN(16)
C Main loop: each pass consumes 4 limbs (rdi walks down by 32) and the old
C accumulator r9:r8, producing a new r9:r8.  The adds/adcs form one carry
C chain into rcx -- do not reorder.
L(top):	mov	-24(%rdi), %rax
	mov	-32(%rdi), %r10
	mul	%r11
	add	%rax, %r10
	mov	-16(%rdi), %rax
	mov	%rdx, %rcx
	adc	$0, %rcx
	mul	%rbx
	add	%rax, %r10
	mov	-8(%rdi), %rax
	adc	%rdx, %rcx
	sub	$32, %rdi		C step down 4 limbs
	mul	%rbp
	add	%rax, %r10
	mov	%r9, %rax
	adc	%rdx, %rcx
	mul	%r13			C fold old low accumulator limb
	add	%rax, %r10
	mov	%r8, %rax
	adc	%rdx, %rcx
	mul	%r12			C fold old high accumulator limb
	mov	%r10, %r9
	mov	%rcx, %r8
L(m0):	add	%rax, %r9		C merge pending product into accumulator
	adc	%rdx, %r8
L(m1):	sub	$4, %rsi
	ja	L(top)

C Final reduction of the two-limb accumulator r9:r8 modulo b.
L(end):	mov	8(%r14), R32(%rsi)	C rsi = cps[1] (shift count)
	mov	%r8, %rax
	mul	%r11
	mov	%rax, %r8
	add	%r9, %r8
	adc	$0, %rdx
	xor	R32(%rcx), R32(%rcx)
	sub	R32(%rsi), R32(%rcx)	C rcx = 64 - cnt (mod 64, used as shift)
	mov	%r8, %rdi
	shr	R8(%rcx), %rdi
	mov	R32(%rsi), R32(%rcx)
	sal	R8(%rcx), %rdx
	or	%rdx, %rdi		C rdi = normalized high part
	mov	%rdi, %rax
	mulq	(%r14)			C multiply by cps[0] (inverse of b)
	mov	-16(%rsp), %rbx		C reload b from the red zone
	mov	%rax, %r9
	sal	R8(%rcx), %r8
	inc	%rdi
	add	%r8, %r9
	adc	%rdi, %rdx		C rdx = candidate quotient
	imul	%rbx, %rdx
	sub	%rdx, %r8		C r8 = candidate remainder (shifted)
	lea	(%r8,%rbx), %rax
	cmp	%r8, %r9
	cmovb	%rax, %r8		C conditional add-back of b
	mov	%r8, %rax
	sub	%rbx, %rax
	cmovb	%r8, %rax		C final conditional subtract of b
	shr	R8(%rcx), %rax		C undo normalization shift
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	ret
EPILOGUE()

C mpn_mod_1s_4p_cps(mp_limb_t cps[], mp_limb_t b)
C SysV AMD64:  rdi = cps (output array of 7 limbs), rsi = b.
C Precomputes the constants used by mpn_mod_1s_4p:
C   cps[0] = inverse of the normalized b (via mpn_invert_limb)
C   cps[1] = cnt, the normalization shift (leading zeros of b)
C   cps[2..6] written from r10, r8, rdi, rsi, rbp after shifting right by cnt.
C Each of the four cmp/lea/cmov groups below derives the next constant from
C the previous one with a conditional add-back; the chain must stay in order.
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
	push	%r12
	bsr	%rsi, %rcx		C find highest set bit of b
	push	%rbp
	xor	$63, R32(%rcx)		C rcx = count of leading zeros
	mov	%rsi, %rbp
	mov	R32(%rcx), R32(%r12)	C save cnt in callee-saved r12 across CALL
	sal	R8(%rcx), %rbp		C rbp = normalized b (b << cnt)
	push	%rbx
	mov	%rdi, %rbx		C rbx = cps, preserved across CALL
	mov	%rbp, %rdi
	CALL(	mpn_invert_limb)	C rax = invert_limb(normalized b)
	mov	R32(%r12), R32(%rcx)
	mov	$1, R32(%r10)
	sal	R8(%rcx), %r10
	mov	$64, R32(%rcx)
	mov	%rax, %r9		C r9 = inverse
	sub	R32(%r12), R32(%rcx)	C rcx = 64 - cnt
	mov	%r9, (%rbx)		C cps[0] = inverse
	shr	R8(%rcx), %rax
	mov	R32(%r12), R32(%rcx)
	or	%rax, %r10
	mov	%rbp, %rax
	neg	%rax
	imul	%rax, %r10		C first derived constant
	mov	%r10, %rax
	mul	%r9
	lea	1(%r10,%rdx), %r8
	neg	%r8
	imul	%rbp, %r8
	cmp	%r8, %rax
	lea	(%r8,%rbp), %rdx
	cmovb	%rdx, %r8		C conditional add-back keeps r8 < normalized b
	mov	%r8, %rax
	mul	%r9
	lea	1(%r8,%rdx), %rdi
	neg	%rdi
	imul	%rbp, %rdi
	cmp	%rdi, %rax
	lea	(%rdi,%rbp), %rdx
	cmovb	%rdx, %rdi
	mov	%rdi, %rax
	mul	%r9
	lea	1(%rdi,%rdx), %rsi
	neg	%rsi
	imul	%rbp, %rsi
	cmp	%rsi, %rax
	lea	(%rsi,%rbp), %rdx
	cmovb	%rdx, %rsi
	mov	%rsi, %rax
	mul	%r9
	lea	1(%rsi,%rdx), %rdx
	neg	%rdx
	imul	%rbp, %rdx
	cmp	%rdx, %rax
	lea	(%rdx,%rbp), %rbp
	movslq	R32(%r12), %rax		C rax = cnt, sign-extended for the store
	cmovae	%rdx, %rbp		C note inverted condition: rbp gets the reduced value
	shr	R8(%rcx), %r10		C denormalize all five constants by cnt
	shr	R8(%rcx), %r8
	shr	R8(%rcx), %rbp
	shr	R8(%rcx), %rdi
	shr	R8(%rcx), %rsi
	mov	%rbp, 48(%rbx)		C cps[6]
	mov	%rax, 8(%rbx)		C cps[1] = cnt
	mov	%r10, 16(%rbx)		C cps[2]
	mov	%r8, 24(%rbx)		C cps[3]
	mov	%rdi, 32(%rbx)		C cps[4]
	mov	%rsi, 40(%rbx)		C cps[5]
	pop	%rbx
	pop	%rbp
	pop	%r12
	ret
EPILOGUE()