dnl  AMD64 mpn_mul_2 optimised for AMD Bulldozer.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 6.78
C AMD K10	 6.78
C AMD bd1	 8.39	 8.65
C AMD bd2	 8.47
C AMD bd3
C AMD bd4
C AMD zen
C AMD bt1	12.1
C AMD bt2	11.5
C Intel P4	24.0
C Intel PNR	 8.14
C Intel NHM	 7.78
C Intel SBR	 6.34
C Intel IBR	 6.15
C Intel HWL	 6.04
C Intel BWL	 4.33
C Intel SKL	 4.41
C Intel atom	39.5
C Intel SLM	27.8
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.
59 60define(`rp', `%rdi') C rcx 61define(`up', `%rsi') C rdx 62define(`n_param', `%rdx') C r8 63define(`vp', `%rcx') C r9 64 65define(`v0', `%r8') 66define(`v1', `%r9') 67define(`w0', `%rbx') 68define(`w1', `%rcx') 69define(`w2', `%rbp') 70define(`w3', `%r10') 71define(`n', `%r11') 72 73ABI_SUPPORT(DOS64) 74ABI_SUPPORT(STD64) 75 76ASM_START() 77 TEXT 78 ALIGN(32) 79PROLOGUE(mpn_mul_2) 80 FUNC_ENTRY(4) 81 push %rbx 82 push %rbp 83 84 mov (up), %rax 85 86 mov (vp), v0 87 mov 8(vp), v1 88 89 lea (up,n_param,8), up 90 lea (rp,n_param,8), rp 91 92 mov n_param, n 93 mul v0 94 neg n 95 96 test $1, R8(n) 97 jnz L(bx1) 98 99L(bx0): test $2, R8(n) 100 jnz L(b10) 101 102L(b00): mov %rax, w0 103 mov %rdx, w1 104 xor R32(w2), R32(w2) 105 mov (up,n,8), %rax 106 jmp L(lo0) 107 108L(b10): mov %rax, w2 109 mov %rdx, w3 110 mov (up,n,8), %rax 111 xor R32(w0), R32(w0) 112 mul v1 113 add $-2, n 114 jmp L(lo2) 115 116L(bx1): test $2, R8(n) 117 jz L(b11) 118 119L(b01): mov %rax, w3 120 mov %rdx, w0 121 mov (up,n,8), %rax 122 mul v1 123 xor R32(w1), R32(w1) 124 inc n 125 jmp L(lo1) 126 127L(b11): mov %rax, w1 128 mov %rdx, w2 129 mov (up,n,8), %rax 130 xor R32(w3), R32(w3) 131 dec n 132 jmp L(lo3) 133 134 ALIGN(32) 135L(top): mov -8(up,n,8), %rax 136 mul v1 137 mov w2, -16(rp,n,8) 138L(lo1): add %rax, w0 139 mov w3, -8(rp,n,8) 140 adc %rdx, w1 141 mov (up,n,8), %rax 142 mul v0 143 mov $0, R32(w2) 144 add %rax, w0 145 adc %rdx, w1 146 adc $0, R32(w2) 147 mov (up,n,8), %rax 148L(lo0): mul v1 149 add %rax, w1 150 adc %rdx, w2 151 mov 8(up,n,8), %rax 152 mul v0 153 add %rax, w1 154 mov w0, (rp,n,8) 155 mov $0, R32(w3) 156 mov 8(up,n,8), %rax 157 adc %rdx, w2 158 adc $0, R32(w3) 159L(lo3): mul v1 160 add %rax, w2 161 mov 16(up,n,8), %rax 162 adc %rdx, w3 163 mul v0 164 add %rax, w2 165 mov 16(up,n,8), %rax 166 mov $0, R32(w0) 167 adc %rdx, w3 168 adc $0, R32(w0) 169 mul v1 170 mov w1, 8(rp,n,8) 171L(lo2): add %rax, w3 172 adc %rdx, w0 173 mov 24(up,n,8), %rax 174 mul v0 175 add %rax, w3 176 
adc %rdx, w0 177 mov $0, R32(w1) 178 adc $0, R32(w1) 179 add $4, n 180 jnc L(top) 181 182L(end): mov -8(up), %rax 183 mul v1 184 mov w2, -16(rp) 185 add %rax, w0 186 mov w3, -8(rp) 187 adc %rdx, w1 188 mov w0, (rp) 189 mov w1, %rax 190 191 pop %rbp 192 pop %rbx 193 FUNC_EXIT() 194 ret 195EPILOGUE() 196