1dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer. 2 3dnl Contributed to the GNU project by Torbjörn Granlund. 4 5dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. 6 7dnl This file is part of the GNU MP Library. 8dnl 9dnl The GNU MP Library is free software; you can redistribute it and/or modify 10dnl it under the terms of either: 11dnl 12dnl * the GNU Lesser General Public License as published by the Free 13dnl Software Foundation; either version 3 of the License, or (at your 14dnl option) any later version. 15dnl 16dnl or 17dnl 18dnl * the GNU General Public License as published by the Free Software 19dnl Foundation; either version 2 of the License, or (at your option) any 20dnl later version. 21dnl 22dnl or both in parallel, as here. 23dnl 24dnl The GNU MP Library is distributed in the hope that it will be useful, but 25dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27dnl for more details. 28dnl 29dnl You should have received copies of the GNU General Public License and the 30dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31dnl see https://www.gnu.org/licenses/. 32 33include(`../config.m4') 34 35C cycles/limb 36C AMD K8,K9 37C AMD K10 38C AMD bull 4.36 average, quite fluctuating 39C AMD pile 4.38 slightly fluctuating 40C AMD steam 41C AMD bobcat 42C AMD jaguar 43C Intel P4 44C Intel core 45C Intel NHM 46C Intel SBR 47C Intel IBR 48C Intel HWL 49C Intel BWL 50C Intel atom 51C VIA nano 52 53C The loop of this code is the result of running a code generation and 54C optimisation tool suite written by David Harvey and Torbjorn Granlund. 
C Scheme: genxmul --mul

C Argument registers as used by the body.  The trailing comment on each line
C names the register that carries the same argument under the Microsoft x64
C convention (rcx, rdx, r8, r9).
define(`rp',      `%rdi')   C rcx   result pointer
define(`up',      `%rsi')   C rdx   source operand pointer
define(`n_param', `%rdx')   C r8    limb count as passed in
define(`vp',      `%rcx')   C r9    pointer to the two multiplier limbs

C Working registers.
define(`v0', `%r8')	C multiplier limb loaded from 0(vp)
define(`v1', `%r9')	C multiplier limb loaded from 8(vp)
define(`w0', `%rbx')	C w0-w3 are rotating accumulator words; rbx is callee-saved
define(`w1', `%rcx')	C shares %rcx with vp; vp is dead once v0 and v1 are loaded
define(`w2', `%rbp')	C callee-saved, pushed and popped by the function
define(`w3', `%r10')
define(`n',  `%r11')	C negated limb index, counts up towards zero

C NOTE(review): ABI_SUPPORT, FUNC_ENTRY and FUNC_EXIT are defined outside this
C file.  Presumably they declare and adapt the Windows (DOS64) and System V
C (STD64) calling conventions -- confirm in the m4 definitions.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_mul_2
C
C Multiplies the n_param-limb operand at up by the two limbs at vp.  The low
C n_param+1 limbs of the product are stored at rp and the most significant
C limb is returned in %rax.
C
C In:    rp, up, n_param, vp (registers as listed at the top of this block)
C Out:   %rax = top product limb; rp[0 .. n_param] written
C Saves: %rbx, %rbp (pushed on entry, popped before return)
C Uses:  %rax, %rdx, %rcx, %r8-%r11, flags; the rp and up registers are
C        advanced in place to point past their operands
C
C NOTE(review): the feed-in code appears to assume n_param >= 2.  The first
C limb is loaded unconditionally, and with n_param = 1 the b01 path would
C read one limb past the end of up -- confirm against the caller contract.
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx			C w0 lives in rbx
	push	%rbp			C w2 lives in rbp

	mov	(up), %rax		C rax = up[0]

	mov	(vp), v0		C fetch both multiplier limbs before
	mov	8(vp), v1		C w1 (same register as vp) is written

	lea	(up,n_param,8), up	C up = one past the last source limb
	lea	(rp,n_param,8), rp	C rp advanced by the same amount

	mov	n_param, n
	mul	v0			C rdx:rax = up[0] * v0
	neg	n			C n = -n_param, so (up,n,8) is up[0]

	test	$1, R8(n)		C dispatch on n_param mod 4 to pick the
	jnz	L(bx1)			C entry point into the 4-way unrolled loop

L(bx0):	test	$2, R8(n)
	jnz	L(b10)

C n_param = 0 mod 4: up[0]*v0 seeds w0:w1, enter at lo0 with n unchanged.
L(b00):	mov	%rax, w0
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	(up,n,8), %rax		C reload up[0] for the v1 multiply
	jmp	L(lo0)

C n_param = 2 mod 4: up[0]*v0 seeds w2:w3, up[0]*v1 is left in rdx:rax for lo2.
L(b10):	mov	%rax, w2
	mov	%rdx, w3
	mov	(up,n,8), %rax
	xor	R32(w0), R32(w0)
	mul	v1
	add	$-2, n			C bias n to match the offsets used from lo2 on
	jmp	L(lo2)

L(bx1):	test	$2, R8(n)
	jz	L(b11)

C n_param = 1 mod 4: up[0]*v0 seeds w3:w0, up[0]*v1 is left in rdx:rax for lo1.
L(b01):	mov	%rax, w3
	mov	%rdx, w0
	mov	(up,n,8), %rax
	mul	v1
	xor	R32(w1), R32(w1)
	inc	n			C bias n to match the offsets used from lo1 on
	jmp	L(lo1)

C n_param = 3 mod 4: up[0]*v0 seeds w1:w2, enter at lo3.
L(b11):	mov	%rax, w1
	mov	%rdx, w2
	mov	(up,n,8), %rax		C reload up[0] for the v1 multiply
	xor	R32(w3), R32(w3)
	dec	n			C bias n to match the offsets used from lo3 on
	jmp	L(lo3)

C Main loop, unrolled four ways.  Each up limb is multiplied by v0 and by v1
C (it is loaded twice because mul overwrites rax), and the accumulator roles
C rotate through w0..w3.  For n_param >= 2 the feed-in leaves n a negative
C multiple of 4, so the add of 4 at the bottom carries out exactly when n
C reaches zero and the jnc falls through to the wind-down.
	ALIGN(32)
L(top):	mov	-8(up,n,8), %rax
	mul	v1
	mov	w2, -16(rp,n,8)		C store a finished result limb
L(lo1):	add	%rax, w0
	mov	w3, -8(rp,n,8)		C store a finished result limb
	adc	%rdx, w1
	mov	(up,n,8), %rax
	mul	v0
	mov	$0, R32(w2)		C start a fresh accumulator word
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)		C catch the carry out of w1
	mov	(up,n,8), %rax		C same limb again, now for v1
L(lo0):	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w1
	mov	w0, (rp,n,8)		C store a finished result limb
	mov	$0, R32(w3)		C mov, not xor: keeps CF from the add above
	mov	8(up,n,8), %rax		C same limb again, now for v1
	adc	%rdx, w2
	adc	$0, R32(w3)
L(lo3):	mul	v1
	add	%rax, w2
	mov	16(up,n,8), %rax
	adc	%rdx, w3
	mul	v0
	add	%rax, w2
	mov	16(up,n,8), %rax	C same limb again, now for v1
	mov	$0, R32(w0)		C mov, not xor: keeps CF from the add above
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	w1, 8(rp,n,8)		C store a finished result limb
L(lo2):	add	%rax, w3
	adc	%rdx, w0
	mov	24(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	mov	$0, R32(w1)		C mov, not xor: keeps CF from the adc above
	adc	$0, R32(w1)
	add	$4, n			C advance four limbs; CF set once n hits zero
	jnc	L(top)

C Wind-down: add the last up limb times v1, flush the remaining result limbs
C and return the most significant one.
L(end):	mov	-8(up,n,8), %rax
	mul	v1
	mov	w2, -16(rp,n,8)
	add	%rax, w0
	mov	w3, -8(rp,n,8)
	adc	%rdx, w1
	mov	w0, (rp,n,8)
	mov	w1, %rax		C return value: top limb of the product

	pop	%rbp			C restore in reverse push order
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()