dnl  AMD64 mpn_addmul_2 optimised for Intel Atom.

dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb	best
C AMD K8,K9
C AMD K10
C AMD bd1
C AMD bd2
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel PNR
C Intel NHM
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom	    18.8	this
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`vp',      `%rcx')   C r9

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_2)
	FUNC_ENTRY(4)
	push	%rbx
	push	%rbp

	mov	(up), %rax

	mov	(vp), v0
	mov	8(vp), v1

	mov	n_param, n
	mul	v0

	test	$1, R8(n)
	jnz	L(bx1)

L(bx0):	test	$2, R8(n)
	jnz	L(b10)

L(b00):	mov	%rax, w0
	mov	(up), %rax
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	lea	-8(rp), rp
	jmp	L(lo0)

L(b10):	mov	%rax, w2
	mov	(up), %rax
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	lea	-16(up), up
	lea	-24(rp), rp
	jmp	L(lo2)

L(bx1):	test	$2, R8(n)
	jnz	L(b11)

L(b01):	mov	%rax, w3
	mov	%rdx, w0
	mov	(up), %rax
	xor	R32(w1), R32(w1)
	lea	8(up), up
	dec	n
	jmp	L(lo1)

L(b11):	mov	%rax, w1
	mov	(up), %rax
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	lea	-8(up), up
	lea	-16(rp), rp
	jmp	L(lo3)

	ALIGN(16)
L(top):
L(lo1):	mul	v1
	add	w3, (rp)
	mov	$0, R32(w2)
	adc	%rax, w0
	mov	(up), %rax
	adc	%rdx, w1
	mul	v0
	add	%rax, w0
	mov	(up), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
L(lo0):	mul	v1
	add	w0, 8(rp)
	adc	%rax, w1
	mov	8(up), %rax
	mov	$0, R32(w3)
	adc	%rdx, w2
	mul	v0
	add	%rax, w1
	mov	8(up), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(lo3):	mul	v1
	add	w1, 16(rp)
	adc	%rax, w2
	mov	16(up), %rax
	mov	$0, R32(w0)
	adc	%rdx, w3
	mul	v0
	add	%rax, w2
	mov	16(up), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
L(lo2):	mul	v1
	add	w2, 24(rp)
	adc	%rax, w3
	mov	24(up), %rax
	adc	%rdx, w0
	mov	$0, R32(w1)
	lea	32(rp), rp
	mul	v0
	lea	32(up), up
	add	%rax, w3
	adc	%rdx, w0
	mov	-8(up), %rax
	adc	$0, R32(w1)
	sub	$4, n
	ja	L(top)

L(end):	mul	v1
	add	w3, (rp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)
	mov	w1, %rax
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
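
C For reference: mpn_addmul_2(rp, up, n, vp) adds {up,n} times the 2-limb
C number {vp,2} to {rp,n}, stores the next limb of the sum at rp[n], and
C returns the most significant limb (cf. the final `mov w0, 8(rp)` and the
C return value in w1 above).  A minimal C sketch of those semantics,
C composed from the public mpn_addmul_1; the function name is illustrative
C and this model is not the code GMP uses:
C
C	#include <gmp.h>
C
C	mp_limb_t
C	addmul_2_ref (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C	{
C	  /* rp[0..n] = {rp,n} + {up,n}*vp[0]; carry limb stored at rp[n] */
C	  rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);
C	  /* add {up,n}*vp[1] shifted up one limb; return the top limb */
C	  return mpn_addmul_1 (rp + 1, up, n, vp[1]);
C	}
C
C The loop above fuses both mpn_addmul_1 passes, accumulating the two
C partial products in w0-w3 so each rp limb is read and written once.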