dnl  AMD64 mpn_mul_2 optimised for Intel Atom.

dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb	best
C AMD K8,K9	 5.78
C AMD K10	 5.78
C AMD bull	 9.10
C AMD pile	 9.17
C AMD steam
C AMD excavator
C AMD bobcat	11.3
C AMD jaguar	10.9
C Intel P4	24.6
C Intel core2	 8.06
C Intel NHM	 7.65
C Intel SBR	 6.28
C Intel IBR	 6.10
C Intel HWL	 6.09
C Intel BWL	 4.73
C Intel SKL	 4.77
C Intel atom	35.3
C Intel SLM	25.6
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.
C mpn_mul_2 (rp, up, n, vp)
C
C Multiplies the n-limb operand at up by the two limbs at vp.  Tracing the
C code for n = 2..5 shows that the low n+1 limbs of the product are stored at
C rp and that the most significant limb is returned in %rax.
C
C NOTE(review): for n = 1 the b01 path enters the loop and loads limbs beyond
C up[0]; presumably callers guarantee n >= 2 -- confirm against the callers.
C
C Incoming arguments.  The trailing register names look like the DOS64
C argument registers; FUNC_ENTRY(4) is defined outside this file and
C presumably moves them into the registers named here -- TODO confirm.

define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`vp',      `%rcx')   C r9

C Working registers.
C   v0, v1    the two limbs loaded from vp
C   w0..w3    rotating accumulator limbs of the running product
C   n         limb counter, copied out of n_param since mul clobbers %rdx
C Note that w1 shares %rcx with vp; vp is not read again once v0 and v1 have
C been loaded.  w0 and w2 live in %rbx and %rbp, which are saved on the stack
C around the function body.

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx			C w0, restored before return
	push	%rbp			C w2, restored before return

	mov	(up), %rax		C up[0]

	mov	(vp), v0
	mov	8(vp), v1		C last use of vp; %rcx is w1 from here on

	mov	n_param, n		C free %rdx before the first mul
	mul	v0			C %rdx:%rax = up[0] * v0

C Dispatch on the two low bits of n.  Each bXY stub (X = bit 1, Y = bit 0)
C places the first product in the accumulator pair expected by its loop entry
C point, reloads up[0] into %rax for the pending multiply by v1, clears the
C next accumulator, and biases up/rp so that the fixed displacements used at
C that entry point address up[1] and rp[0].

	test	$1, R8(n)
	jnz	L(bx1)

L(bx0):	test	$2, R8(n)
	jnz	L(b10)

L(b00):	mov	%rax, w0		C n = 0 mod 4, enter at lo0
	mov	(up), %rax
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	lea	-8(rp), rp		C lo0 stores its finished limb at 8(rp)
	jmp	L(lo0)

L(b10):	mov	%rax, w2		C n = 2 mod 4, enter at lo2
	mov	(up), %rax
	mov	%rdx, w3
	xor	R32(w0), R32(w0)	C also clears CF for the adc $0 at lo2
	lea	-16(up), up		C lea leaves the flags untouched
	lea	-24(rp), rp		C lo2 stores its finished limb at 24(rp)
	jmp	L(lo2)

L(bx1):	test	$2, R8(n)
	jnz	L(b11)

L(b01):	mov	%rax, w3		C n = 1 mod 4, enter at the loop top
	mov	%rdx, w0
	mov	(up), %rax
	xor	R32(w1), R32(w1)
	lea	8(up), up		C lo1 loads the next limb from (up)
	dec	n			C a full pass follows; make n a multiple
	jmp	L(lo1)			C of 4 for the sub/ja test at the bottom

L(b11):	mov	%rax, w1		C n = 3 mod 4, enter at lo3
	mov	(up), %rax
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	lea	-8(up), up
	lea	-16(rp), rp		C lo3 stores its finished limb at 16(rp)
	jmp	L(lo3)

C Main loop, four limbs of up per pass.
C
C On arrival at lo1, lo0 and lo3, %rax holds an up limb whose product with v0
C has already been accumulated.  Each section multiplies that limb by v1,
C stores the accumulator limb that is now complete, then loads the following
C up limb, multiplies it by v0, and reloads the same limb into %rax (mul
C overwrote it) for the next section.  lo2 is entered just after the reload,
C so its store and carry collection come before its multiply by v1.
C
C The accumulators are cleared with mov $0 rather than xor inside the loop
C because the clearing sits between an add and the adc that consumes its
C carry; mov does not modify the flags, xor would reset CF.

	ALIGN(16)
L(top):
L(lo1):	mul	v1
	add	%rax, w0
	mov	(up), %rax
	mov	$0, R32(w2)		C flags preserved, see above
	mov	w3, (rp)		C w3 is complete
	adc	%rdx, w1
	mul	v0
	add	%rax, w0
	mov	(up), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
L(lo0):	mul	v1
	add	%rax, w1
	mov	8(up), %rax
	mov	w0, 8(rp)		C w0 is complete
	adc	%rdx, w2
	mul	v0
	add	%rax, w1
	mov	8(up), %rax
	adc	%rdx, w2
	mov	$0, R32(w3)
	adc	$0, R32(w3)
L(lo3):	mul	v1
	add	%rax, w2
	mov	16(up), %rax
	mov	w1, 16(rp)		C w1 is complete
	mov	$0, R32(w0)
	adc	%rdx, w3
	mul	v0
	add	%rax, w2
	mov	16(up), %rax
	adc	%rdx, w3
L(lo2):	mov	$0, R32(w1)
	mov	w2, 24(rp)		C w2 is complete
	adc	$0, R32(w0)		C carry from the adc into w3 just above
	mul	v1
	add	%rax, w3
	mov	24(up), %rax
	lea	32(up), up		C advance pointers; lea keeps CF intact
	adc	%rdx, w0
	mul	v0
	lea	32(rp), rp
	add	%rax, w3
	adc	%rdx, w0
	mov	-8(up), %rax		C same limb as 24(up) before the advance
	adc	$0, R32(w1)
	sub	$4, n
	ja	L(top)			C unsigned: loop while more than 4 remained

C Wind down: %rax holds the last up limb, still to be multiplied by v1.

L(end):	mul	v1
	mov	w3, (rp)
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)
	mov	w1, %rax		C most significant limb is the return value
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()