dnl  AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  add the result to a third limb vector.

dnl  Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                 cycles/limb    cycles/limb cfg    cycles/limb am1+am1
C AMD K8,K9        2.375
C AMD K10          2.375
C AMD bull         5.2            <-                 4.6-4.75      bad
C AMD pile         4.96           <-                 4.6-4.75      bad
C AMD steam        ?
C AMD excavator    ?
C AMD bobcat       5.75                              5.0           bad
C AMD jaguar       5.9                               5.2-5.4       bad
C Intel P4        15-16
C Intel core2      4.5                               4.25-4.5      bad
C Intel NHM        4.33                              4.55          bad
C Intel SBR        3.4            2.93               3.24          bad
C Intel IBR        3.35           2.6                2.95          bad
C Intel HWL        3.3            2.15               2.3           bad
C Intel BWL        2.33           2.33               1.65          bad
C Intel SKL        2.37           2.21               1.64          bad
C Intel atom      20             18.7
C Intel SLM        8              8.5
C VIA nano         4.4

C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.
C TODO
C  * Tune feed-in and wind-down code.

C mpn_addmul_2 (rp, up, n, vp)
C
C Multiplies the n-limb vector at up by the two limbs vp[0] and vp[1] and adds
C the product into the limbs at rp.  As coded below, rp[0] ... rp[n-1] are
C updated in place, rp[n] is overwritten with the next limb of the sum, and
C the most significant limb is returned in %rax.
C
C Register roles:
C   v0, v1      the two multiplier limbs
C   w0 ... w3   accumulator limbs, used in rotation by the unrolled loop
C   n           negative limb index that counts up towards zero
C   %rax, %rdx  operand and result registers of mul
C
C up and rp are advanced to their last limb during setup, so that indexing
C with the negative n walks the vectors from the least significant end.
C
C NOTE(review): for n = 1 the b1 feed-in still runs a complete pass of the
C 4-limb loop body, which would touch limbs beyond the operands, so n >= 2
C looks like a precondition -- confirm against the callers.

C INPUT PARAMETERS
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`n_param',`%rdx')
define(`vp',      `%rcx')

C w1 lives in the same register as vp, so both multiplier limbs are fetched
C into v0 and v1 before w1 is written for the first time.
define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_2)
	FUNC_ENTRY(4)
	mov	n_param, n
	push	%rbx			C backs w0, restored before return
	push	%rbp			C backs w2, restored before return

	mov	0(vp), v0
	mov	8(vp), v1

	mov	R32(n_param), R32(%rbx)
	mov	(up), %rax		C up[0]
	lea	-8(up,n_param,8), up	C up now points at up[n-1]
	lea	-8(rp,n_param,8), rp	C rp now points at rp[n-1]
	mul	v0			C %rdx:%rax = up[0] * v0
	neg	n			C n = -n_param
	and	$3, R32(%rbx)		C n mod 4 selects the loop entry point
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jc	L(b1)			C n mod 4 = 1
	jz	L(b2)			C n mod 4 = 2

C Feed-in blocks.  Each one moves the two halves of up[0] * v0 into the
C accumulator pair its entry label expects, clears the accumulator(s) that
C the entry label accumulates into next, reloads up[0] for the multiply by
C v1, and adjusts n so that the displacement used at the entry label
C addresses rp[0].

L(b3):	mov	%rax, w1		C n mod 4 = 3
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	mov	8(up,n,8), %rax		C up[0] again
	dec	n
	jmp	L(lo3)

L(b2):	mov	%rax, w2
	mov	8(up,n,8), %rax		C up[0] again
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	add	$-2, n
	jmp	L(lo2)

L(b1):	mov	%rax, w3
	mov	8(up,n,8), %rax		C up[0] again
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	inc	n
	jmp	L(lo1)

L(b0):	mov	$0, R32(w3)
	mov	%rax, w0
	mov	8(up,n,8), %rax		C up[0] again
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	jmp	L(lo0)

C Main loop, four limbs of up per pass.  Every limb is multiplied by v0 and
C then, after being reloaded into %rax, by v1.  The finished low accumulator
C is added into rp at each lo label and the accumulators rotate
C w3 -> w0 -> w1 -> w2.  Accumulators that have to be cleared between an
C add/adc and a following adc are cleared with mov $0, which leaves the
C carry flag untouched.

	ALIGN(32)
L(top):	mov	$0, R32(w1)
	mul	v0
	add	%rax, w3
	mov	(up,n,8), %rax		C same limb again, for v1
	adc	%rdx, w0
	adc	$0, R32(w1)
L(lo1):	mul	v1
	add	w3, (rp,n,8)
	mov	$0, R32(w3)		C carry from the add is still live
	adc	%rax, w0
	mov	$0, R32(w2)		C carry from the adc is still live
	mov	8(up,n,8), %rax		C next limb
	adc	%rdx, w1
	mul	v0
	add	%rax, w0
	mov	8(up,n,8), %rax		C same limb again, for v1
	adc	%rdx, w1
	adc	$0, R32(w2)
L(lo0):	mul	v1
	add	w0, 8(rp,n,8)
	adc	%rax, w1
	adc	%rdx, w2
	mov	16(up,n,8), %rax	C next limb
	mul	v0
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	16(up,n,8), %rax	C same limb again, for v1
L(lo3):	mul	v1
	add	w1, 16(rp,n,8)
	adc	%rax, w2
	adc	%rdx, w3
	xor	R32(w0), R32(w0)
	mov	24(up,n,8), %rax	C next limb
	mul	v0
	add	%rax, w2
	mov	24(up,n,8), %rax	C same limb again, for v1
	adc	%rdx, w3
	adc	$0, R32(w0)
L(lo2):	mul	v1
	add	w2, 24(rp,n,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	32(up,n,8), %rax	C limb for the next pass or for the wind-down
	add	$4, n
	js	L(top)			C loop while n is still negative

C Wind-down.  %rax holds up[n-1]; it is multiplied by v0 and then by v1.
C rp[n-1] receives its final addend, rp[n] is stored, and the top limb is
C returned.

L(end):	xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w3
	mov	(up), %rax		C up[n-1] again, for v1
	adc	%rdx, w0
	adc	R32(w1), R32(w1)	C w1 was zero, so w1 = carry
	mul	v1
	add	w3, (rp)		C rp[n-1]
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)		C rp[n]
	mov	w1, %rax		C return value

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()