dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  store the result in a third limb vector.

dnl  Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4.53
C AMD K10	 4.53
C AMD bull	 9.76	10.37
C AMD pile	 9.22
C AMD steam
C AMD excavator
C AMD bobcat	11.3
C AMD jaguar	11.9
C Intel P4	25.0
C Intel core2	 8.05
C Intel NHM	 7.72
C Intel SBR	 6.33
C Intel IBR	 6.15
C Intel HWL	 6.00
C Intel BWL	 4.44
C Intel SKL	 4.54
C Intel atom	39.0
C Intel SLM	24.0
C VIA nano

C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Work on feed-in and wind-down code.
C  * Convert "mov $0" to "xor".
C  * Adjust initial lea to save some bytes.
C  * Perhaps adjust n from n_param&3 value?
C  * Replace with 2.25 c/l sequence.

C OVERVIEW
C  Reads the two limbs v0 = vp[0], v1 = vp[1] and the n limbs up[0..n-1],
C  stores n+1 limbs at rp[0..n] and returns one further (most significant)
C  limb in rax.  The main loop is unrolled four ways; the four feed-in blocks
C  m2p0..m2p3 handle n mod 4 and jump into the loop at the matching point.
C
C  NOTE(review): for n = 1 the m2p1 path enters the loop with n = 0 and runs
C  one full four-limb iteration, so this code appears to rely on callers
C  passing n >= 2 -- confirm against the callers and the reference tests.

C INPUT PARAMETERS
define(`rp',	 `%rdi')
define(`up',	 `%rsi')
define(`n_param',`%rdx')
define(`vp',	 `%rcx')

C WORKING REGISTERS
C  v0, v1   the two multiplier limbs
C  w0..w3   rotating accumulators for the partial product sums
C  n        negative limb index, counts up towards zero
C  w1 shares rcx with vp; vp is fully consumed (both limbs loaded) before
C  the first write to w1.  w0 (rbx) and w2 (rbp) are saved and restored
C  with push/pop below.
define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)			C macro defined outside this file; presumably adapts the 4 args for DOS64
	push	%rbx			C w0
	push	%rbp			C w2

	mov	(vp), v0
	mov	8(vp), v1

	mov	(up), %rax		C rax = up[0], multiplied by v0 in every feed-in block

C Bias both pointers to their element n-1 and negate the count, so that
C k*8(up,n,8) steps forward through the operand as n rises towards zero and
C the plain (up) / (rp) references in the wind-down hit element n-1.
	mov	n_param, n
	neg	n
	lea	-8(up,n_param,8), up
	lea	-8(rp,n_param,8), rp

C Dispatch on n mod 4.  n_param (rdx) is not needed afterwards; mul
C overwrites rdx anyway.
	and	$3, R32(n_param)
	jz	L(m2p0)			C n mod 4 = 0
	cmp	$2, R32(n_param)
	jc	L(m2p1)			C n mod 4 = 1
	jz	L(m2p2)			C n mod 4 = 2
					C fall through: n mod 4 = 3
L(m2p3):
	mul	v0			C up[0] * v0
	xor	R32(w3), R32(w3)
	mov	%rax, w1
	mov	%rdx, w2
	mov	8(up,n,8), %rax		C up[0] again (n not yet adjusted)
	add	$-1, n
	mul	v1			C up[0] * v1
	add	%rax, w2		C carry out is consumed by the adc at m23
	jmp	L(m23)
L(m2p0):
	mul	v0			C up[0] * v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(m20)
L(m2p1):
	mul	v0			C up[0] * v0, accumulated at the loop top
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	add	$1, n
	jmp	L(m2top)
L(m2p2):
	mul	v0			C up[0] * v0
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(up,n,8), %rax		C up[0] again (n not yet adjusted)
	add	$-2, n
	jmp	L(m22)


C Main loop, four limbs of up per pass.  Below, u[k] denotes the limb at
C 8*k(up,n,8) and r[k] the limb at 8*k(rp,n,8) for the value of n on entry to
C the pass.  On arrival at m2top, rdx:rax already holds u[0] * v0.
C
C The accumulators are cleared with "mov $0" and operands are reloaded with
C mov between an add/adc and the following adc: mov leaves the carry flag
C untouched, which these carry chains depend on.  The instruction order here
C must therefore not be changed casually.
	ALIGN(32)
L(m2top):
	add	%rax, w3
	adc	%rdx, w0
	mov	0(up,n,8), %rax
	adc	$0, R32(w1)
	mov	$0, R32(w2)
	mul	v1			C u[0] * v1
	add	%rax, w0
	mov	w3, 0(rp,n,8)		C r[0] done
	adc	%rdx, w1
	mov	8(up,n,8), %rax
	mul	v0			C u[1] * v0
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
L(m20):	mov	8(up,n,8), %rax
	mul	v1			C u[1] * v1
	add	%rax, w1
	adc	%rdx, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w3)
	mul	v0			C u[2] * v0
	add	%rax, w1
	mov	16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1			C u[2] * v1
	add	%rax, w2
	mov	w0, 8(rp,n,8)		C r[1] done
L(m23):	adc	%rdx, w3
	mov	24(up,n,8), %rax
	mul	v0			C u[3] * v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	w1, 16(rp,n,8)		C r[2] done
	mov	24(up,n,8), %rax
	mov	$0, R32(w1)
	adc	$0, R32(w0)
L(m22):	mul	v1			C u[3] * v1
	add	%rax, w3
	mov	w2, 24(rp,n,8)		C r[3] done
	adc	%rdx, w0
	mov	32(up,n,8), %rax
	mul	v0			C u[4] * v0, consumed at m2top or in the wind-down
	add	$4, n
	js	L(m2top)		C loop while n is still negative


C Wind-down.  rdx:rax holds the last limb of up times v0; (up) and (rp) now
C address the last limb of up and the matching limb of rp.
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	mov	(up), %rax
	mul	v1			C last limb of up * v1
	mov	w3, (rp)
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)		C one limb past the n-th: rp[n]
	mov	w1, %rax		C return the most significant limb

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()