dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  store the result in a third limb vector.

dnl  Copyright 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Measured throughput of the main loop, in cycles per source limb.
C	     cycles/limb
C AMD K8,K9	 2.275
C AMD K10	 2.275
C Intel P4	13.5
C Intel core2	 4.0
C Intel corei	 3.8
C Intel atom	 ?
C VIA nano	 ?

C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Work on feed-in and wind-down code.
C  * Convert "mov $0" to "xor".
C  * Adjust initial lea to save some bytes.
C  * Perhaps adjust n from n_param&3 value?
C  * Replace with 2.25 c/l sequence.
C mpn_mul_2(rp, up, n, vp):
C   Multiply the n-limb vector {up,n} by the 2-limb multiplier {vp,2}.
C   The low n+1 limbs of the product are stored at rp (final stores below
C   write (rp) and 8(rp)); the most significant limb is returned in %rax.
C
C Scheme: negative-index addressing -- n is negated and counts upward
C toward zero while up/rp are biased to point near their high ends -- and
C a 4-way unrolled, software-pipelined main loop.  The feed-in code
C dispatches on (size mod 4) to enter the loop pipeline at the right stage.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n_param',`%rdx')
define(`vp', `%rcx')

C Multiplier limbs and the rotating window of product accumulators.
C NOTE: w1 aliases vp (%rcx); that is safe because vp is dead once v0 and
C v1 have been fetched.  w0 (%rbx) and w2 (%rbp) are callee-saved in the
C SysV ABI, hence the push/pop pair around the function body.
define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n', `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx			C save callee-saved reg used as w0
	push	%rbp			C save callee-saved reg used as w2

	mov	(vp), v0		C v0 = low multiplier limb
	mov	8(vp), v1		C v1 = high limb; %rcx now free for w1

	mov	(up), %rax		C first source limb feeds the first mul

	mov	n_param, n
	neg	n			C n = -size; incremented toward zero
	lea	-8(up,n_param,8), up	C up = &up[size-1]; X(up,n,8) indexes from low end
	lea	-8(rp,n_param,8), rp	C rp = &rp[size-1], same addressing trick

	and	$3, R32(n_param)	C feed-in dispatch on size mod 4
	jz	L(m2p0)
	cmp	$2, R32(n_param)
	jc	L(m2p1)
	jz	L(m2p2)
L(m2p3):				C size == 3 (mod 4): enter loop at L(m23)
	mul	v0
	xor	R32(w3), R32(w3)
	mov	%rax, w1
	mov	%rdx, w2
	mov	8(up,n,8), %rax
	add	$-1, n
	mul	v1
	add	%rax, w2
	jmp	L(m23)
L(m2p0):				C size == 0 (mod 4): enter loop at L(m20)
	mul	v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(m20)
L(m2p1):				C size == 1 (mod 4): enter loop at its top
	mul	v0
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	add	$1, n
	jmp	L(m2top)
L(m2p2):				C size == 2 (mod 4): enter loop at L(m22)
	mul	v0
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(up,n,8), %rax
	add	$-2, n
	jmp	L(m22)


	ALIGN(32)
C Main loop: each pass consumes 4 source limbs, computing up[i]*v0 and
C up[i]*v1 for each and folding them into the rotating accumulator window
C w3,w0,w1,w2 with adc carry chains.  Source limbs are reloaded after each
C mul because mul clobbers %rax (and %rdx).  The interleaving of stores,
C loads and carry propagation is deliberate scheduling -- do not reorder.
L(m2top):
	add	%rax, w3
	adc	%rdx, w0
	mov	0(up,n,8), %rax
	adc	$0, R32(w1)
	mov	$0, R32(w2)		C mov, not xor: must preserve carry flag
	mul	v1
	add	%rax, w0
	mov	w3, 0(rp,n,8)		C emit completed limb i
	adc	%rdx, w1
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
L(m20):	mov	8(up,n,8), %rax		C reload: previous mul clobbered %rax
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w3)		C mov keeps flags intact for the adc chain
	mul	v0
	add	%rax, w1
	mov	16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1
	add	%rax, w2
	mov	w0, 8(rp,n,8)		C emit completed limb i+1
L(m23):	adc	%rdx, w3
	mov	24(up,n,8), %rax
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	w1, 16(rp,n,8)		C emit completed limb i+2
	mov	24(up,n,8), %rax
	mov	$0, R32(w1)
	adc	$0, R32(w0)
L(m22):	mul	v1
	add	%rax, w3
	mov	w2, 24(rp,n,8)		C emit completed limb i+3
	adc	%rdx, w0
	mov	32(up,n,8), %rax
	mul	v0
	add	$4, n			C advance 4 limbs; n < 0 => more to do
	js	L(m2top)


C Wind-down: the last up[size-1]*v0 product is already in %rdx:%rax from
C the loop tail.  Fold it in, add the final up[size-1]*v1 product, store
C the two remaining limbs, and return the most significant limb.
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	mov	(up), %rax		C n == 0 here, so (up) is the last source limb
	mul	v1
	mov	w3, (rp)		C rp[size-1]
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)		C rp[size]
	mov	w1, %rax		C return value: top product limb

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()