dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  store the result in a third limb vector.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C K8,K9:	2.275
C K10:		2.275
C P4:		?
C P6 core2:	4.0
C P6 corei7:	3.8

C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Work on feed-in and wind-down code.
C  * Convert "mov $0" to "xor".
C  * Adjust initial lea to save some bytes.
C  * Perhaps adjust n from n_param&3 value?
C  * Replace with 2.25 c/l sequence.

C INPUT PARAMETERS
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`n_param', `%rdx')
define(`vp',      `%rcx')

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')
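
C Interface summary (inferred from the code below; it matches the mpn_mul_2
C convention used elsewhere in GMP):
C
C   mp_limb_t mpn_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C
C Multiply {up,n} by the 2-limb number {vp,2}.  The least significant n+1
C limbs of the (n+2)-limb product are stored at {rp,n+1}; the most
C significant limb is the return value (left in %rax by the wind-down code).
C n must be at least 1, since the feed-in code loads up[0] unconditionally.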

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_2)
	push	%rbx
	push	%rbp

	mov	(vp), v0
	mov	8(vp), v1

	mov	(up), %rax

	mov	n_param, n
	neg	n
	lea	-8(up,n_param,8), up
	lea	-8(rp,n_param,8), rp

	and	$3, R32(n_param)
	jz	L(m2p0)
	cmp	$2, R32(n_param)
	jc	L(m2p1)
	jz	L(m2p2)
L(m2p3):
	mul	v0
	xor	R32(w3), R32(w3)
	mov	%rax, w1
	mov	%rdx, w2
	mov	8(up,n,8), %rax
	add	$-1, n
	mul	v1
	add	%rax, w2
	jmp	L(m23)
L(m2p0):
	mul	v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(m20)
L(m2p1):
	mul	v0
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	add	$1, n
	jmp	L(m2top)
L(m2p2):
	mul	v0
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(up,n,8), %rax
	add	$-2, n
	jmp	L(m22)


	ALIGN(32)
L(m2top):
	add	%rax, w3
	adc	%rdx, w0
	mov	0(up,n,8), %rax
	adc	$0, R32(w1)
	mov	$0, R32(w2)
	mul	v1
	add	%rax, w0
	mov	w3, 0(rp,n,8)
	adc	%rdx, w1
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
L(m20):	mov	8(up,n,8), %rax
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w3)
	mul	v0
	add	%rax, w1
	mov	16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1
	add	%rax, w2
	mov	w0, 8(rp,n,8)
L(m23):	adc	%rdx, w3
	mov	24(up,n,8), %rax
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	w1, 16(rp,n,8)
	mov	24(up,n,8), %rax
	mov	$0, R32(w1)
	adc	$0, R32(w0)
L(m22):	mul	v1
	add	%rax, w3
	mov	w2, 24(rp,n,8)
	adc	%rdx, w0
	mov	32(up,n,8), %rax
	mul	v0
	add	$4, n
	js	L(m2top)


	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	mov	(up), %rax
	mul	v1
	mov	w3, (rp)
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)
	mov	w1, %rax

	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()
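
C For illustration only, and not part of the build: a portable C sketch of
C the operation performed above.  It assumes GMP's umul_ppmm macro from
C longlong.h; the names v0, v1 and w0..w2 echo the register naming above,
C though the C keeps a simple 3-limb window instead of the cyclic w0..w3
C renaming that the unrolled loop uses.
C
C   mp_limb_t
C   mpn_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C   {
C     mp_limb_t v0 = vp[0], v1 = vp[1];
C     mp_limb_t w0 = 0, w1 = 0, w2 = 0;  /* 3-limb window of the product */
C     mp_limb_t hi, lo;
C     mp_size_t i;
C
C     for (i = 0; i < n; i++)
C       {
C         umul_ppmm (hi, lo, up[i], v0);  /* up[i] * v0 */
C         w0 += lo;
C         hi += (w0 < lo);                /* no overflow: hi <= B-2 here */
C         w1 += hi;
C         w2 += (w1 < hi);
C         umul_ppmm (hi, lo, up[i], v1);  /* up[i] * v1, one limb higher */
C         w1 += lo;
C         hi += (w1 < lo);
C         w2 += hi;                       /* window value fits in 3 limbs */
C         rp[i] = w0;                     /* retire the lowest limb */
C         w0 = w1;  w1 = w2;  w2 = 0;     /* slide the window up a limb */
C       }
C     rp[n] = w0;                         /* n+1 limbs stored at rp */
C     return w1;                          /* most significant limb */
C   }
C
C The assembly gets the same effect without the explicit window rotation by
C unrolling four limbs per iteration and renaming w0..w3 cyclically; the
C L(m20)/L(m22)/L(m23) entry points let the feed-in code join that cycle at
C the position selected by n mod 4.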