dnl  AMD64 mpn_lshift -- mpn left shift.

dnl  Copyright 2003, 2005, 2007, 2009, 2011, 2012 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C		cycles/limb	cycles/limb cnt=1
C AMD K8,K9	 2.375		 1.375
C AMD K10	 2.375		 1.375
C Intel P4	 8		10.5
C Intel core2	 2.11		 4.28
C Intel corei	 ?		 ?
C Intel atom	 5.75		 3.5
C VIA nano	 3.5		 2.25


C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`n',	`%rdx')
define(`cnt',	`%rcx')

C OVERVIEW
C Shifts the n-limb operand at up left by cnt bits and stores the n-limb
C result at rp.  The bits shifted out of the most significant limb are
C returned in %rax.
C
C Assumptions made by the code below:
C   * n >= 1.  Both paths load a source limb before testing for an empty
C     operand.
C   * cnt is in 1..63.  The general path shifts right by -cnt taken mod 64,
C     which only equals 64-cnt for a nonzero count.
C
C Two code paths:
C   * cnt = 1 and no harmful overlap: an adc chain doubles each limb,
C     walking from the lowest limb upwards with the carry flag holding the
C     bit that moves between limbs.
C   * everything else, at L(gen): shl/shr/or combination walking from the
C     highest limb downwards, four limbs per loop iteration.  %cl is
C     flipped between the left count and the right count with neg.
C
C Scratch registers: %rax, %r8, %r9, %r10, %r11.  rp, up, n and %rcx are
C modified as well.
C NOTE(review): FUNC_ENTRY and FUNC_EXIT come from config.m4; presumably
C they remap and restore the argument registers for the DOS64 ABI.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	cmp	$1, R8(%rcx)
	jne	L(gen)

C For cnt=1 we want to work from lowest limb towards higher limbs.
C Check for bad overlap (up=rp is OK!)  rp=up+1..up+n-1 is bad, since a
C low-to-high pass would overwrite source limbs before they are read.
C FIXME: this could surely be done more cleverly.

	mov	rp, %rax
	sub	up, %rax		C byte distance rp - up
	je	L(fwd)			C rp = up
	shr	$3, %rax		C distance in limbs, as unsigned
	cmp	n, %rax
	jb	L(gen)			C 0 < rp - up < n, must go high-to-low

L(fwd):	mov	R32(n), R32(%rax)
	shr	$2, n			C n = number of 4-limb groups
	je	L(e1)			C fewer than 4 limbs, eax = n = 1..3
	and	$3, R32(%rax)		C eax = leftover limbs; clears cy

C NOTE(review): the two nops after ALIGN(8) presumably place L(t1) at a
C tuned offset; confirm before changing.
	ALIGN(8)
	nop
	nop
C Main cnt=1 loop, four limbs per pass.  Only adc writes the carry flag
C here; mov and lea leave flags alone and dec leaves cy alone, so the
C carry chain survives from one pass to the next.
L(t1):	mov	(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	adc	%r8, %r8
	mov	%r8, (rp)
	adc	%r9, %r9
	mov	%r9, 8(rp)
	adc	%r10, %r10
	mov	%r10, 16(rp)
	adc	%r11, %r11
	mov	%r11, 24(rp)
	lea	32(rp), rp
	dec	n
	jne	L(t1)

	inc	R32(%rax)		C inc/dec pair tests eax for zero
	dec	R32(%rax)		C without disturbing cy
	jne	L(n00)			C 1..3 leftover limbs
	adc	R32(%rax), R32(%rax)	C eax is 0, so eax = cy = return value
	FUNC_EXIT()
	ret
L(e1):	test	R32(%rax), R32(%rax)	C clear cy
C Tail of the cnt=1 path: eax = number of limbs left, 1..3.
L(n00):	mov	(up), %r8
	dec	R32(%rax)
	jne	L(n01)
C One limb left.
	adc	%r8, %r8
	mov	%r8, (rp)
L(ret):	adc	R32(%rax), R32(%rax)	C eax is 0, so eax = cy = return value
	FUNC_EXIT()
	ret
L(n01):	dec	R32(%rax)
	mov	8(up), %r9		C mov keeps the flags from dec
	jne	L(n10)
C Two limbs left.
	adc	%r8, %r8
	adc	%r9, %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	adc	R32(%rax), R32(%rax)	C eax is 0, so eax = cy = return value
	FUNC_EXIT()
	ret
C Three limbs left.
L(n10):	mov	16(up), %r10
	adc	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	adc	$-1, R32(%rax)		C eax is 1 here: 1 + -1 + cy = cy
	FUNC_EXIT()
	ret

C General path.  Works from the most significant limb downwards.  First
C peel off 0, 1, 2 or 3 limbs so that the limb count left for L(rlx) is
C congruent to 3 mod 4.
L(gen):	neg	R32(%rcx)		C put rsh count in cl
	mov	-8(up,n,8), %rax
	shr	R8(%rcx), %rax		C function return value

	neg	R32(%rcx)		C put lsh count in cl
	lea	1(n), R32(%r8)
	and	$3, R32(%r8)		C r8 = (n + 1) mod 4
	je	L(rlx)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...
C	Produce the top result limb, leaving a count congruent to 3 mod 4.
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	%r10, -8(rp,n,8)
	dec	n
	jmp	L(rll)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
C	Produce the top result limb, then continue as for n = 1, 5, 9, ...
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	%r10, -8(rp,n,8)
	dec	n
	neg	R32(%rcx)		C put lsh count in cl
L(1x):
	cmp	$1, n
	je	L(ast)			C only the lowest limb remains
C	Produce two result limbs, leaving a count congruent to 3 mod 4.
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r11
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	mov	-24(up,n,8), %r9
	shr	R8(%rcx), %r8
	or	%r8, %r10
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, -8(rp,n,8)
	mov	%r11, -16(rp,n,8)
	sub	$2, n

L(rll):	neg	R32(%rcx)		C put lsh count in cl
C Here n is congruent to 3 mod 4 and cl holds the lsh count.  Start the
C two highest remaining limbs: r10 and r11 hold their left-shifted parts.
L(rlx):	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r11

	sub	$4, n			C				      4
	jb	L(end)			C				      2
	ALIGN(16)
C Main loop, four result limbs per pass.  On entry r10 and r11 hold the
C left-shifted parts of up[n+3] and up[n+2], for the current value of n.
L(top):
	C finish stuff from lsh block
	neg	R32(%rcx)		C put rsh count in cl
	mov	16(up,n,8), %r8
	mov	8(up,n,8), %r9
	shr	R8(%rcx), %r8
	or	%r8, %r10
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, 24(rp,n,8)
	mov	%r11, 16(rp,n,8)
	C start two new rsh
	mov	0(up,n,8), %r8
	mov	-8(up,n,8), %r9
	shr	R8(%rcx), %r8
	shr	R8(%rcx), %r9

	C finish stuff from rsh block
	neg	R32(%rcx)		C put lsh count in cl
	mov	8(up,n,8), %r10
	mov	0(up,n,8), %r11
	shl	R8(%rcx), %r10
	or	%r10, %r8
	shl	R8(%rcx), %r11
	or	%r11, %r9
	mov	%r8, 8(rp,n,8)
	mov	%r9, 0(rp,n,8)
	C start two new lsh
	mov	-8(up,n,8), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r10
	shl	R8(%rcx), %r11

	sub	$4, n
	jae	L(top)			C				      2
C Three limbs remain.  r10 and r11 hold the left-shifted parts of the
C limbs at 16(up) and 8(up); complete and store them.
L(end):
	neg	R32(%rcx)		C put rsh count in cl
	mov	8(up), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	(up), %r9
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, 16(rp)
	mov	%r11, 8(rp)

	neg	R32(%rcx)		C put lsh count in cl
C Lowest limb: nothing below it, so zeros are shifted in.
L(ast):	mov	(up), %r10
	shl	R8(%rcx), %r10
	mov	%r10, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()