dnl  AMD64 mpn_lshift -- mpn left shift.

dnl  Copyright 2003, 2005, 2007, 2009 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb   cycles/limb cnt=1
C K8,K9:	 2.375		 1.375
C K10:		 2.375		 1.375
C P4:		 8		10.5
C P6-15 (Core2): 2.11		 4.28
C P6-28 (Atom):	 5.75		 3.5


C INPUT PARAMETERS
C   rp   destination limb pointer
C   up   source limb pointer
C   n    number of 64-bit limbs; up[0] and up[n-1] are read unconditionally,
C        so n >= 1 is presumably required -- TODO confirm against callers
C   cnt  shift count in bits, only the low byte (cl) is used; the general
C        path derives the complementary right-shift count by negating cl,
C        which is only meaningful for 1 <= cnt <= 63 -- TODO confirm
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`n',	`%rdx')
define(`cnt',	`%rcx')

C mpn_lshift
C
C Shift the n-limb operand at up left by cnt bits and store the n-limb
C result at rp.  The bits shifted out of the most significant limb, that is
C up[n-1] >> (64 - cnt), are returned in rax.
C
C Two code paths:
C   cnt = 1 and no harmful overlap: an adc carry chain running from the
C     lowest limb to the highest, four limbs per loop iteration.
C   otherwise (label gen): shl/shr/or pairs running from the highest limb
C     down to the lowest, four limbs per loop iteration.
C
C Registers written: rax, rcx, rdx, r8, r9, r10, r11 and the flags; the
C cnt = 1 loop also advances rsi and rdi.  No stack is used.
C NOTE(review): the parameter registers match the System V AMD64 integer
C argument order; no entry shim for another ABI is visible in this file.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_lshift)
	cmp	$1, R8(%rcx)
	jne	L(gen)			C cnt != 1, take the general path

C For cnt=1 we want to work from lowest limb towards higher limbs.
C Check for bad overlap (up=rp is OK!):  rp = up+1 .. up+n-1 (in limbs) is
C bad, because low-to-high stores would overwrite source limbs that have not
C been read yet.  In that case fall back to the general path, which works
C from the highest limb downwards.  When rp is below up the unsigned
C difference is huge, so the forward path is taken.
C FIXME: this could surely be done more cleverly.

	mov	rp, %rax
	sub	up, %rax
	je	L(fwd)			C rp = up
	shr	$3, %rax		C byte distance -> limb distance
	cmp	n, %rax
	jb	L(gen)			C 0 < rp - up < n limbs: bad overlap

L(fwd):	mov	R32(n), R32(%rax)	C keep n for the tail dispatch
	shr	$2, n			C n = number of 4-limb groups
	je	L(e1)			C fewer than 4 limbs, tail only
	and	$3, R32(%rax)		C eax = leftover limbs; also clears CF

C NOTE(review): the two nops after ALIGN(8) presumably place L(t1) at a
C tuned offset; the intent is not stated in this file.
	ALIGN(8)
	nop
	nop
C Main cnt=1 loop.  adc r,r computes 2*r + CF, i.e. a one-bit left shift
C with the bit shifted out of the previous limb carried in through CF.
C lea and dec do not modify CF, so the carry chain survives the pointer
C updates and the loop counter.
L(t1):	mov	(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	adc	%r8, %r8
	mov	%r8, (rp)
	adc	%r9, %r9
	mov	%r9, 8(rp)
	adc	%r10, %r10
	mov	%r10, 16(rp)
	adc	%r11, %r11
	mov	%r11, 24(rp)
	lea	32(rp), rp
	dec	n
	jne	L(t1)

C inc followed by dec leaves eax unchanged and sets ZF from it without
C touching CF, so the pending carry is still intact for the tail.
	inc	R32(%rax)
	dec	R32(%rax)
	jne	L(n00)			C 1, 2 or 3 limbs remain
	adc	R32(%rax), R32(%rax)	C eax is 0 here: return value = CF
	ret
L(e1):	test	R32(%rax), R32(%rax)	C clear cy
C Tail: eax holds the number of remaining limbs (1, 2 or 3 for n >= 1) and
C is counted down with dec, which again leaves CF alone.
L(n00):	mov	(up), %r8
	dec	R32(%rax)
	jne	L(n01)
C one limb remains
	adc	%r8, %r8
	mov	%r8, (rp)
L(ret):	adc	R32(%rax), R32(%rax)	C eax is 0 here: return value = CF
	ret
L(n01):	dec	R32(%rax)
	mov	8(up), %r9		C mov does not change the flags set by dec
	jne	L(n10)
C two limbs remain
	adc	%r8, %r8
	adc	%r9, %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	adc	R32(%rax), R32(%rax)	C eax is 0 here: return value = CF
	ret
C three limbs remain
L(n10):	mov	16(up), %r10
	adc	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	adc	$-1, R32(%rax)		C eax is 1 here: 1 + (-1) + CF = CF
	ret

C General path.  cl is toggled between the left shift count (cnt) and the
C right shift count (-cnt, which the hardware reduces mod 64 for 64-bit
C shifts) with neg.  Each result limb is
C   rp[i] = (up[i] << cnt) | (up[i-1] >> (64 - cnt)),   rp[0] = up[0] << cnt
C and limbs are produced from the highest index downwards.
L(gen):	neg	R32(%rcx)		C put rsh count in cl
	mov	-8(up,n,8), %rax	C rax = up[n-1]
	shr	R8(%rcx), %rax		C function return value

	neg	R32(%rcx)		C put lsh count in cl
C Dispatch on (n+1) mod 4 so that the unrolled loop is entered with
C n = 3 mod 4; the cases below first peel one or two top limbs.
	lea	1(n), R32(%r8)
	and	$3, R32(%r8)
	je	L(rlx)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...
C peel the top limb, leaving n = 3 mod 4
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	%r10, -8(rp,n,8)
	dec	n
	jmp	L(rll)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
C peel the top limb, leaving n = 1 mod 4, then fall into L(1x)
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	%r10, -8(rp,n,8)
	dec	n
	neg	R32(%rcx)		C put lsh count in cl
C n = 1 mod 4 here.  If only the lowest limb is left, finish at L(ast);
C otherwise peel the two top limbs, leaving n = 3 mod 4.
L(1x):
	cmp	$1, n
	je	L(ast)
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r11
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	mov	-24(up,n,8), %r9
	shr	R8(%rcx), %r8
	or	%r8, %r10
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, -8(rp,n,8)
	mov	%r11, -16(rp,n,8)
	sub	$2, n

L(rll):	neg	R32(%rcx)		C put lsh count in cl
C Prime the pipeline: start the left shifts for the two top remaining limbs.
L(rlx):	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r11

C NOTE(review): the trailing 4 and 2 below look like instruction byte counts
C kept for alignment bookkeeping -- confirm.
	sub	$4, n			C				      4
	jb	L(end)			C				      2
	ALIGN(16)
C Loop invariant at L(top): r10 = up[n+3] << cnt, r11 = up[n+2] << cnt, and
C n = 3 mod 4 with n >= 3.  Each iteration stores rp[n+3] .. rp[n] and
C starts the left shifts needed for the next two limbs.
L(top):
	C finish stuff from lsh block
	neg	R32(%rcx)		C put rsh count in cl
	mov	16(up,n,8), %r8
	mov	8(up,n,8), %r9
	shr	R8(%rcx), %r8
	or	%r8, %r10
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, 24(rp,n,8)
	mov	%r11, 16(rp,n,8)
	C start two new rsh
	mov	0(up,n,8), %r8
	mov	-8(up,n,8), %r9
	shr	R8(%rcx), %r8
	shr	R8(%rcx), %r9

	C finish stuff from rsh block
	neg	R32(%rcx)		C put lsh count in cl
	mov	8(up,n,8), %r10
	mov	0(up,n,8), %r11
	shl	R8(%rcx), %r10
	or	%r10, %r8
	shl	R8(%rcx), %r11
	or	%r11, %r9
	mov	%r8, 8(rp,n,8)
	mov	%r9, 0(rp,n,8)
	C start two new lsh
	mov	-8(up,n,8), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r10
	shl	R8(%rcx), %r11

	sub	$4, n
	jae	L(top)			C				      2
C Wind down: n is -1 here, r10 = up[2] << cnt and r11 = up[1] << cnt.
C Complete rp[2] and rp[1], then produce rp[0].
L(end):
	neg	R32(%rcx)		C put rsh count in cl
	mov	8(up), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	(up), %r9
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, 16(rp)
	mov	%r11, 8(rp)

	neg	R32(%rcx)		C put lsh count in cl
C The lowest limb has no lower neighbour: rp[0] = up[0] << cnt.
L(ast):	mov	(up), %r10
	shl	R8(%rcx), %r10
	mov	%r10, (rp)
	ret
EPILOGUE()