dnl  AMD64 mpn_lshift -- mpn left shift.

dnl  Copyright 2003, 2005, 2007, 2009, 2011, 2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb   cycles/limb cnt=1
C AMD K8,K9	 2.375		 1.375
C AMD K10	 2.375		 1.375
C Intel P4	 8		10.5
C Intel core2	 2.11		 4.28
C Intel corei	 ?		 ?
C Intel atom	 5.75		 3.5
C VIA nano	 3.5		 2.25


C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`n',	`%rdx')
define(`cnt',	`%rcx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C ----------------------------------------------------------------------------
C mpn_lshift (rp, up, n, cnt)
C
C Shift the n-limb operand at up left by cnt bits, store the n result limbs at
C rp, and return in rax the bits shifted out of the most significant limb.
C
C In:    rp  = rdi  destination limb pointer
C        up  = rsi  source limb pointer
C        n   = rdx  limb count (the code reads up[n-1] unconditionally, so it
C                   relies on n >= 1)
C        cnt = rcx  shift count; only the low byte is used.  Nothing here checks
C                   its range (the GMP manual requires 1 <= cnt < limb bits).
C Out:   rax = bits shifted out at the top
C Uses:  rax, rcx, rdx, rsi, rdi, r8-r11 and the flags; no stack is touched by
C        the code visible here.
C
C Two code paths:
C   cnt = 1  Each limb is doubled with adc, running from the least significant
C            limb upwards and carrying the top bit in CF.  Only taken when that
C            direction cannot overwrite source limbs that are still unread.
C   L(gen)   General count.  Runs from the most significant limb downwards and
C            forms each result limb as (up[i] << cnt) | (up[i-1] >> (64-cnt)).
C            cl is flipped between cnt and -cnt with neg; a 64-bit shift uses
C            only the low 6 bits of cl, so -cnt acts as 64-cnt.
C
C NOTE(review): FUNC_ENTRY/FUNC_EXIT are defined outside this file; presumably
C they handle the DOS64 argument shuffle -- confirm in the x86_64 m4 defs.
C ----------------------------------------------------------------------------

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	cmp	$1, R8(%rcx)
	jne	L(gen)

C For cnt=1 we want to work from lowest limb towards higher limbs.
C Check for bad overlap (up=rp is OK!) rp=up+1..up+n-1 is bad: storing rp[i]
C would then overwrite a source limb that has not been read yet.
C FIXME: this could surely be done more cleverly.

	mov    rp, %rax
	sub    up, %rax			C byte distance rp - up
	je     L(fwd)			C rp = up
	shr    $3, %rax			C distance in limbs, as an unsigned value
	cmp    n, %rax
	jb     L(gen)			C 0 < rp - up < n: must go high to low

L(fwd):	mov    R32(n), R32(%rax)
	shr    $2, n			C n = number of 4-limb groups
	je     L(e1)			C fewer than 4 limbs, eax = 1..3
	and    $3, R32(%rax)		C eax = leftover limbs; also clears CF

	ALIGN(8)
	nop
	nop
C Main cnt=1 loop, 4 limbs per iteration.  CF carries the bit shifted out of
C one limb into the next; lea, mov and dec leave CF alone, so it also survives
C from one iteration to the next.
L(t1):	mov    (up), %r8
	mov    8(up), %r9
	mov    16(up), %r10
	mov    24(up), %r11
	lea    32(up), up
	adc    %r8, %r8
	mov    %r8, (rp)
	adc    %r9, %r9
	mov    %r9, 8(rp)
	adc    %r10, %r10
	mov    %r10, 16(rp)
	adc    %r11, %r11
	mov    %r11, 24(rp)
	lea    32(rp), rp
	dec    n
	jne    L(t1)

	inc    R32(%rax)		C inc/dec pair: set ZF from eax, keep CF
	dec    R32(%rax)
	jne    L(n00)			C 1..3 limbs left over
	adc    R32(%rax), R32(%rax)	C eax = 0 here, so result is CF
	FUNC_EXIT()
	ret
L(e1):	test   R32(%rax), R32(%rax)	C clear cy
C Tail for the cnt=1 path: eax = 1, 2 or 3 remaining limbs, CF = carry in.
L(n00):	mov    (up), %r8
	dec    R32(%rax)
	jne    L(n01)
	adc    %r8, %r8
	mov    %r8, (rp)
L(ret):	adc    R32(%rax), R32(%rax)	C eax = 0 here, so result is CF
	FUNC_EXIT()
	ret
L(n01):	dec    R32(%rax)
	mov    8(up), %r9
	jne    L(n10)
	adc    %r8, %r8
	adc    %r9, %r9
	mov    %r8, (rp)
	mov    %r9, 8(rp)
	adc    R32(%rax), R32(%rax)	C eax = 0 here, so result is CF
	FUNC_EXIT()
	ret
L(n10):	mov    16(up), %r10
	adc    %r8, %r8
	adc    %r9, %r9
	adc    %r10, %r10
	mov    %r8, (rp)
	mov    %r9, 8(rp)
	mov    %r10, 16(rp)
	adc    $-1, R32(%rax)		C eax = 1 here: 1 + (-1) + CF = CF
	FUNC_EXIT()
	ret

C General shift count.  The code below first peels 0, 1, 2 or 3 limbs off the
C top so that the remaining count is 3 mod 4, which is what L(rlx) expects.
L(gen):	neg	R32(%rcx)		C put rsh count in cl
	mov	-8(up,n,8), %rax
	shr	R8(%rcx), %rax		C function return value

	neg	R32(%rcx)		C put lsh count in cl
	lea	1(n), R32(%r8)
	and	$3, R32(%r8)		C r8d = (n + 1) mod 4
	je	L(rlx)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	%r10, -8(rp,n,8)
	dec	n
	jmp	L(rll)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	%r10, -8(rp,n,8)
	dec	n
	neg	R32(%rcx)		C put lsh count in cl
L(1x):
	cmp	$1, n
	je	L(ast)			C only the lowest limb is left
	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r11
	neg	R32(%rcx)		C put rsh count in cl
	mov	-16(up,n,8), %r8
	mov	-24(up,n,8), %r9
	shr	R8(%rcx), %r8
	or	%r8, %r10
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, -8(rp,n,8)
	mov	%r11, -16(rp,n,8)
	sub	$2, n

L(rll):	neg	R32(%rcx)		C put lsh count in cl
C Here n is 3 mod 4.  Start the high halves of the two topmost result limbs.
L(rlx):	mov	-8(up,n,8), %r10
	shl	R8(%rcx), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r11

	sub	$4, n			C				      4
	jb	L(end)			C				      2
	ALIGN(16)
C Main loop, 4 limbs per iteration, software pipelined: the shl halves in
C r10/r11 come from the previous iteration (or from L(rlx)).
L(top):
	C finish stuff from lsh block
	neg	R32(%rcx)		C put rsh count in cl
	mov	16(up,n,8), %r8
	mov	8(up,n,8), %r9
	shr	R8(%rcx), %r8
	or	%r8, %r10
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, 24(rp,n,8)
	mov	%r11, 16(rp,n,8)
	C start two new rsh
	mov	0(up,n,8), %r8
	mov	-8(up,n,8), %r9
	shr	R8(%rcx), %r8
	shr	R8(%rcx), %r9

	C finish stuff from rsh block
	neg	R32(%rcx)		C put lsh count in cl
	mov	8(up,n,8), %r10
	mov	0(up,n,8), %r11
	shl	R8(%rcx), %r10
	or	%r10, %r8
	shl	R8(%rcx), %r11
	or	%r11, %r9
	mov	%r8, 8(rp,n,8)
	mov	%r9, 0(rp,n,8)
	C start two new lsh
	mov	-8(up,n,8), %r10
	mov	-16(up,n,8), %r11
	shl	R8(%rcx), %r10
	shl	R8(%rcx), %r11

	sub	$4, n
	jae	L(top)			C				      2
C Wind down: complete result limbs 2 and 1 from the pending shl halves, then
C fall into L(ast) for limb 0, which has no lower neighbour to take bits from.
L(end):
	neg	R32(%rcx)		C put rsh count in cl
	mov	8(up), %r8
	shr	R8(%rcx), %r8
	or	%r8, %r10
	mov	(up), %r9
	shr	R8(%rcx), %r9
	or	%r9, %r11
	mov	%r10, 16(rp)
	mov	%r11, 8(rp)

	neg	R32(%rcx)		C put lsh count in cl
L(ast):	mov	(up), %r10
	shl	R8(%rcx), %r10
	mov	%r10, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()