dnl  Intel P5 mpn_lshift -- mpn left shift.

dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 1.75 cycles/limb.


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  Return the bits shifted out at the
C left.
C
C The comments in mpn_rshift apply here too.

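C The following is a rough C-level sketch of the operation, for reference
C only (not part of the build).  It assumes 32-bit limbs, as on this CPU,
C and 1 <= shift <= 31 as mpn_lshift requires; the name lshift_ref is
C hypothetical.
C
C     mp_limb_t
C     lshift_ref (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
C     {
C       unsigned   tnc = 32 - shift;
C       mp_limb_t  low, high, retval;
C       mp_size_t  i;
C
C       low = src[size-1];
C       retval = low >> tnc;             /* bits shifted out at the left */
C       high = low << shift;
C       for (i = size-2; i >= 0; i--)
C         {
C           low = src[i];
C           dst[i+1] = high | (low >> tnc);
C           high = low << shift;
C         }
C       dst[0] = high;                   /* zeros shifted in at the right */
C       return retval;
C     }
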
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,   8)
defframe(PARAM_DST,   4)
deflit(`FRAME',0)

dnl  minimum 5, because the unrolled loop can't handle less
deflit(UNROLL_THRESHOLD, 5)

        TEXT
        ALIGN(8)

PROLOGUE(mpn_lshift)

        pushl   %ebx
        pushl   %edi
deflit(`FRAME',8)

        movl    PARAM_SIZE, %eax
        movl    PARAM_DST, %edx

        movl    PARAM_SRC, %ebx
        movl    PARAM_SHIFT, %ecx

        cmp     $UNROLL_THRESHOLD, %eax
        jae     L(unroll)

        movl    -4(%ebx,%eax,4), %edi   C src high limb
        decl    %eax

        jnz     L(simple)

        shldl(  %cl, %edi, %eax)        C eax was decremented to zero

        shll    %cl, %edi

        movl    %edi, (%edx)            C dst low limb
        popl    %edi                    C risk of data cache bank clash

        popl    %ebx

        ret


C -----------------------------------------------------------------------------
L(simple):
        C eax   size-1
        C ebx   src
        C ecx   shift
        C edx   dst
        C esi
        C edi
        C ebp
deflit(`FRAME',8)

        movd    (%ebx,%eax,4), %mm5     C src high limb

        movd    %ecx, %mm6              C lshift
        negl    %ecx

        psllq   %mm6, %mm5
        addl    $32, %ecx

        movd    %ecx, %mm7
        psrlq   $32, %mm5               C retval


L(simple_top):
        C eax   counter, limbs, negative
        C ebx   src
        C ecx
        C edx   dst
        C esi
        C edi
        C
        C mm0   scratch
        C mm5   return value
        C mm6   shift
        C mm7   32-shift

        movq    -4(%ebx,%eax,4), %mm0
        decl    %eax

        psrlq   %mm7, %mm0

        C

        movd    %mm0, 4(%edx,%eax,4)
        jnz     L(simple_top)


        movd    (%ebx), %mm0

        movd    %mm5, %eax
        psllq   %mm6, %mm0

        popl    %edi
        popl    %ebx

        movd    %mm0, (%edx)

        emms

        ret


C -----------------------------------------------------------------------------
        ALIGN(8)
L(unroll):
        C eax   size
        C ebx   src
        C ecx   shift
        C edx   dst
        C esi
        C edi
        C ebp
deflit(`FRAME',8)

        movd    -4(%ebx,%eax,4), %mm5   C src high limb
        leal    (%ebx,%eax,4), %edi

        movd    %ecx, %mm6              C lshift
        andl    $4, %edi

        psllq   %mm6, %mm5
        jz      L(start_src_aligned)


        C src isn't aligned, process high limb separately (marked xxx) to
        C make it so.
        C
        C  source     -8(ebx,%eax,4)
        C                  |
        C  +-------+-------+-------+--
        C  |               |
        C  +-------+-------+-------+--
        C        0mod8   4mod8   0mod8
        C
        C  dest
        C     -4(edx,%eax,4)
        C          |
        C  +-------+-------+--
        C  | xxx   |       |
        C  +-------+-------+--

        movq    -8(%ebx,%eax,4), %mm0   C unaligned load

        psllq   %mm6, %mm0
        decl    %eax

        psrlq   $32, %mm0

        C

        movd    %mm0, (%edx,%eax,4)
L(start_src_aligned):

        movq    -8(%ebx,%eax,4), %mm1   C src high qword
        leal    (%edx,%eax,4), %edi

        andl    $4, %edi
        psrlq   $32, %mm5               C return value

        movq    -16(%ebx,%eax,4), %mm3  C src second highest qword
        jz      L(start_dst_aligned)

        C dst isn't aligned, subtract 4 to make it so, and pretend the shift
        C is 32 bits extra.  High limb of dst (marked xxx) handled here
        C separately.
        C
        C  source     -8(ebx,%eax,4)
        C                  |
        C  +-------+-------+--
        C  |      mm1      |
        C  +-------+-------+--
        C        0mod8   4mod8
        C
        C  dest
        C     -4(edx,%eax,4)
        C          |
        C  +-------+-------+-------+--
        C  | xxx   |       |
        C  +-------+-------+-------+--
        C        0mod8   4mod8   0mod8

        movq    %mm1, %mm0
        addl    $32, %ecx               C new shift

        psllq   %mm6, %mm0

        movd    %ecx, %mm6
        psrlq   $32, %mm0

        C wasted cycle here waiting for %mm0

        movd    %mm0, -4(%edx,%eax,4)
        subl    $4, %edx
L(start_dst_aligned):


        psllq   %mm6, %mm1
        negl    %ecx                    C -shift

        addl    $64, %ecx               C 64-shift
        movq    %mm3, %mm2

        movd    %ecx, %mm7
        subl    $8, %eax                C size-8

        psrlq   %mm7, %mm3

        por     %mm1, %mm3              C mm3 ready to store
        jc      L(finish)


        C The comments in mpn_rshift apply here too.

        ALIGN(8)
L(unroll_loop):
        C eax   counter, limbs
        C ebx   src
        C ecx
        C edx   dst
        C esi
        C edi
        C
        C mm0
        C mm1
        C mm2   src qword from 16(%ebx,%eax,4)
        C mm3   dst qword ready to store to 24(%edx,%eax,4)
        C
        C mm5   return value
        C mm6   lshift
        C mm7   rshift

        movq    8(%ebx,%eax,4), %mm0
        psllq   %mm6, %mm2

        movq    %mm0, %mm1
        psrlq   %mm7, %mm0

        movq    %mm3, 24(%edx,%eax,4)   C prev
        por     %mm2, %mm0

        movq    (%ebx,%eax,4), %mm3     C
        psllq   %mm6, %mm1              C

        movq    %mm0, 16(%edx,%eax,4)
        movq    %mm3, %mm2              C

        psrlq   %mm7, %mm3              C
        subl    $4, %eax

        por     %mm1, %mm3              C
        jnc     L(unroll_loop)

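
        C In C terms, each pass of the above loop produces two aligned
        C 64-bit destination qwords from three source qwords (a sketch;
        C src64/dst64 are hypothetical qword views of src and dst, and
        C shift here is the possibly +32 adjusted count held in mm6):
        C
        C     dst64[j] = (src64[j] << shift) | (src64[j-1] >> (64-shift));
        C
        C mm3 always holds the qword computed on the previous pass, so its
        C store overlaps the current computation.
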

L(finish):
        C eax   -4 to -1 representing respectively 0 to 3 limbs remaining

        testb   $2, %al

        jz      L(finish_no_two)

        movq    8(%ebx,%eax,4), %mm0
        psllq   %mm6, %mm2

        movq    %mm0, %mm1
        psrlq   %mm7, %mm0

        movq    %mm3, 24(%edx,%eax,4)   C prev
        por     %mm2, %mm0

        movq    %mm1, %mm2
        movq    %mm0, %mm3

        subl    $2, %eax
L(finish_no_two):


        C eax   -4 or -3 representing respectively 0 or 1 limbs remaining
        C
        C mm2   src prev qword, from 16(%ebx,%eax,4)
        C mm3   dst qword, for 24(%edx,%eax,4)

        testb   $1, %al
        movd    %mm5, %eax              C retval

        popl    %edi
        jz      L(finish_zero)


        C One extra src limb, destination was aligned.
        C
        C                 source                  ebx
        C                 --+---------------+-------+
        C                   |      mm2      |       |
        C                 --+---------------+-------+
        C
        C dest         edx+12           edx+4     edx
        C --+---------------+---------------+-------+
        C   |      mm3      |               |       |
        C --+---------------+---------------+-------+
        C
        C mm6 = shift
        C mm7 = ecx = 64-shift


        C One extra src limb, destination was unaligned.
        C
        C                 source                  ebx
        C                 --+---------------+-------+
        C                   |      mm2      |       |
        C                 --+---------------+-------+
        C
        C dest              edx+12          edx+4
        C         --+---------------+---------------+
        C           |      mm3      |               |
        C         --+---------------+---------------+
        C
        C mm6 = shift+32
        C mm7 = ecx = 64-(shift+32)


        C In both cases there's one extra limb of src to fetch and combine
        C with mm2 to make a qword at 4(%edx), and in the aligned case
        C there's an extra limb of dst to be formed from that extra src limb
        C left shifted.


        movd    (%ebx), %mm0
        psllq   %mm6, %mm2

        movq    %mm3, 12(%edx)
        psllq   $32, %mm0

        movq    %mm0, %mm1
        psrlq   %mm7, %mm0

        por     %mm2, %mm0
        psllq   %mm6, %mm1

        movq    %mm0, 4(%edx)
        psrlq   $32, %mm1

        andl    $32, %ecx
        popl    %ebx

        jz      L(finish_one_unaligned)

        movd    %mm1, (%edx)
L(finish_one_unaligned):

        emms

        ret


L(finish_zero):

        C No extra src limbs, destination was aligned.
        C
        C                 source          ebx
        C                 --+---------------+
        C                   |      mm2      |
        C                 --+---------------+
        C
        C dest          edx+8             edx
        C --+---------------+---------------+
        C   |      mm3      |               |
        C --+---------------+---------------+
        C
        C mm6 = shift
        C mm7 = ecx = 64-shift


        C No extra src limbs, destination was unaligned.
        C
        C                 source          ebx
        C                 --+---------------+
        C                   |      mm2      |
        C                 --+---------------+
        C
        C dest          edx+8           edx+4
        C         --+---------------+-------+
        C           |      mm3      |       |
        C         --+---------------+-------+
        C
        C mm6 = shift+32
        C mm7 = ecx = 64-(shift+32)


        C The movd for the unaligned case writes the same data to 4(%edx)
        C that the movq does for the aligned case.


        movq    %mm3, 8(%edx)
        andl    $32, %ecx

        psllq   %mm6, %mm2
        jz      L(finish_zero_unaligned)

        movq    %mm2, (%edx)
L(finish_zero_unaligned):

        psrlq   $32, %mm2
        popl    %ebx

        movd    %mm5, %eax              C retval

        movd    %mm2, 4(%edx)

        emms

        ret

EPILOGUE()
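
C A hypothetical caller-side example, for illustration only: shifting a
C 4-limb operand left by 10 bits.  The return value holds the 10 bits
C shifted out of the top limb, in the low bits of the returned limb.
C
C     mp_limb_t  src[4], dst[4], out;
C     out = mpn_lshift (dst, src, 4, 10);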