1dnl Intel P5 mpn_rshift -- mpn right shift. 2 3dnl Copyright 2000, 2002 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C P5: 1.75 cycles/limb. 24 25 26C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, 27C unsigned shift); 28C 29C Shift src,size right by shift many bits and store the result in dst,size. 30C Zeros are shifted in at the left. Return the bits shifted out at the 31C right. 32C 33C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb, 34C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l. 35C 36C Full speed depends on source and destination being aligned. Unaligned mmx 37C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy 38C setups and finish-ups are done to ensure alignment for the loop. 39C 40C MMX shifts work out a bit faster even for the simple loop. 41 42defframe(PARAM_SHIFT,16) 43defframe(PARAM_SIZE, 12) 44defframe(PARAM_SRC, 8) 45defframe(PARAM_DST, 4) 46deflit(`FRAME',0) 47 48dnl Minimum 5, because the unrolled loop can't handle less. 49deflit(UNROLL_THRESHOLD, 5) 50 51 TEXT 52 ALIGN(8) 53 54PROLOGUE(mpn_rshift) 55 56 pushl %ebx 57 pushl %edi 58deflit(`FRAME',8) 59 60 movl PARAM_SIZE, %eax 61 movl PARAM_DST, %edx 62 63 movl PARAM_SRC, %ebx 64 movl PARAM_SHIFT, %ecx 65 66 cmp $UNROLL_THRESHOLD, %eax 67 jae L(unroll) 68 69 decl %eax 70 movl (%ebx), %edi C src low limb 71 72 jnz L(simple) 73 74 shrdl( %cl, %edi, %eax) C eax was decremented to zero 75 76 shrl %cl, %edi 77 78 movl %edi, (%edx) C dst low limb 79 popl %edi C risk of data cache bank clash 80 81 popl %ebx 82 83 ret 84 85 86C ----------------------------------------------------------------------------- 87 ALIGN(8) 88L(simple): 89 C eax size-1 90 C ebx src 91 C ecx shift 92 C edx dst 93 C esi 94 C edi 95 C ebp 96deflit(`FRAME',8) 97 98 movd (%ebx), %mm5 C src[0] 99 leal (%ebx,%eax,4), %ebx C &src[size-1] 100 101 movd %ecx, %mm6 C rshift 102 leal -4(%edx,%eax,4), %edx C &dst[size-2] 103 104 psllq $32, %mm5 105 negl %eax 106 107 108C This loop is 5 or 8 cycles, with every second load unaligned and a wasted 109C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4 110C cycles and would be 8 in a simple loop. Using mmx helps the return value 111C and last limb calculations too. 112 113L(simple_top): 114 C eax counter, limbs, negative 115 C ebx &src[size-1] 116 C ecx return value 117 C edx &dst[size-2] 118 C 119 C mm0 scratch 120 C mm5 return value 121 C mm6 shift 122 123 movq (%ebx,%eax,4), %mm0 124 incl %eax 125 126 psrlq %mm6, %mm0 127 128 movd %mm0, (%edx,%eax,4) 129 jnz L(simple_top) 130 131 132 movd (%ebx), %mm0 133 psrlq %mm6, %mm5 C return value 134 135 psrlq %mm6, %mm0 136 popl %edi 137 138 movd %mm5, %eax 139 popl %ebx 140 141 movd %mm0, 4(%edx) 142 143 emms 144 145 ret 146 147 148C ----------------------------------------------------------------------------- 149 ALIGN(8) 150L(unroll): 151 C eax size 152 C ebx src 153 C ecx shift 154 C edx dst 155 C esi 156 C edi 157 C ebp 158deflit(`FRAME',8) 159 160 movd (%ebx), %mm5 C src[0] 161 movl $4, %edi 162 163 movd %ecx, %mm6 C rshift 164 testl %edi, %ebx 165 166 psllq $32, %mm5 167 jz L(start_src_aligned) 168 169 170 C src isn't aligned, process low limb separately (marked xxx) and 171 C step src and dst by one limb, making src aligned. 172 C 173 C source ebx 174 C --+-------+-------+-------+ 175 C | xxx | 176 C --+-------+-------+-------+ 177 C 4mod8 0mod8 4mod8 178 C 179 C dest edx 180 C --+-------+-------+ 181 C | | xxx | 182 C --+-------+-------+ 183 184 movq (%ebx), %mm0 C unaligned load 185 186 psrlq %mm6, %mm0 187 addl $4, %ebx 188 189 decl %eax 190 191 movd %mm0, (%edx) 192 addl $4, %edx 193L(start_src_aligned): 194 195 196 movq (%ebx), %mm1 197 testl %edi, %edx 198 199 psrlq %mm6, %mm5 C retval 200 jz L(start_dst_aligned) 201 202 C dst isn't aligned, add 4 to make it so, and pretend the shift is 203 C 32 bits extra. Low limb of dst (marked xxx) handled here 204 C separately. 205 C 206 C source ebx 207 C --+-------+-------+ 208 C | mm1 | 209 C --+-------+-------+ 210 C 4mod8 0mod8 211 C 212 C dest edx 213 C --+-------+-------+-------+ 214 C | xxx | 215 C --+-------+-------+-------+ 216 C 4mod8 0mod8 4mod8 217 218 movq %mm1, %mm0 219 addl $32, %ecx C new shift 220 221 psrlq %mm6, %mm0 222 223 movd %ecx, %mm6 224 225 movd %mm0, (%edx) 226 addl $4, %edx 227L(start_dst_aligned): 228 229 230 movq 8(%ebx), %mm3 231 negl %ecx 232 233 movq %mm3, %mm2 C mm2 src qword 234 addl $64, %ecx 235 236 movd %ecx, %mm7 237 psrlq %mm6, %mm1 238 239 leal -12(%ebx,%eax,4), %ebx 240 leal -20(%edx,%eax,4), %edx 241 242 psllq %mm7, %mm3 243 subl $7, %eax C size-7 244 245 por %mm1, %mm3 C mm3 ready to store 246 negl %eax C -(size-7) 247 248 jns L(finish) 249 250 251 C This loop is the important bit, the rest is just support. Careful 252 C instruction scheduling achieves the claimed 1.75 c/l. The 253 C relevant parts of the pairing rules are: 254 C 255 C - mmx loads and stores execute only in the U pipe 256 C - only one mmx shift in a pair 257 C - wait one cycle before storing an mmx register result 258 C - the usual address generation interlock 259 C 260 C Two qword calculations are slightly interleaved. The instructions 261 C marked "C" belong to the second qword, and the "C prev" one is for 262 C the second qword from the previous iteration. 263 264 ALIGN(8) 265L(unroll_loop): 266 C eax counter, limbs, negative 267 C ebx &src[size-12] 268 C ecx 269 C edx &dst[size-12] 270 C esi 271 C edi 272 C 273 C mm0 274 C mm1 275 C mm2 src qword from -8(%ebx,%eax,4) 276 C mm3 dst qword ready to store to -8(%edx,%eax,4) 277 C 278 C mm5 return value 279 C mm6 rshift 280 C mm7 lshift 281 282 movq (%ebx,%eax,4), %mm0 283 psrlq %mm6, %mm2 284 285 movq %mm0, %mm1 286 psllq %mm7, %mm0 287 288 movq %mm3, -8(%edx,%eax,4) C prev 289 por %mm2, %mm0 290 291 movq 8(%ebx,%eax,4), %mm3 C 292 psrlq %mm6, %mm1 C 293 294 movq %mm0, (%edx,%eax,4) 295 movq %mm3, %mm2 C 296 297 psllq %mm7, %mm3 C 298 addl $4, %eax 299 300 por %mm1, %mm3 C 301 js L(unroll_loop) 302 303 304L(finish): 305 C eax 0 to 3 representing respectively 3 to 0 limbs remaining 306 307 testb $2, %al 308 309 jnz L(finish_no_two) 310 311 movq (%ebx,%eax,4), %mm0 312 psrlq %mm6, %mm2 313 314 movq %mm0, %mm1 315 psllq %mm7, %mm0 316 317 movq %mm3, -8(%edx,%eax,4) C prev 318 por %mm2, %mm0 319 320 movq %mm1, %mm2 321 movq %mm0, %mm3 322 323 addl $2, %eax 324L(finish_no_two): 325 326 327 C eax 2 or 3 representing respectively 1 or 0 limbs remaining 328 C 329 C mm2 src prev qword, from -8(%ebx,%eax,4) 330 C mm3 dst qword, for -8(%edx,%eax,4) 331 332 testb $1, %al 333 popl %edi 334 335 movd %mm5, %eax C retval 336 jnz L(finish_zero) 337 338 339 C One extra limb, destination was aligned. 340 C 341 C source ebx 342 C +-------+---------------+-- 343 C | | mm2 | 344 C +-------+---------------+-- 345 C 346 C dest edx 347 C +-------+---------------+---------------+-- 348 C | | | mm3 | 349 C +-------+---------------+---------------+-- 350 C 351 C mm6 = shift 352 C mm7 = ecx = 64-shift 353 354 355 C One extra limb, destination was unaligned. 356 C 357 C source ebx 358 C +-------+---------------+-- 359 C | | mm2 | 360 C +-------+---------------+-- 361 C 362 C dest edx 363 C +---------------+---------------+-- 364 C | | mm3 | 365 C +---------------+---------------+-- 366 C 367 C mm6 = shift+32 368 C mm7 = ecx = 64-(shift+32) 369 370 371 C In both cases there's one extra limb of src to fetch and combine 372 C with mm2 to make a qword at 8(%edx), and in the aligned case 373 C there's a further extra limb of dst to be formed. 374 375 376 movd 8(%ebx), %mm0 377 psrlq %mm6, %mm2 378 379 movq %mm0, %mm1 380 psllq %mm7, %mm0 381 382 movq %mm3, (%edx) 383 por %mm2, %mm0 384 385 psrlq %mm6, %mm1 386 andl $32, %ecx 387 388 popl %ebx 389 jz L(finish_one_unaligned) 390 391 C dst was aligned, must store one extra limb 392 movd %mm1, 16(%edx) 393L(finish_one_unaligned): 394 395 movq %mm0, 8(%edx) 396 397 emms 398 399 ret 400 401 402L(finish_zero): 403 404 C No extra limbs, destination was aligned. 405 C 406 C source ebx 407 C +---------------+-- 408 C | mm2 | 409 C +---------------+-- 410 C 411 C dest edx+4 412 C +---------------+---------------+-- 413 C | | mm3 | 414 C +---------------+---------------+-- 415 C 416 C mm6 = shift 417 C mm7 = ecx = 64-shift 418 419 420 C No extra limbs, destination was unaligned. 421 C 422 C source ebx 423 C +---------------+-- 424 C | mm2 | 425 C +---------------+-- 426 C 427 C dest edx+4 428 C +-------+---------------+-- 429 C | | mm3 | 430 C +-------+---------------+-- 431 C 432 C mm6 = shift+32 433 C mm7 = 64-(shift+32) 434 435 436 C The movd for the unaligned case is clearly the same data as the 437 C movq for the aligned case, it's just a choice between whether one 438 C or two limbs should be written. 439 440 441 movq %mm3, 4(%edx) 442 psrlq %mm6, %mm2 443 444 movd %mm2, 12(%edx) 445 andl $32, %ecx 446 447 popl %ebx 448 jz L(finish_zero_unaligned) 449 450 movq %mm2, 12(%edx) 451L(finish_zero_unaligned): 452 453 emms 454 455 ret 456 457EPILOGUE() 458