dnl  AMD K6 mpn_rshift -- mpn right shift.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6: 3.0 cycles/limb


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift the size-limb number at src right by shift bits, store the size
C result limbs at dst, and return the bits shifted out of src[0],
C positioned at the top of the returned limb (i.e. src[0] << (32-shift)).
C
C Syntax is AT&T, 32-bit x86 with MMX.  The four arguments are read from
C the stack through the PARAM_* names declared below.  %ebx is saved and
C restored with push/pop; %eax, %ecx, %edx, %mm0, %mm6 and the flags are
C overwritten.
C
C Neither size nor shift is validated here.  The code presumably relies on
C the caller passing size >= 1 and 1 <= shift <= 31 -- TODO confirm against
C the mpn_rshift contract.
C
C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
C instructions.  This is despite every second fetch being unaligned.


C Stack offsets of the arguments at function entry.  defframe and FRAME are
C defined outside this file; presumably FRAME tracks bytes pushed so the
C PARAM_* names stay valid after the pushl below.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

	C The 1 limb case can be done without the push %ebx, but it's then
	C still the same speed.  The push is left as a free helping hand for
	C the two_or_more code.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C eax = size-1, sets ZF when size == 1

	movl	PARAM_SHIFT, %ecx	C movl leaves the flags from decl intact
	jnz	L(two_or_more)

	C Single limb: eax is 0 here, so the shrdl below leaves just the low
	C "shift" bits of the src limb in the top of eax.

	movl	(%ebx), %edx		C src limb
	movl	PARAM_DST, %ebx

	shrdl(	%cl, %edx, %eax)	C return value

	shrl	%cl, %edx

	movl	%edx, (%ebx)		C dst limb
	popl	%ebx

	ret


	ALIGN(16)	C avoid offset 0x1f
L(two_or_more):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx

	movl	(%ebx), %edx	C src low limb
	negl	%ecx

	addl	$32, %ecx	C 32-shift
	movd	PARAM_SHIFT, %mm6	C shift count for psrlq

	shll	%cl, %edx	C retval
	movl	PARAM_DST, %ecx

	leal	(%ebx,%eax,4), %ebx	C ebx = &src[size-1]

	leal	-4(%ecx,%eax,4), %ecx	C ecx = &dst[size-2]
	negl	%eax			C eax = -(size-1), counts up to 0


L(simple):
	C eax	counter (negative)
	C ebx	&src[size-1]
	C ecx	&dst[size-2]
	C edx	retval
	C
	C mm0	scratch
	C mm6	shift
	C
	C Each pass loads the 64-bit pair src[i],src[i+1], shifts it right as
	C one quantity, and stores the low 32 bits as dst[i].  The counter is
	C incremented between the load and the store, which is why the dst
	C base sits one limb lower than the src base.  Zdisp is defined
	C outside this file (presumably it forces an explicit zero
	C displacement in the encoding -- confirm in the x86 m4 definitions).

Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
	incl	%eax		C sets ZF on the last pass, MMX ops keep it

	psrlq	%mm6, %mm0

Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
	jnz	L(simple)


	C mm0 still holds the shifted top pair: its low half repeats the
	C dst[size-2] just written and its high half is src[size-1] >> shift,
	C so one 8-byte store finishes dst[size-2] and dst[size-1].

	movq	%mm0, (%ecx)
	movl	%edx, %eax	C return value

	popl	%ebx

	emms			C leave MMX state so x87 code can run afterwards
	ret

EPILOGUE()