dnl  Intel Pentium mpn_rshift -- mpn right shift.

dnl  Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
dnl  Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C         cycles/limb
C P5,P54:    6.0
C P55:       5.375


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shifts the size-limb number at src right by shift bits and stores the
C result at dst.  The value left in %eax at return is the carry limb, i.e.
C the bits shifted out of the least significant source limb, positioned at
C the top of a limb.
C
C Two code paths are used:
C   - L(normal): any shift count, walks from the least significant limb up,
C     eight limbs per unrolled iteration, using double-precision shifts.
C   - L(special): shift-by-1 only, walks from the most significant limb
C     down, eight limbs per unrolled iteration, using rotate-through-carry.
C
C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_rshift)

C Save the four registers used as working registers below; they are
C restored in reverse order before each ret.
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	pushl	%ebp
C Four 4-byte pushes so far.  NOTE(review): FRAME is presumably consumed by
C the defframe/PARAM_* macros from config.m4, which is not visible here.
deflit(`FRAME',16)

C Register roles from here on:
C   edi = destination pointer     esi = source pointer
C   ebp = limb count              ecx = shift count (cl is used by shifts)
C   eax, ebx, edx = limbs in flight
	movl	PARAM_DST,%edi
	movl	PARAM_SRC,%esi
	movl	PARAM_SIZE,%ebp
	movl	PARAM_SHIFT,%ecx

C We can use faster code for shift-by-1 under certain conditions.
C The pointer comparisons below are unsigned (jnc = no borrow).
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%edi),%eax
	cmpl	%esi,%eax
	jnc	L(special)		C jump if res_ptr + 1 >= s_ptr
	leal	(%edi,%ebp,4),%eax
	cmpl	%eax,%esi
	jnc	L(special)		C jump if s_ptr >= res_ptr + size

C General path: process limbs from the least significant end upwards.
L(normal):
	movl	(%esi),%edx		C edx = least significant source limb
	addl	$4,%esi			C source pointer now one limb ahead
	xorl	%eax,%eax
	shrdl(	%cl, %edx, %eax)	C compute carry limb
	pushl	%eax			C push carry limb onto stack

	decl	%ebp			C ebp = size-1 limbs still to load
	pushl	%ebp			C keep size-1 for the tail loop
	shrl	$3,%ebp			C ebp = number of 8-limb iterations
	jz	L(end)

	movl	(%edi),%eax		C fetch destination cache line

C Unrolled by 8.  On entry to each iteration edx holds the most recently
C loaded source limb, not yet shifted.  Each pair of shrdl( combines a limb
C with the next higher one; eax, ebx and edx rotate roles as the pending limb.
	ALIGN(4)
L(oop):	movl	28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	4(%esi),%edx
	shrdl(	%cl, %eax, %ebx)
	shrdl(	%cl, %edx, %eax)
	movl	%ebx,(%edi)
	movl	%eax,4(%edi)

	movl	8(%esi),%ebx
	movl	12(%esi),%eax
	shrdl(	%cl, %ebx, %edx)
	shrdl(	%cl, %eax, %ebx)
	movl	%edx,8(%edi)
	movl	%ebx,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebx
	shrdl(	%cl, %edx, %eax)
	shrdl(	%cl, %ebx, %edx)
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	shrdl(	%cl, %eax, %ebx)
	shrdl(	%cl, %edx, %eax)
	movl	%ebx,24(%edi)
	movl	%eax,28(%edi)

	addl	$32,%esi		C advance both pointers by 8 limbs
	addl	$32,%edi
	decl	%ebp
	jnz	L(oop)

C Tail: handle the remaining (size-1) mod 8 limbs one at a time.
L(end):	popl	%ebp			C ebp = size-1 saved above
	andl	$7,%ebp
	jz	L(end2)
L(oop2):
	movl	(%esi),%eax
	shrdl(	%cl,%eax,%edx)		C compute result limb
	movl	%edx,(%edi)
	movl	%eax,%edx		C loaded limb becomes the pending limb
	addl	$4,%esi
	addl	$4,%edi
	decl	%ebp
	jnz	L(oop2)

L(end2):
	shrl	%cl,%edx		C compute most significant limb
	movl	%edx,(%edi)		C store it

	popl	%eax			C pop carry limb

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret


C Shift-by-1 path.  Here we loop from the most significant end of the
C arrays down to the least significant, passing the shifted-out bit from
C limb to limb in the carry flag.  This direction is only permissible for
C the operand placements accepted by the checks at function entry
C (res_ptr + 1 >= s_ptr, or s_ptr >= res_ptr + size), since the function is
C documented to work for overlapping source and destination.

L(special):
	leal	-4(%edi,%ebp,4),%edi	C edi = address of dst[size-1]
	leal	-4(%esi,%ebp,4),%esi	C esi = address of src[size-1]

	movl	(%esi),%edx		C edx = most significant source limb
	subl	$4,%esi

	decl	%ebp			C ebp = size-1 limbs still to load
	pushl	%ebp			C keep size-1 for the tail loop
	shrl	$3,%ebp			C ebp = number of 8-limb iterations

	shrl	%edx			C shift top limb, low bit goes to carry
	incl	%ebp			C inc/dec pair sets ZF from ebp without
	decl	%ebp			C disturbing the carry flag
	jz	L(Lend)

	movl	(%edi),%eax		C fetch destination cache line

C Unrolled by 8.  Each rcrl rotates the carry from the limb above into bit
C 31 and leaves the limb's own low bit in carry for the limb below.  Only
C movl and leal sit between the rcrl instructions, so carry stays intact.
	ALIGN(4)
L(Loop):
	movl	-28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	rcrl	%eax
	movl	%ebx,(%edi)
	rcrl	%edx
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebx
	movl	-12(%esi),%eax
	rcrl	%ebx
	movl	%edx,-8(%edi)
	rcrl	%eax
	movl	%ebx,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebx
	rcrl	%edx
	movl	%eax,-16(%edi)
	rcrl	%ebx
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	rcrl	%eax
	movl	%ebx,-24(%edi)
	rcrl	%edx
	movl	%eax,-28(%edi)

	leal	-32(%esi),%esi		C use leal not to clobber carry
	leal	-32(%edi),%edi
	decl	%ebp			C decl leaves carry untouched
	jnz	L(Loop)

C Tail: handle the remaining (size-1) mod 8 limbs one at a time.  The andl
C below changes the flags, so carry is parked in eax around it.
L(Lend):
	popl	%ebp			C ebp = size-1 saved above
	sbbl	%eax,%eax		C save carry in %eax
	andl	$7,%ebp
	jz	L(Lend2)
	addl	%eax,%eax		C restore carry from eax
L(Loop2):
	movl	%edx,%ebx
	movl	(%esi),%edx
	rcrl	%edx
	movl	%ebx,(%edi)

	leal	-4(%esi),%esi		C use leal not to clobber carry
	leal	-4(%edi),%edi
	decl	%ebp
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		C restore carry from eax
L(L1):	movl	%edx,(%edi)		C store last limb

	movl	$0,%eax			C movl, not xorl, so carry survives
	rcrl	%eax			C carry limb: shifted-out bit in bit 31

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

EPILOGUE()