1dnl AMD K6 mpn_divrem_1 -- mpn by limb division. 2 3dnl Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation, 4dnl Inc. 5dnl 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or 9dnl modify it under the terms of the GNU Lesser General Public License as 10dnl published by the Free Software Foundation; either version 3 of the 11dnl License, or (at your option) any later version. 12dnl 13dnl The GNU MP Library is distributed in the hope that it will be useful, 14dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 15dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16dnl Lesser General Public License for more details. 17dnl 18dnl You should have received a copy of the GNU Lesser General Public License 19dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 20 21include(`../config.m4') 22 23 24C K6: 20 cycles/limb 25 26 27C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, 28C mp_srcptr src, mp_size_t size, mp_limb_t divisor); 29C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, 30C mp_srcptr src, mp_size_t size, mp_limb_t divisor, 31C mp_limb_t carry); 32C 33C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop 34C instead of decl+jnz, since it comes out 2 cycles/limb faster. 35C 36C A test is done to see if the high limb is less than the divisor, and if so 37C one less div is done. A div is 20 cycles, so assuming high<divisor about 38C half the time, then this test saves half that amount. The branch 39C misprediction penalty is less than that. 40C 41C Back-to-back div instructions run at 20 cycles, the same as the loop here, 42C so it seems there's nothing to gain by rearranging the loop. Pairing the 43C mov and loop instructions was found to gain nothing. 44C 45C Enhancements: 46C 47C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but 48C that algorithm has been found to suffer from the relatively poor carry 49C handling on K6 and too many auxiliary instructions. The fractional part 50C however could be done at about 13 c/l, if it mattered enough. 51 52defframe(PARAM_CARRY, 24) 53defframe(PARAM_DIVISOR,20) 54defframe(PARAM_SIZE, 16) 55defframe(PARAM_SRC, 12) 56defframe(PARAM_XSIZE, 8) 57defframe(PARAM_DST, 4) 58 59 TEXT 60 61 ALIGN(32) 62PROLOGUE(mpn_divrem_1c) 63deflit(`FRAME',0) 64 65 movl PARAM_SIZE, %ecx 66 pushl %edi FRAME_pushl() 67 68 movl PARAM_SRC, %edi 69 pushl %esi FRAME_pushl() 70 71 movl PARAM_DIVISOR, %esi 72 pushl %ebx FRAME_pushl() 73 74 movl PARAM_DST, %ebx 75 pushl %ebp FRAME_pushl() 76 77 movl PARAM_XSIZE, %ebp 78 orl %ecx, %ecx C size 79 80 movl PARAM_CARRY, %edx 81 jz L(fraction) C if size==0 82 83 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part 84 jmp L(integer_top) 85 86EPILOGUE() 87 88 89 ALIGN(16) 90PROLOGUE(mpn_divrem_1) 91deflit(`FRAME',0) 92 93 movl PARAM_SIZE, %ecx 94 pushl %edi FRAME_pushl() 95 96 movl PARAM_SRC, %edi 97 pushl %esi FRAME_pushl() 98 99 movl PARAM_DIVISOR, %esi 100 orl %ecx,%ecx C size 101 102 jz L(size_zero) 103 pushl %ebx FRAME_pushl() 104 105 movl -4(%edi,%ecx,4), %eax C src high limb 106 xorl %edx, %edx 107 108 movl PARAM_DST, %ebx 109 pushl %ebp FRAME_pushl() 110 111 movl PARAM_XSIZE, %ebp 112 cmpl %esi, %eax 113 114 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part 115 jae L(integer_entry) 116 117 118 C high<divisor, so high of dst is zero, and avoid one div 119 120 movl %edx, (%ebx,%ecx,4) 121 decl %ecx 122 123 movl %eax, %edx 124 jz L(fraction) 125 126 127L(integer_top): 128 C eax scratch (quotient) 129 C ebx dst+4*xsize-4 130 C ecx counter 131 C edx scratch (remainder) 132 C esi divisor 133 C edi src 134 C ebp xsize 135 136 movl -4(%edi,%ecx,4), %eax 137L(integer_entry): 138 139 divl %esi 140 141 movl %eax, (%ebx,%ecx,4) 142 loop L(integer_top) 143 144 145L(fraction): 146 orl %ebp, %ecx 147 jz L(done) 148 149 movl PARAM_DST, %ebx 150 151 152L(fraction_top): 153 C eax scratch (quotient) 154 C ebx dst 155 C ecx counter 156 C edx scratch (remainder) 157 C esi divisor 158 C edi 159 C ebp 160 161 xorl %eax, %eax 162 163 divl %esi 164 165 movl %eax, -4(%ebx,%ecx,4) 166 loop L(fraction_top) 167 168 169L(done): 170 popl %ebp 171 movl %edx, %eax 172 popl %ebx 173 popl %esi 174 popl %edi 175 ret 176 177 178L(size_zero): 179deflit(`FRAME',8) 180 movl PARAM_XSIZE, %ecx 181 xorl %eax, %eax 182 183 movl PARAM_DST, %edi 184 185 cld C better safe than sorry, see mpn/x86/README 186 187 rep 188 stosl 189 190 popl %esi 191 popl %edi 192 ret 193EPILOGUE() 194