dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C          cycles/limb
C Athlon:     11.0
C Hammer:      9.0


C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C
C The dependent chain is mul+imul+sub for 11 cycles and that speed is
C achieved with no special effort.  The load and shrdl latencies are hidden
C by out of order execution.
C
C It's a touch faster on size==1 to use the mul-by-inverse than divl.
C Stack-frame slot names shared by both entry points below.  defframe and
C deflit are m4 macros supplied through config.m4.
C NOTE(review): the PARAM_* values look like byte offsets from the stack
C pointer at function entry, and the SAVE_*/VAR_* values like locals that
C live inside the STACK_SPACE bytes reserved by each prologue -- confirm
C against the defframe definition.
C
C PARAM_SHIFT and PARAM_INVERSE are read only by mpn_pi1_bdiv_q_1;
C mpn_bdiv_q_1 computes both values itself and passes them in registers.

defframe(PARAM_SHIFT,  24)
defframe(PARAM_INVERSE,20)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

C Save slots for the four registers that are restored before every ret,
C plus two working variables used by the main loop.
defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,     -12)
defframe(SAVE_EBP,     -16)
defframe(VAR_INVERSE,  -20)
defframe(VAR_DST_END,  -24)

C Bytes of stack reserved by each prologue: six 4-byte slots.
deflit(STACK_SPACE, 24)

        TEXT

C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                   mp_limb_t inverse, int shift)
C
C Entry point for callers that already have the inverse and shift count.
C It loads shift into ecx and inverse into eax, turns src and dst into
C end pointers with a negative limb counter in ebp, and falls through
C into L(common), which does the actual work.
C
C The divisor slot is multiplied as-is by the loop and the source limbs
C are shifted right by the shift count before use.
C NOTE(review): this implies the caller passes the divisor with its low
C zero bits already removed and inverse*divisor == 1 mod 2^32, as
C mpn_bdiv_q_1 below arranges -- confirm against the callers.
C
C L(common) reads src[0] unconditionally, so size is presumably >= 1.
C
C NOTE(review): both exits leave the last quotient limb in eax.  Whether
C any caller relies on a return value is not visible from this file.
        ALIGN(16)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

        subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
        movl    PARAM_SHIFT, %ecx       C shift count

        movl    %ebp, SAVE_EBP
        movl    PARAM_SIZE, %ebp

        movl    %esi, SAVE_ESI
        movl    PARAM_SRC, %esi

        movl    %edi, SAVE_EDI
        movl    PARAM_DST, %edi

        movl    %ebx, SAVE_EBX

        C Index both arrays from their ends with a negative counter so a
        C single incl/jnz pair both advances and terminates the loop.
        leal    (%esi,%ebp,4), %esi     C src end
        leal    (%edi,%ebp,4), %edi     C dst end
        negl    %ebp                    C -size

        movl    PARAM_INVERSE, %eax     C inv

C Shared body.  mpn_bdiv_q_1 jumps here too.  State expected on entry:
C   eax  inverse
C   ecx  shift count
C   esi  src end, edi  dst end, ebp  -size
C   PARAM_DIVISOR slot holds the divisor that the loop multiplies by
C   all four SAVE_* slots already filled
L(common):
        movl    %eax, VAR_INVERSE
        movl    (%esi,%ebp,4), %eax     C src[0]

        incl    %ebp
        jz      L(one)                  C size==1, no neighbour limb to shift in

        movl    (%esi,%ebp,4), %edx     C src[1]

        C shrdl is an m4 macro here; presumably it emits the x86 shrd
        C instruction, leaving eax = low limb of (edx:eax >> cl).
        shrdl(  %cl, %edx, %eax)

        C edi is borrowed as a scratch register inside the loop, so keep
        C the dst end pointer in memory and reload it each iteration.
        movl    %edi, VAR_DST_END
        xorl    %ebx, %ebx              C no carry into the first limb
        jmp     L(entry)

        ALIGN(8)
L(top):
        C eax   q
        C ebx   carry bit, 0 or 1
        C ecx   shift
        C edx
        C esi   src end
        C edi   dst end
        C ebp   counter, limbs, negative

        mull    PARAM_DIVISOR           C carry limb in edx

        C Fetch the next shifted source limb from two adjacent limbs.
        movl    -4(%esi,%ebp,4), %eax
        movl    (%esi,%ebp,4), %edi

        shrdl(  %cl, %edi, %eax)

        subl    %ebx, %eax              C apply carry bit
        setc    %bl                     C borrow from that subtraction
        movl    VAR_DST_END, %edi       C restore dst end after scratch use

        subl    %edx, %eax              C apply carry limb
        adcl    $0, %ebx                C add the borrow from this one too

L(entry):
        imull   VAR_INVERSE, %eax       C q = low limb of eax * inverse

        movl    %eax, -4(%edi,%ebp,4)
        incl    %ebp
        jnz     L(top)


        C Last limb: same steps as the loop, but there is no higher source
        C limb to shift in, so a plain shrl is used.  Register restores
        C are interleaved with the arithmetic.
        mull    PARAM_DIVISOR           C carry limb in edx

        movl    -4(%esi), %eax          C src high limb
        shrl    %cl, %eax
        movl    SAVE_ESI, %esi

        subl    %ebx, %eax              C apply carry bit
        movl    SAVE_EBX, %ebx
        movl    SAVE_EBP, %ebp

        subl    %edx, %eax              C apply carry limb

        imull   VAR_INVERSE, %eax

        movl    %eax, -4(%edi)          C dst high limb
        movl    SAVE_EDI, %edi
        addl    $STACK_SPACE, %esp

        ret

C size==1: the single quotient limb is (src[0] >> shift) * inverse.
L(one):
        shrl    %cl, %eax
        movl    SAVE_ESI, %esi
        movl    SAVE_EBX, %ebx

        imull   VAR_INVERSE, %eax

        movl    SAVE_EBP, %ebp

        movl    %eax, -4(%edi)
        movl    SAVE_EDI, %edi
        addl    $STACK_SPACE, %esp

        ret
EPILOGUE()

C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C
C Entry point that derives the shift count and the inverse from divisor,
C then jumps into L(common) inside mpn_pi1_bdiv_q_1 above.
C
C Steps:
C   1. Count the low zero bits of divisor into ecx and strip them.
C   2. Look up a starting inverse in binvert_limb_table, indexed by
C      seven bits of the stripped divisor.
C   3. Refine it twice with inv = 2*inv - inv*inv*d.  Each step is
C      expected to double the number of correct low bits, 8 to 16 to 32.
C   4. Overwrite the PARAM_DIVISOR slot with the stripped divisor so the
C      shared loop multiplies by it.
C
C NOTE(review): the strip loop exits only when a 1 bit is shifted out, so
C a zero divisor would never terminate; callers presumably guarantee
C divisor != 0.

        ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
deflit(`FRAME',0)

        movl    PARAM_DIVISOR, %eax
        subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
        movl    $-1, %ecx               C shift count

        movl    %ebp, SAVE_EBP
        movl    PARAM_SIZE, %ebp

        movl    %esi, SAVE_ESI
        movl    %edi, SAVE_EDI

        C If there's usually only one or two trailing zero bits then this
        C should be faster than bsfl.
        C
        C ecx starts at -1 and is incremented once per bit shifted out, so
        C on exit it equals the number of zero bits below the lowest set
        C bit, and eax holds the divisor with those zeros and that set bit
        C shifted away.
L(strip_twos):
        incl    %ecx
        shrl    %eax
        jnc     L(strip_twos)

        movl    %ebx, SAVE_EBX
        leal    1(%eax,%eax), %ebx      C d without twos
        andl    $127, %eax              C d/2, 7 bits

C The PIC variant fetches the table address through the LEA macro first;
C the other variant addresses the table directly.
ifdef(`PIC',`
        LEA(    binvert_limb_table, %edx)
        movzbl  (%eax,%edx), %eax               C inv 8 bits
',`
        movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
')

        C First refinement step, result in edx.  Loads and pointer setup
        C for the shared loop are interleaved with the multiplies.
        leal    (%eax,%eax), %edx       C 2*inv
        movl    %ebx, PARAM_DIVISOR     C d without twos

        imull   %eax, %eax              C inv*inv

        movl    PARAM_SRC, %esi
        movl    PARAM_DST, %edi

        imull   %ebx, %eax              C inv*inv*d

        subl    %eax, %edx              C inv = 2*inv - inv*inv*d

        C Second refinement step, result in eax as L(common) expects.
        leal    (%edx,%edx), %eax       C 2*inv

        imull   %edx, %edx              C inv*inv

        leal    (%esi,%ebp,4), %esi     C src end
        leal    (%edi,%ebp,4), %edi     C dst end
        negl    %ebp                    C -size

        imull   %ebx, %edx              C inv*inv*d

        subl    %edx, %eax              C inv = 2*inv - inv*inv*d

        ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
        pushl   %eax    FRAME_pushl()
        imull   PARAM_DIVISOR, %eax
        cmpl    $1, %eax
        popl    %eax    FRAME_popl()')

        C eax = inverse, ecx = shift, esi/edi/ebp set up: join shared body.
        jmp     L(common)
EPILOGUE()