dnl  AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.

dnl  Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6: 18.0 cycles/limb


C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                             mp_limb_t inverse);
C
C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
C considered worthwhile (just).

C mpn_preinv_mod_1 -- remainder from dividing the limbs at src by divisor.
C
C Arguments are read from the stack through the defframe names below:
C   PARAM_SRC      base of the limb array; src[size-1] is the high limb
C   PARAM_SIZE     limb count, asserted to be at least 1
C   PARAM_DIVISOR  d, asserted to have bit 31 set
C   PARAM_INVERSE  m, the multiplier used to estimate each quotient limb
C
C The remainder is returned in eax.  ebp, edi, esi and ebx are pushed on
C entry and popped before ret; ecx, edx and the flags are not preserved.
C
C After the high limb is reduced, one limb is processed per loop iteration,
C from src[size-2] down to src[0].  The borrow produced by each subl is
C consumed by the sbbl at the start of the next iteration (or at
C L(done_sbbl)), so nothing in between may alter the carry flag; decl,
C loop, leal, movl and the jumps leave it alone.  Do not reorder the
C instructions without rechecking those flag dependencies.

defframe(PARAM_INVERSE,16)
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_preinv_mod_1)
deflit(`FRAME',0)

	ASSERT(ae,`cmpl $1, PARAM_SIZE')
	ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')

	movl	PARAM_SIZE, %ecx
	pushl	%ebp		FRAME_pushl()

	movl	PARAM_SRC, %ebp
	pushl	%edi		FRAME_pushl()

	movl	PARAM_DIVISOR, %eax
	pushl	%esi		FRAME_pushl()

	movl	-4(%ebp,%ecx,4), %esi	C src high limb
	pushl	%ebx		FRAME_pushl()

	C edx holds no meaningful value here.  Copying it to edi makes the
	C first sbbl %edx,%edi below evaluate to 0 - borrow, whatever edx is.
	movl	%edx, %edi		C first n2 to cancel
	subl	%eax, %esi		C first n1 = high-divisor

	decl	%ecx			C limbs still to do; carry from subl kept
	jz	L(done_sbbl)		C size==1, only the addback remains

L(top):
	C eax	scratch
	C ebx	n10, nadj, q1
	C ecx	counter, size-1 to 1
	C edx	scratch
	C esi	n2
	C edi	old high, for underflow test
	C ebp	src

	sbbl	%edx, %edi		C high n-(q1+1)*d, 0 or -1

	andl	PARAM_DIVISOR, %edi	C d if the subtract underflowed, else 0
L(q1_ff_top):
	C Also entered from L(q1_ff), with edi = d already loaded.
	movl	-4(%ebp,%ecx,4), %ebx	C next lower limb, src[ecx-1]

	addl	%esi, %edi		C possible addback
	movl	%ebx, %esi		C n10

	sarl	$31, %ebx		C -n1 = 0 or -1
	movl	%edi, %eax		C n2

	movl	PARAM_INVERSE, %edx
	subl	%ebx, %eax		C n2+n1

	mull	%edx			C m*(n2+n1)

	andl	PARAM_DIVISOR, %ebx	C -n1 & d
	addl	%esi, %ebx		C nadj = n10 + (-n1&d), ignoring overflow

	addl	%ebx, %eax		C low m*(n2+n1) + nadj, giving carry flag
	leal	1(%edi), %ebx		C n2+1

	adcl	%ebx, %edx		C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1

	movl	PARAM_DIVISOR, %eax	C d
	jz	L(q1_ff)		C zero flag is from the adcl, q1+1 wrapped to 0

	mull	%edx			C (q1+1)*d

	subl	%eax, %esi		C low n-(q1+1)*d
	loop	L(top)			C ecx-1, flags untouched for the sbbl



L(done_sbbl):
	sbbl	%edx, %edi		C high n-(q1+1)*d, 0 or -1

	andl	PARAM_DIVISOR, %edi	C d if the subtract underflowed, else 0
L(done_esi_edi):
	popl	%ebx

	leal	(%esi,%edi), %eax	C remainder = esi + edi
	popl	%esi

	popl	%edi
	popl	%ebp

	ret


C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d. This is rarely
C reached.
C
C esi still holds n10 here, so loading d into edi lets the addl at
C L(q1_ff_top), or the leal at L(done_esi_edi) after the last limb, form
C n10+d.

L(q1_ff):
	movl	PARAM_DIVISOR, %edi
	loop	L(q1_ff_top)

	jmp	L(done_esi_edi)


EPILOGUE()