dnl  AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.

dnl  Copyright 2000, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K6: 18.0 cycles/limb


C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                             mp_limb_t inverse);
C
C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
C considered worthwhile (just).
C Frame offsets of the four stack arguments, relative to the return address.

defframe(PARAM_INVERSE,16)
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE,    8)
defframe(PARAM_SRC,     4)

C mpn_preinv_mod_1 -- remainder of {src,size} divided by a single limb,
C using a caller-supplied inverse instead of a hardware divide.
C
C In:     PARAM_SRC      pointer to the limbs, processed from the high limb
C                        down to the low limb
C         PARAM_SIZE     number of limbs, must be >= 1 (ASSERT below)
C         PARAM_DIVISOR  divisor d, must have bit 31 set (ASSERT below)
C         PARAM_INVERSE  multiplier m used to estimate each quotient limb
C Out:    eax            remainder
C Saved:  ebp, edi, esi, ebx are pushed on entry and popped before ret
C Scratch: ecx, edx, flags
C
C Flag discipline: several carries are produced by one instruction and
C consumed a few instructions later.  The instructions in between (decl,
C movl, leal, loop) are relied on to leave the carry flag untouched, so the
C statement order here must not be changed casually.

	TEXT
	ALIGN(32)
PROLOGUE(mpn_preinv_mod_1)
deflit(`FRAME',0)

	ASSERT(ae,`cmpl $1, PARAM_SIZE')
	ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')

	movl	PARAM_SIZE, %ecx
	pushl	%ebp			FRAME_pushl()

	movl	PARAM_SRC, %ebp
	pushl	%edi			FRAME_pushl()

	movl	PARAM_DIVISOR, %eax
	pushl	%esi			FRAME_pushl()

	movl	-4(%ebp,%ecx,4), %esi	C src high limb
	pushl	%ebx			FRAME_pushl()

	C edx holds no meaningful value here.  Copying it into edi makes the
	C first sbbl %edx,%edi below evaluate to 0 minus the borrow, so only
	C the borrow from the following subl matters.
	movl	%edx, %edi		C first n2 to cancel
	subl	%eax, %esi		C first n1 = high-divisor

	C decl sets ZF for the jz but does not modify CF, so the borrow from
	C the subl above is still live at L(top) and at L(done_sbbl).
	decl	%ecx
	jz	L(done_sbbl)

L(top):
	C eax	scratch
	C ebx	n10, nadj, q1
	C ecx	counter, size to 1
	C edx	scratch
	C esi	n2
	C edi	old high, for underflow test
	C ebp	src

	sbbl	%edx, %edi		C high n-(q1+1)*d, 0 or -1

C No branch in this function targets L(entry); it only marks the point
C where edi holds the 0 or -1 underflow mask.
L(entry):
	andl	PARAM_DIVISOR, %edi	C mask becomes 0 or d
L(q1_ff_top):
	movl	-4(%ebp,%ecx,4), %ebx	C next lower src limb

	addl	%esi, %edi		C possible addback
	movl	%ebx, %esi		C n10

	sarl	$31, %ebx		C -n1 = 0 or -1
	movl	%edi, %eax		C n2

	movl	PARAM_INVERSE, %edx
	subl	%ebx, %eax		C n2+n1

	mull	%edx			C m*(n2+n1)

	andl	PARAM_DIVISOR, %ebx	C -n1 & d
	addl	%esi, %ebx		C nadj = n10 + (-n1&d), ignoring overflow

	addl	%ebx, %eax		C low m*(n2+n1) + nadj, giving carry flag
	leal	1(%edi), %ebx		C n2+1, leal keeps the carry intact

	adcl	%ebx, %edx		C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1

	C movl does not alter flags, so the jz tests the adcl result:
	C zero means q1+1 wrapped, i.e. q1 = 0xFFFFFFFF.
	movl	PARAM_DIVISOR, %eax	C d
	jz	L(q1_ff)

	mull	%edx			C (q1+1)*d

	C loop does not modify flags, so the borrow from this subl feeds the
	C sbbl at L(top), or the one at L(done_sbbl) once ecx reaches zero.
	subl	%eax, %esi		C low  n-(q1+1)*d
	loop	L(top)



L(done_sbbl):
	sbbl	%edx, %edi		C high n-(q1+1)*d, 0 or -1

	andl	PARAM_DIVISOR, %edi	C 0 or d, the final addback amount
L(done_esi_edi):
	popl	%ebx

	leal	(%esi,%edi), %eax	C return value = esi + edi
	popl	%esi

	popl	%edi
	popl	%ebp

	ret


C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d.  This is rarely
C reached.
C
C edi is loaded with d so that either the addl at L(q1_ff_top), when limbs
C remain, or the leal at L(done_esi_edi), when none remain, forms n10+d.

L(q1_ff):
	movl	PARAM_DIVISOR, %edi
	loop	L(q1_ff_top)

	jmp	L(done_esi_edi)


EPILOGUE()