xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/pre_mod_1.asm (revision 63aea4bd5b445e491ff0389fe27ec78b3099dba3)
1dnl  AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
2
3dnl  Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C K6: 18.0 cycles/limb
24
25
26C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
27C                             mp_limb_t inverse);
28C
29C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
30C considered worthwhile (just).
31
C Stack parameters.  The offsets rise by 4 in the same order as the arguments
C of the prototype above: src, size, divisor, inverse.
C NOTE(review): defframe is defined outside this file; presumably each offset
C is relative to the stack pointer at function entry -- confirm against the
C x86 m4 definitions.
32defframe(PARAM_INVERSE,16)
33defframe(PARAM_DIVISOR,12)
34defframe(PARAM_SIZE,    8)
35defframe(PARAM_SRC,     4)
36
C mpn_preinv_mod_1 -- remainder of {src,size} divided by divisor, using the
C caller-supplied inverse in place of a divl per limb.
C
C Requirements, as checked by the ASSERT lines below: size >= 1 (unsigned
C compare against 1) and the top bit of divisor set.
C
C Outline, as read from the code in this file:
C   - Start with the high limb of src minus the divisor.  A borrow from that
C     subtraction is turned into an addback of the divisor by the
C     sbbl/andl/addl sequence.
C   - For each remaining limb, high to low, form q1+1 from the inverse,
C     multiply it by the divisor and subtract the low product limb.  The
C     high limb comparison and possible addback are deferred to the start
C     of the next pass, or to L(done_sbbl) after the last one.
C   - The remainder is returned in eax as esi+edi.
C
C ebp, edi, esi and ebx are pushed on entry and popped again before ret.
C eax, ecx and edx are used without being saved.
C
C Flags are live across instructions in several places, so the instruction
C order matters: the carry from a subl has to reach the next sbbl across
C decl or loop, and the zero flag from adcl has to reach jz across a movl.
C Do not insert flag-modifying instructions in between.
37	TEXT
38	ALIGN(32)
39PROLOGUE(mpn_preinv_mod_1)
40deflit(`FRAME',0)
41
	C NOTE(review): ASSERT is defined outside this file; presumably it
	C emits the check only in debugging builds -- confirm.
42	ASSERT(ae,`cmpl $1, PARAM_SIZE')
43	ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')
44
	C Parameter loads are interleaved with the register saves.
	C NOTE(review): FRAME_pushl is defined outside this file; presumably
	C it adjusts FRAME so the PARAM_ offsets stay valid after each
	C pushl -- confirm.
45	movl	PARAM_SIZE, %ecx
46	pushl	%ebp	FRAME_pushl()
47
48	movl	PARAM_SRC, %ebp
49	pushl	%edi	FRAME_pushl()
50
51	movl	PARAM_DIVISOR, %eax
52	pushl	%esi	FRAME_pushl()
53
54	movl	-4(%ebp,%ecx,4), %esi	C src high limb
55	pushl	%ebx	FRAME_pushl()
56
	C edx has not been written by this routine yet.  Copying whatever it
	C holds into edi makes the first sbbl %edx, %edi compute
	C edi-edx-carry = -carry, independent of that value.
57	movl	%edx, %edi		C first n2 to cancel
58	subl	%eax, %esi		C first n1 = high-divisor
59
	C decl leaves the carry flag from the subl above untouched, so the
	C borrow is still available to the sbbl at L(top) or L(done_sbbl).
	C Zero here means size was 1 and only the borrow fixup remains.
60	decl	%ecx
61	jz	L(done_sbbl)
62
63L(top):
64	C eax	scratch
65	C ebx	n10, nadj, q1
66	C ecx	counter, size-1 to 1
67	C edx	scratch
68	C esi	n2
69	C edi	old high, for underflow test
70	C ebp	src
71
	C On the first pass edi equals edx (see above) and carry is the borrow
	C from high-divisor.  On later passes edx is the high limb of
	C (q1+1)*d and carry is the borrow from the low limb subtraction.
72	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1
73
	C NOTE(review): L(entry) is not referenced anywhere in this file.
74L(entry):
	C Turn the 0 or -1 mask into 0 or d, the amount to add back.
75	andl	PARAM_DIVISOR, %edi
76L(q1_ff_top):
	C ecx counts limbs still to process, so this fetches the next lower
	C limb of src.
77	movl	-4(%ebp,%ecx,4), %ebx
78
79	addl	%esi, %edi	    C possible addback
80	movl	%ebx, %esi	    C n10
81
82	sarl	$31, %ebx	    C -n1 = 0 or -1
83	movl	%edi, %eax	    C n2
84
85	movl	PARAM_INVERSE, %edx
86	subl	%ebx, %eax	    C n2+n1
87
88	mull	%edx		    C m*(n2+n1)
89
90	andl	PARAM_DIVISOR, %ebx C -n1 & d
91	addl	%esi, %ebx	    C nadj = n10 + (-n1&d), ignoring overflow
92
	C leal does not change flags, so the carry out of the addl is still
	C intact for the adcl.
93	addl	%ebx, %eax	    C low m*(n2+n1) + nadj, giving carry flag
94	leal	1(%edi), %ebx	    C n2+1
95
96	adcl	%ebx, %edx	    C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1
97
	C The jz tests the zero flag left by the adcl (movl does not change
	C flags): q1+1 wrapped round to 0, meaning q1 = 0xFFFFFFFF.
98	movl	PARAM_DIVISOR, %eax C d
99	jz	L(q1_ff)
100
101	mull	%edx		    C (q1+1)*d
102
	C loop decrements ecx and branches while it is nonzero, without
	C changing flags, so the borrow from this subl reaches the sbbl at
	C L(top), or at L(done_sbbl) on fall through.
103	subl	%eax, %esi	    C low  n-(q1+1)*d
104	loop	L(top)
105
106
107
	C Final fixup, same as the start of a loop pass: mask of 0 or -1 from
	C the high limb, then 0 or d to add back.
108L(done_sbbl):
109	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1
110
111	andl	PARAM_DIVISOR, %edi
112L(done_esi_edi):
	C Return value in eax is esi+edi.  The pops restore the saved
	C registers in the reverse of the order they were pushed.
113	popl	%ebx
114
115	leal	(%esi,%edi), %eax
116	popl	%esi
117
118	popl	%edi
119	popl	%ebp
120
121	ret
122
123
124C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
125C of q*d is simply -d and the remainder n-q*d = n10+d.  This is rarely
126C reached.
127
	C esi still holds n10 here.  With edi set to d, the addl at
	C L(q1_ff_top) forms n10+d as the next n2.  If no limbs remain the
	C same sum is formed by the leal at L(done_esi_edi).
128L(q1_ff):
129	movl	PARAM_DIVISOR, %edi
130	loop	L(q1_ff_top)
131
132	jmp	L(done_esi_edi)
133
134
135EPILOGUE()
136