xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/pre_mod_1.asm (revision aef5eb5f59cdfe8314f1b5f78ac04eb144e44010)
1dnl  AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
2
3dnl  Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K6: 18.0 cycles/limb
35
36
37C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
38C                             mp_limb_t inverse);
39C
40C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
41C considered worthwhile (just).
42
C Frame names for the four arguments of mpn_preinv_mod_1.  The offsets 4, 8,
C 12 and 16 follow the argument order of the C prototype: src, size, divisor,
C inverse.
C NOTE(review): defframe is not defined in this file (presumably it comes in
C through config.m4); what the offsets are measured from is not visible here.
43defframe(PARAM_INVERSE,16)
44defframe(PARAM_DIVISOR,12)
45defframe(PARAM_SIZE,    8)
46defframe(PARAM_SRC,     4)
47
C mpn_preinv_mod_1 -- per the file header, the remainder of an mpn operand
C divided by a single limb, using a pre-inverted divisor.
C
C Inputs are read through the PARAM_* frame names (src, size, divisor,
C inverse).  The result is returned in eax.  ebp, edi, esi and ebx are pushed
C on entry and popped again before ret; eax, ecx, edx and the flags are
C modified.
C
C The limbs are processed from the highest (src[size-1]) down to src[0], one
C per loop iteration, with ecx as the down counter.
C
C NOTE(review): the instruction order must not be rearranged.  The carry
C produced by each subl is consumed by a later sbbl with only decl or loop in
C between, and the zero flag produced by adcl is consumed by jz with only a
C movl in between; none of those intervening instructions modify the flag
C concerned.
48	TEXT
49	ALIGN(32)
50PROLOGUE(mpn_preinv_mod_1)
51deflit(`FRAME',0)
52
	C Expected on entry: size >= 1, and divisor with its top bit set.
	C NOTE(review): ASSERT is defined outside this file; presumably it only
	C emits code in debug builds -- confirm.
53	ASSERT(ae,`cmpl $1, PARAM_SIZE')
54	ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')
55
	C Parameter loads are interleaved with the four register saves.
56	movl	PARAM_SIZE, %ecx	C ecx = size
57	pushl	%ebp	FRAME_pushl()
58
59	movl	PARAM_SRC, %ebp		C ebp = src
60	pushl	%edi	FRAME_pushl()
61
62	movl	PARAM_DIVISOR, %eax	C eax = d
63	pushl	%esi	FRAME_pushl()
64
65	movl	-4(%ebp,%ecx,4), %esi	C src high limb
66	pushl	%ebx	FRAME_pushl()
67
	C edx has not been loaded at this point.  Copying it into edi makes the
	C upcoming "sbbl %edx, %edi" come out as 0 minus carry, whatever edx
	C holds, so the carry from the subl below selects 0 or -1.
68	movl	%edx, %edi		C first n2 to cancel
69	subl	%eax, %esi		C first n1 = high-divisor
70
71	decl	%ecx			C high limb consumed; carry from subl is kept
72	jz	L(done_sbbl)		C size==1, only the final fixup remains
73
74L(top):
75	C eax	scratch
76	C ebx	n10, nadj, q1
77	C ecx	counter, size to 1
78	C edx	scratch
79	C esi	n2
80	C edi	old high, for underflow test
81	C ebp	src
82
	C Carry here comes from the subl before the loop on the first pass, or
	C from the subl at the bottom of the loop on later passes.
83	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1
84
	C NOTE(review): L(entry) is not referenced anywhere in the code shown.
85L(entry):
86	andl	PARAM_DIVISOR, %edi C d if the subtract underflowed, else 0
87L(q1_ff_top):
88	movl	-4(%ebp,%ecx,4), %ebx C next lower src limb, src[ecx-1]
89
90	addl	%esi, %edi	    C possible addback
91	movl	%ebx, %esi	    C n10
92
93	sarl	$31, %ebx	    C -n1 = 0 or -1
94	movl	%edi, %eax	    C n2
95
96	movl	PARAM_INVERSE, %edx C m
97	subl	%ebx, %eax	    C n2+n1
98
99	mull	%edx		    C m*(n2+n1)
100
101	andl	PARAM_DIVISOR, %ebx C -n1 & d
102	addl	%esi, %ebx	    C nadj = n10 + (-n1&d), ignoring overflow
103
104	addl	%ebx, %eax	    C low m*(n2+n1) + nadj, giving carry flag
105	leal	1(%edi), %ebx	    C n2+1
106
107	adcl	%ebx, %edx	    C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1
108
	C The movl leaves the flags alone, so the jz below tests the adcl
	C result: zero means q1+1 wrapped around, i.e. q1 = 0xFFFFFFFF.
109	movl	PARAM_DIVISOR, %eax C d
110	jz	L(q1_ff)
111
112	mull	%edx		    C (q1+1)*d
113
114	subl	%eax, %esi	    C low  n-(q1+1)*d
	C loop decrements ecx and branches while it is nonzero, without
	C touching the flags, so the carry from the subl above reaches the sbbl
	C at L(top) or at L(done_sbbl).
115	loop	L(top)
116
117
118
	C Final fixup, the same sbbl/andl step as at the top of the loop.  Also
	C reached directly from the entry code when size is 1.
119L(done_sbbl):
120	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1
121
122	andl	PARAM_DIVISOR, %edi C d if the subtract underflowed, else 0
123L(done_esi_edi):
	C Restore the saved registers in reverse push order.  The result is
	C formed with leal, before esi and edi are overwritten by their pops.
124	popl	%ebx
125
126	leal	(%esi,%edi), %eax   C return esi+edi, remainder with possible addback
127	popl	%esi
128
129	popl	%edi
130	popl	%ebp
131
132	ret
133
134
135C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
136C of q*d is simply -d and the remainder n-q*d = n10+d.  This is rarely
137C reached.
C
C On arrival esi still holds n10.  edi is set to d so that either the addl at
C L(q1_ff_top), when more limbs remain, or the leal at L(done_esi_edi), when
C none remain, forms n10+d.
138
139L(q1_ff):
140	movl	PARAM_DIVISOR, %edi C edi = d
141	loop	L(q1_ff_top)	    C more limbs: rejoin the loop past the sbbl/andl
142
143	jmp	L(done_esi_edi)	    C no limbs left: return esi+edi
144
145
146EPILOGUE()
147