xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/bdiv_q_1.asm (revision 92e958de60c71aa0f2452bd7074cbb006fe6546b)
1dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
2
3dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or
10dnl  modify it under the terms of the GNU Lesser General Public License as
11dnl  published by the Free Software Foundation; either version 3 of the
12dnl  License, or (at your option) any later version.
13dnl
14dnl  The GNU MP Library is distributed in the hope that it will be useful,
15dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
16dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17dnl  Lesser General Public License for more details.
18dnl
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24
25C          cycles/limb
26C Athlon:     11.0
27C Hammer:      9.0
28
29
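30C The notes below were written for mpn_divexact_1 in mpn/x86/k7/dive_1.asm,
31C from which this file was rearranged.  They describe the loop at L(top),
31C which is shared by mpn_pi1_bdiv_q_1 and mpn_bdiv_q_1 defined below.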
32C
33C The dependent chain is mul+imul+sub for 11 cycles and that speed is
34C achieved with no special effort.  The load and shrld latencies are hidden
35C by out of order execution.
36C
37C It's a touch faster on size==1 to use the mul-by-inverse than divl.
38
C Stack frame names shared by both entry points in this file.
C
C PARAM_ names are the incoming arguments, in the order of the
C mpn_pi1_bdiv_q_1 prototype: dst, src, size, divisor, inverse, shift at
C offsets 4 through 24.  mpn_bdiv_q_1 takes only the first four; it never
C reads PARAM_INVERSE or PARAM_SHIFT, and it overwrites PARAM_DIVISOR with
C the odd part of the divisor before entering the shared loop.
C
C SAVE_ and VAR_ names are six 4-byte local slots at offsets -4 through
C -24, matching the STACK_SPACE of 24 bytes that each prologue subtracts
C from esp.  SAVE_ slots hold the caller values of ebx, esi, edi and ebp;
C VAR_INVERSE holds the inverse used by imull; VAR_DST_END holds the dst
C end pointer so that edi can be reused as scratch inside the loop.
C
C NOTE(review): defframe and deflit come from the included m4 definitions,
C not from this file; presumably the offsets are relative to esp at function
C entry and are corrected by the current FRAME value -- confirm there.
39defframe(PARAM_SHIFT,  24)
40defframe(PARAM_INVERSE,20)
41defframe(PARAM_DIVISOR,16)
42defframe(PARAM_SIZE,   12)
43defframe(PARAM_SRC,    8)
44defframe(PARAM_DST,    4)
45
46defframe(SAVE_EBX,     -4)
47defframe(SAVE_ESI,     -8)
48defframe(SAVE_EDI,    -12)
49defframe(SAVE_EBP,    -16)
50defframe(VAR_INVERSE, -20)
51defframe(VAR_DST_END, -24)
52
53deflit(STACK_SPACE, 24)
54
55	TEXT
56
57C mp_limb_t
58C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
59C		    mp_limb_t inverse, int shift)
C
C Writes size quotient limbs to dst.  Treating src as one long number
C shifted right by shift bits, each output limb is produced low to high as
C     q = (shifted src limb - carry bit - carry limb) * inverse
C where carry limb is the high half of the previous q times divisor, and
C carry bit is the borrow out of the previous subtractions.  All products
C and differences are taken modulo the limb size.
C
C Nothing is validated: inverse is used as given, and src[0] is loaded
C before size is examined, so size is assumed to be at least 1.
C NOTE(review): presumably divisor is odd here and satisfies
C divisor*inverse == 1 mod 2^GMP_LIMB_BITS, as the ASSERT in mpn_bdiv_q_1
C expects -- confirm against callers.
C NOTE(review): the prototype says mp_limb_t, but at both ret instructions
C eax simply still holds the last quotient limb stored -- confirm whether
C any caller relies on the return value.
C
C ebx, esi, edi and ebp are stored in the SAVE_ slots and reloaded before
C each ret.  eax, ecx and edx are overwritten.
C
C L(common) is also the target of the jmp at the end of mpn_bdiv_q_1, which
C arrives with the same frame already allocated and filled in, and with
C     eax = inverse, ecx = shift, esi = src end, edi = dst end, ebp = -size.
60	ALIGN(16)
61PROLOGUE(mpn_pi1_bdiv_q_1)
62deflit(`FRAME',0)
63
	C Allocate the local slots.  FRAME is redefined right after esp moves;
	C presumably this keeps the PARAM_ and SAVE_ offsets correct -- the
	C mechanism lives in the included m4 definitions.
64	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
65	movl	PARAM_SHIFT, %ecx	C shift count
66
	C Each callee-saved register is stored before it is given its new
	C role; the saves are interleaved with the argument loads.
67	movl	%ebp, SAVE_EBP
68	movl	PARAM_SIZE, %ebp
69
70	movl	%esi, SAVE_ESI
71	movl	PARAM_SRC, %esi
72
73	movl	%edi, SAVE_EDI
74	movl	PARAM_DST, %edi
75
76	movl	%ebx, SAVE_EBX
77
	C Point esi and edi one past the last limb and count ebp up from
	C -size towards zero, so the loop test is just the flags from incl.
78	leal	(%esi,%ebp,4), %esi	C src end
79	leal	(%edi,%ebp,4), %edi	C dst end
80	negl	%ebp			C -size
81
82	movl	PARAM_INVERSE, %eax	C inv
83
	C Shared entry, see the register contract in the header comment.
84L(common):
85	movl	%eax, VAR_INVERSE
86	movl	(%esi,%ebp,4), %eax	C src[0]
87
	C ebp is now -(size-1); zero here means there is only one limb.
88	incl	%ebp
89	jz	L(one)
90
91	movl	(%esi,%ebp,4), %edx	C src[1]
92
	C eax = low limb of src[1]:src[0] shifted right by cl bits.
93	shrdl(	%cl, %edx, %eax)
94
	C The first limb has no incoming carry: clear ebx and enter the loop
	C at the multiply by the inverse.  edi is parked in VAR_DST_END
	C because the loop body uses edi as a scratch register.
95	movl	%edi, VAR_DST_END
96	xorl	%ebx, %ebx
97	jmp	L(entry)
98
99	ALIGN(8)
100L(top):
101	C eax	q
102	C ebx	carry bit, 0 or 1
103	C ecx	shift
104	C edx
105	C esi	src end
106	C edi	dst end
107	C ebp	counter, limbs, negative
108
	C edx:eax = previous q times the divisor.  Only the high half in edx
	C is kept; eax is overwritten by the next load.
109	mull	PARAM_DIVISOR		C carry limb in edx
110
	C Load the current source limb and the one above it, the latter into
	C edi as scratch, and form the shifted limb in eax.
111	movl	-4(%esi,%ebp,4), %eax
112	movl	(%esi,%ebp,4), %edi
113
114	shrdl(	%cl, %edi, %eax)
115
	C Subtract the old carry bit, capture the new borrow in bl, and put
	C the dst end pointer back in edi.  movl leaves the flags alone.
116	subl	%ebx, %eax		C apply carry bit
117	setc	%bl
118	movl	VAR_DST_END, %edi
119
	C Subtract the carry limb; a borrow here is added into ebx as well.
120	subl	%edx, %eax		C apply carry limb
121	adcl	$0, %ebx
122
123L(entry):
	C q = adjusted limb times inverse, low half only.
124	imull	VAR_INVERSE, %eax
125
	C ebp has already been advanced past this limb, hence the -4.
126	movl	%eax, -4(%edi,%ebp,4)
127	incl	%ebp
128	jnz	L(top)
129
130
	C Last limb.  There is no higher source limb to shift in, so a plain
	C shrl is used, and no new borrow is recorded because nothing
	C follows.  Register restores are interleaved with the arithmetic.
131	mull	PARAM_DIVISOR		C carry limb in edx
132
133	movl	-4(%esi), %eax		C src high limb
134	shrl	%cl, %eax
135	movl	SAVE_ESI, %esi
136
137	subl	%ebx, %eax		C apply carry bit
138	movl	SAVE_EBX, %ebx
139	movl	SAVE_EBP, %ebp
140
141	subl	%edx, %eax		C apply carry limb
142
143	imull	VAR_INVERSE, %eax
144
	C edi still points at dst end, so -4(%edi) is the top quotient limb.
145	movl	%eax, -4(%edi)
146	movl	SAVE_EDI, %edi
147	addl	$STACK_SPACE, %esp
148
149	ret
150
	C size == 1: eax holds src[0] and there is no carry of any kind, so
	C the single result limb is (src[0] >> shift) times inverse.
151L(one):
152	shrl	%cl, %eax
153	movl	SAVE_ESI, %esi
154	movl	SAVE_EBX, %ebx
155
156	imull	VAR_INVERSE, %eax
157
158	movl	SAVE_EBP, %ebp
159
160	movl	%eax, -4(%edi)
161	movl	SAVE_EDI, %edi
162	addl	$STACK_SPACE, %esp
163
164	ret
165EPILOGUE()
166
167C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
168C                           mp_limb_t divisor);
169C
C Front end that needs only the divisor.  It counts and strips the trailing
C zero bits of divisor, computes the inverse of the remaining odd value from
C an 8-bit table entry refined by two Newton steps, and then jumps to
C L(common) inside mpn_pi1_bdiv_q_1, which does the limb loop and returns to
C the caller.
C
C State handed to L(common):
C     eax = inverse, ecx = number of stripped zero bits, used as the shift
C     esi = src end, edi = dst end, ebp = -size
C     PARAM_DIVISOR overwritten with the odd divisor
C     SAVE_EBX, SAVE_ESI, SAVE_EDI, SAVE_EBP holding the caller values
C
C A divisor of zero is not handled: the strip_twos loop exits only when a
C one bit is shifted out, so it would never terminate.
170
171	ALIGN(16)
172PROLOGUE(mpn_bdiv_q_1)
173deflit(`FRAME',0)
174
	C The divisor is read before esp moves, while FRAME is still 0.
175	movl	PARAM_DIVISOR, %eax
176	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	C Starts at -1 because the loop increments before every shift; on
	C exit ecx is the count of trailing zero bits.
177	movl	$-1, %ecx		C shift count
178
179	movl	%ebp, SAVE_EBP
180	movl	PARAM_SIZE, %ebp
181
182	movl	%esi, SAVE_ESI
183	movl	%edi, SAVE_EDI
184
185	C If there's usually only one or two trailing zero bits then this
186	C should be faster than bsfl.
187L(strip_twos):
188	incl	%ecx
189	shrl	%eax
190	jnc	L(strip_twos)
191
	C The bit just shifted out was the lowest set bit of the divisor, so
	C eax is now the odd part divided by two, rounded down.  2*eax+1
	C rebuilds the odd part in ebx, and the low 7 bits of eax index the
	C inverse table.
192	movl	%ebx, SAVE_EBX
193	leal	1(%eax,%eax), %ebx	C d without twos
194	andl	$127, %eax		C d/2, 7 bits
195
	C binvert_limb_table is defined outside this file.  The PIC variant
	C first fetches its address into edx through the LEA macro.
196ifdef(`PIC',`
197	LEA(	binvert_limb_table, %edx)
198	movzbl	(%eax,%edx), %eax		C inv 8 bits
199',`
200	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
201')
202
	C First Newton step, inv = 2*inv - inv*inv*d, taking the 8-bit table
	C value to 16 correct low bits.  The store of the odd divisor and the
	C pointer loads are slotted between the multiplies.
203	leal	(%eax,%eax), %edx	C 2*inv
204	movl	%ebx, PARAM_DIVISOR	C d without twos
205
206	imull	%eax, %eax		C inv*inv
207
208	movl	PARAM_SRC, %esi
209	movl	PARAM_DST, %edi
210
211	imull	%ebx, %eax		C inv*inv*d
212
	C Second Newton step, 16 to 32 correct bits.  Note that the roles of
	C eax and edx swap: the result of this step ends up in eax.
213	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
214	leal	(%edx,%edx), %eax	C 2*inv
215
216	imull	%edx, %edx		C inv*inv
217
	C Same end-pointer and negative-counter setup as the prologue of
	C mpn_pi1_bdiv_q_1, as required by L(common).
218	leal	(%esi,%ebp,4), %esi	C src end
219	leal	(%edi,%ebp,4), %edi	C dst end
220	negl	%ebp			C -size
221
222	imull	%ebx, %edx		C inv*inv*d
223
224	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
225
	C Self check on the computed inverse.  eax is preserved by the push
	C and pop around the test multiply.  NOTE(review): ASSERT, FRAME_pushl
	C and FRAME_popl are defined in the included m4 files; presumably the
	C block is emitted only in assertion-enabled builds -- confirm there.
226	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
227	pushl	%eax	FRAME_pushl()
228	imull	PARAM_DIVISOR, %eax
229	cmpl	$1, %eax
230	popl	%eax	FRAME_popl()')
231
	C Continue in mpn_pi1_bdiv_q_1; its ret returns to our caller and
	C its epilogue restores the registers saved above.
232	jmp	L(common)
233EPILOGUE()
234