dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.

dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.

dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C          cycles/limb
C Athlon:     11.0
C Hammer:      9.0


C mp_limb_t
C mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                   mp_limb_t inverse, int shift);
C
C The dependent chain is mul+imul+sub, for 11 cycles, and that speed is
C achieved with no special effort.  The load and shrdl latencies are hidden
C by out of order execution.
C
C For size==1 it's a touch faster to use the mul-by-inverse than divl.
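C
C The following C sketch is illustrative only and is not part of the build;
C the function name ref_pi1_bdiv_q_1 and the fixed 32-bit limb type are
C assumptions made for this example (the real entry points return an
C mp_limb_t, which is not modelled here).  It shows the computation the main
C loop performs: shift the dividend right, subtract the running carry, and
C multiply by the precomputed inverse of the odd divisor.
C
C   #include <stdint.h>
C   #include <stddef.h>
C
C   /* Assumes divisor = d0 << shift with d0 odd, inverse*d0 == 1 mod 2^32,
C      0 <= shift < 32, and src exactly divisible by the divisor.  The asm
C      keeps the carry split into a carry limb and a carry bit; here the
C      two are combined into c.  */
C   static void
C   ref_pi1_bdiv_q_1 (uint32_t *dst, const uint32_t *src, size_t size,
C                     uint32_t d0, uint32_t inverse, int shift)
C   {
C     uint32_t c = 0;
C     for (size_t i = 0; i < size; i++)
C       {
C         uint32_t s = src[i] >> shift;
C         if (shift != 0 && i + 1 < size)
C           s |= src[i + 1] << (32 - shift);    /* the shrdl step */
C         uint32_t l = s - c;
C         uint32_t borrow = s < c;              /* the setc/adcl step */
C         uint32_t q = l * inverse;             /* truncating, mod 2^32 */
C         dst[i] = q;
C         c = (uint32_t) (((uint64_t) q * d0) >> 32) + borrow;
C       }
C   }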

defframe(PARAM_SHIFT,  24)
defframe(PARAM_INVERSE,20)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)
defframe(VAR_INVERSE, -20)
defframe(VAR_DST_END, -24)

deflit(STACK_SPACE, 24)

	TEXT

C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C		    mp_limb_t inverse, int shift)
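C
C A purely illustrative call, with values invented for the example: to divide
C by 6 = 3 << 1 on 32-bit limbs, a caller following the convention of the
C mpn_bdiv_q_1 entry below (which stores the divisor with its twos stripped
C before falling into L(common)) would pass the odd part 3, its inverse
C 0xAAAAAAAB (3 * 0xAAAAAAAB = 2^33 + 1, i.e. 1 mod 2^32), and shift 1:
C
C   mpn_pi1_bdiv_q_1 (dst, src, size, 3, 0xAAAAAAAB, 1);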
	ALIGN(16)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	PARAM_SHIFT, %ecx	C shift count

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX

	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	movl	PARAM_INVERSE, %eax	C inv

L(common):
	movl	%eax, VAR_INVERSE
	movl	(%esi,%ebp,4), %eax	C src[0]

	incl	%ebp
	jz	L(one)

	movl	(%esi,%ebp,4), %edx	C src[1]

	shrdl(	%cl, %edx, %eax)

	movl	%edi, VAR_DST_END
	xorl	%ebx, %ebx
	jmp	L(entry)

	ALIGN(8)
L(top):
	C eax	q
	C ebx	carry bit, 0 or 1
	C ecx	shift
	C edx
	C esi	src end
	C edi	dst end
	C ebp	counter, limbs, negative
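	C
	C Per (shifted) source limb s, one pass around the loop computes
	C   q = (s - carry_bit - carry_limb) * inverse   (mod 2^32)
	C where carry_limb is the high half of the previous q times the
	C divisor (the mull below) and carry_bit is the borrow out of the
	C two subtractions (0 or 1, as noted above).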

	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi,%ebp,4), %eax
	movl	(%esi,%ebp,4), %edi

	shrdl(	%cl, %edi, %eax)

	subl	%ebx, %eax		C apply carry bit
	setc	%bl
	movl	VAR_DST_END, %edi

	subl	%edx, %eax		C apply carry limb
	adcl	$0, %ebx

L(entry):
	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi,%ebp,4)
	incl	%ebp
	jnz	L(top)


	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi), %eax		C src high limb
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi

	subl	%ebx, %eax		C apply carry bit
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	subl	%edx, %eax		C apply carry limb

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret

L(one):
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	imull	VAR_INVERSE, %eax

	movl	SAVE_EBP, %ebp

	movl	%eax, -4(%edi)
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret
EPILOGUE()

C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                           mp_limb_t divisor);
C

	ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	$-1, %ecx		C shift count

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	C If there are usually only one or two trailing zero bits, this
	C should be faster than bsfl.
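	C
	C Illustrative C equivalent of the loop below (names invented for
	C the example): strip and count the trailing zero bits of the
	C divisor d, also shifting out its lowest set bit so that eax ends
	C up holding (d >> shift) / 2, ready to index the inverse table.
	C
	C   int shift = -1, bit;
	C   do {
	C     shift++;
	C     bit = d & 1;
	C     d >>= 1;
	C   } while (bit == 0);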
L(strip_twos):
	incl	%ecx
	shrl	%eax
	jnc	L(strip_twos)

	movl	%ebx, SAVE_EBX
	leal	1(%eax,%eax), %ebx	C d without twos
	andl	$127, %eax		C d/2, 7 bits

ifdef(`PIC',`
	LEA(	binvert_limb_table, %edx)
	movzbl	(%eax,%edx), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	leal	(%eax,%eax), %edx	C 2*inv
	movl	%ebx, PARAM_DIVISOR	C d without twos

	imull	%eax, %eax		C inv*inv

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	imull	%ebx, %eax		C inv*inv*d

	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
	leal	(%edx,%edx), %eax	C 2*inv

	imull	%edx, %edx		C inv*inv

	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	imull	%ebx, %edx		C inv*inv*d

	subl	%edx, %eax		C inv = 2*inv - inv*inv*d

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')
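
C What the interleaved instructions above compute, written out as an
C illustrative C fragment (not part of the build): binvert_limb_table holds
C 8-bit inverses of the odd residues, indexed here by the 7 bits of d/2 for
C the odd divisor d, and each Newton step inv = 2*inv - inv*inv*d doubles the
C number of correct low bits, 8 -> 16 -> 32.
C
C   uint32_t inv = binvert_limb_table[(d >> 1) & 127];  /* 8 bits good  */
C   inv = 2 * inv - inv * inv * d;                      /* 16 bits good */
C   inv = 2 * inv - inv * inv * d;                      /* 32 bits good */
C   /* now inv * d == 1 (mod 2^32), which the ASSERT above checks */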

	jmp	L(common)
EPILOGUE()
ASM_END()
