xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium/dive_1.asm (revision b45fa494daa2ba02187711d31a4144faf0993066)
1dnl  Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C         divisor
24C       odd   even
25C P54:  24.5  30.5   cycles/limb
26C P55:  23.0  28.0
27
28
29C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
30C                      mp_limb_t divisor);
31C
32C Plain divl is used for small sizes, since the inverse takes a while to
33C setup.  Multiplying works out faster for size>=3 when the divisor is odd,
34C or size>=4 when the divisor is even.  Actually on P55 size==2 for odd or
35C size==3 for even are about the same speed for both divl or mul, but the
36C former is used since it will use up less code cache.
37C
38C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
39C expected.  On P54 in the even case the shrdl pairing nonsense (see
40C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
41C further 1.5 slowdown for both odd and even.
42
43defframe(PARAM_DIVISOR,16)
44defframe(PARAM_SIZE,   12)
45defframe(PARAM_SRC,    8)
46defframe(PARAM_DST,    4)
47
48dnl  re-use parameter space
49define(VAR_INVERSE,`PARAM_DST')
50
51	TEXT
52
53	ALIGN(32)
54PROLOGUE(mpn_divexact_1)
55deflit(`FRAME',0)
56
57	movl	PARAM_DIVISOR, %eax
58	movl	PARAM_SIZE, %ecx
59
60	pushl	%esi		FRAME_pushl()
61	push	%edi		FRAME_pushl()
62
63	movl	PARAM_SRC, %esi
64	andl	$1, %eax
65
66	movl	PARAM_DST, %edi
67	addl	%ecx, %eax	C size if even, size+1 if odd
68
69	cmpl	$4, %eax
70	jae	L(mul_by_inverse)
71
72
73	xorl	%edx, %edx
74L(div_top):
75	movl	-4(%esi,%ecx,4), %eax
76
77	divl	PARAM_DIVISOR
78
79	movl	%eax, -4(%edi,%ecx,4)
80	decl	%ecx
81
82	jnz	L(div_top)
83
84	popl	%edi
85	popl	%esi
86
87	ret
88
89
90
91L(mul_by_inverse):
92	movl	PARAM_DIVISOR, %eax
93	movl	$-1, %ecx
94
95L(strip_twos):
96	ASSERT(nz, `orl %eax, %eax')
97	shrl	%eax
98	incl	%ecx			C shift count
99
100	jnc	L(strip_twos)
101
102	leal	1(%eax,%eax), %edx	C d
103	andl	$127, %eax		C d/2, 7 bits
104
105	pushl	%ebx		FRAME_pushl()
106	pushl	%ebp		FRAME_pushl()
107
108ifdef(`PIC',`
109	call	L(here)
110L(here):
111	popl	%ebp			C eip
112
113	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
114	C AGI
115	movl	binvert_limb_table@GOT(%ebp), %ebp
116	C AGI
117	movzbl	(%eax,%ebp), %eax
118',`
119
120dnl non-PIC
121	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
122')
123
124	movl	%eax, %ebp		C inv
125	addl	%eax, %eax		C 2*inv
126
127	imull	%ebp, %ebp		C inv*inv
128
129	imull	%edx, %ebp		C inv*inv*d
130
131	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
132	movl	PARAM_SIZE, %ebx
133
134	movl	%eax, %ebp
135	addl	%eax, %eax		C 2*inv
136
137	imull	%ebp, %ebp		C inv*inv
138
139	imull	%edx, %ebp		C inv*inv*d
140
141	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
142	movl	%edx, PARAM_DIVISOR	C d without twos
143
144	leal	(%esi,%ebx,4), %esi	C src end
145	leal	(%edi,%ebx,4), %edi	C dst end
146
147	negl	%ebx			C -size
148
149	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
150	pushl	%eax	FRAME_pushl()
151	imull	PARAM_DIVISOR, %eax
152	cmpl	$1, %eax
153	popl	%eax	FRAME_popl()')
154
155	movl	%eax, VAR_INVERSE
156	xorl	%ebp, %ebp		C initial carry bit
157
158	movl	(%esi,%ebx,4), %eax	C src low limb
159	orl	%ecx, %ecx		C shift
160
161	movl	4(%esi,%ebx,4), %edx	C src second limb (for even)
162	jz	L(odd_entry)
163
164	shrdl(	%cl, %edx, %eax)
165
166	incl	%ebx
167	jmp	L(even_entry)
168
169
170	ALIGN(8)
171L(odd_top):
172	C eax	scratch
173	C ebx	counter, limbs, negative
174	C ecx
175	C edx
176	C esi	src end
177	C edi	dst end
178	C ebp	carry bit, 0 or -1
179
180	mull	PARAM_DIVISOR
181
182	movl	(%esi,%ebx,4), %eax
183	subl	%ebp, %edx
184
185	subl	%edx, %eax
186
187	sbbl	%ebp, %ebp
188
189L(odd_entry):
190	imull	VAR_INVERSE, %eax
191
192	movl	%eax, (%edi,%ebx,4)
193
194	incl	%ebx
195	jnz	L(odd_top)
196
197
198	popl	%ebp
199	popl	%ebx
200
201	popl	%edi
202	popl	%esi
203
204	ret
205
206
207L(even_top):
208	C eax	scratch
209	C ebx	counter, limbs, negative
210	C ecx	twos
211	C edx
212	C esi	src end
213	C edi	dst end
214	C ebp	carry bit, 0 or -1
215
216	mull	PARAM_DIVISOR
217
218	subl	%ebp, %edx		C carry bit
219	movl	-4(%esi,%ebx,4), %eax	C src limb
220
221	movl	(%esi,%ebx,4), %ebp	C and one above it
222
223	shrdl(	%cl, %ebp, %eax)
224
225	subl	%edx, %eax		C carry limb
226
227	sbbl	%ebp, %ebp
228
229L(even_entry):
230	imull	VAR_INVERSE, %eax
231
232	movl	%eax, -4(%edi,%ebx,4)
233	incl	%ebx
234
235	jnz	L(even_top)
236
237
238
239	mull	PARAM_DIVISOR
240
241	movl	-4(%esi), %eax		C src high limb
242	subl	%ebp, %edx
243
244	shrl	%cl, %eax
245
246	subl	%edx, %eax		C no carry if division is exact
247
248	imull	VAR_INVERSE, %eax
249
250	movl	%eax, -4(%edi)		C dst high limb
251	nop				C protect against cache bank clash
252
253	popl	%ebp
254	popl	%ebx
255
256	popl	%edi
257	popl	%esi
258
259	ret
260
261EPILOGUE()
262