xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium/bdiv_q_1.asm (revision 479d8f7d843cc1b22d497efdf1f27a50ee8418d4)
1dnl  Intel Pentium mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- mpn by limb exact division.
2
3dnl  Copyright 2001, 2002, 2011 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or
10dnl  modify it under the terms of the GNU Lesser General Public License as
11dnl  published by the Free Software Foundation; either version 3 of the
12dnl  License, or (at your option) any later version.
13dnl
14dnl  The GNU MP Library is distributed in the hope that it will be useful,
15dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
16dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17dnl  Lesser General Public License for more details.
18dnl
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24
25C         divisor
26C       odd   even
27C P54:  24.5  30.5   cycles/limb
28C P55:  23.0  28.0
29
30MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
31
32C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
33C expected.  On P54 in the even case the shrdl pairing nonsense (see
34C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
35C further 1.5 slowdown for both odd and even.
36
C Stack slots of the incoming arguments.  mpn_bdiv_q_1 is called with the
C first four (dst, src, size, divisor); mpn_pi1_bdiv_q_1 is called with
C all six (adding inverse and shift).
C NOTE(review): the numbers are presumably byte offsets from the stack
C pointer at function entry, adjusted by the FRAME bookkeeping -- confirm
C against the defframe definition in the x86 m4 macros.
37defframe(PARAM_SHIFT,  24)
38defframe(PARAM_INVERSE,20)
39defframe(PARAM_DIVISOR,16)
40defframe(PARAM_SIZE,   12)
41defframe(PARAM_SRC,    8)
42defframe(PARAM_DST,    4)
43
44dnl  re-use parameter space
C VAR_INVERSE aliases the dst argument slot.  The code at L(common) loads
C dst into a register first and only then stores the inverse into this
C slot, so the loops can use the inverse as a memory operand of imull.
45define(VAR_INVERSE,`PARAM_DST')
46
47	TEXT
48
49	ALIGN(32)
50C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
51C                           mp_limb_t divisor);
52C
C Entry point taking a raw divisor.  It does no division itself: it
C prepares the values the shared loop code needs and then jumps to
C L(common) inside mpn_pi1_bdiv_q_1.  On reaching L(common):
C	eax	inverse of d modulo 2^GMP_LIMB_BITS
C	ebx	size
C	ecx	shift, the number of low zero bits removed from divisor
C	PARAM_DIVISOR	overwritten with d, the divisor without those bits
C ebx and ebp are pushed here and restored by the code after L(common).
C
C The divisor must be non-zero; the strip loop below would not terminate
C sensibly otherwise (the ASSERT checks this in assert-enabled builds).
C
C Blank lines separate the instruction groups as scheduled for the
C Pentium; keep the ordering when editing.
53PROLOGUE(mpn_bdiv_q_1)
54deflit(`FRAME',0)
55
56	movl	$-1, %ecx		C count starts at -1, first incl makes it 0
57	movl	PARAM_DIVISOR, %eax
58
59L(strip_twos):
60	ASSERT(nz, `orl %eax, %eax')
61	shrl	%eax			C CF = bit shifted out
62	incl	%ecx			C shift count
63
64	jnc	L(strip_twos)		C incl leaves CF alone, so this tests the shrl bit
65
	C Here the lowest set bit has just been shifted out: eax = d/2
	C (rounded down) and ecx = number of zero bits below it.
66	leal	1(%eax,%eax), %edx	C d
67	andl	$127, %eax		C d/2, 7 bits
68
69	pushl	%ebx		FRAME_pushl()
70	pushl	%ebp		FRAME_pushl()
71
72ifdef(`PIC',`
	C Position independent build: find the table address through the GOT.
73	call	L(here)
74L(here):
75	popl	%ebp			C eip
76
77	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
78	C AGI
79	movl	binvert_limb_table@GOT(%ebp), %ebp
80	C AGI
81	movzbl	(%eax,%ebp), %eax	C inv 8 bits, indexed by the 7 bits of d/2
82',`
83
84dnl non-PIC
85	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
86')
87
	C Two rounds of inv = 2*inv - inv*inv*d refine the 8-bit table value
	C into the full-limb inverse that the ASSERT below expects
	C (d*inv == 1 mod 2^GMP_LIMB_BITS).
88	movl	%eax, %ebp		C inv
89	addl	%eax, %eax		C 2*inv
90
91	imull	%ebp, %ebp		C inv*inv
92
93	imull	%edx, %ebp		C inv*inv*d
94
95	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
96	movl	PARAM_SIZE, %ebx	C size, wanted in ebx at L(common)
97
98	movl	%eax, %ebp		C inv
99	addl	%eax, %eax		C 2*inv
100
101	imull	%ebp, %ebp		C inv*inv
102
103	imull	%edx, %ebp		C inv*inv*d
104
105	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
106	movl	%edx, PARAM_DIVISOR	C d without twos
107
108	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
109	pushl	%eax	FRAME_pushl()
110	imull	PARAM_DIVISOR, %eax
111	cmpl	$1, %eax
112	popl	%eax	FRAME_popl()')
113
114	jmp	L(common)		C continue in mpn_pi1_bdiv_q_1: eax=inverse, ebx=size, ecx=shift
115EPILOGUE()
116
117C mp_limb_t
118C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
119C		    mp_limb_t inverse, int shift)
C
C Entry point taking a precomputed inverse and shift.  The code from
C L(common) onwards is also reached by the jmp at the end of mpn_bdiv_q_1.
C
C Working from the low limb to the high limb, each step computes
C	q = (s - c) * inverse		low limb of the product only
C	c = high limb of q*divisor, plus the borrow from s - c
C and stores q to dst.  c and the borrow start at zero.  s is the src
C limb when shift is zero (odd loop); otherwise it is the src operand
C shifted right by shift bits, formed with shrdl from two adjacent limbs
C (even loop), with the top limb shifted by a plain shrl.
C
C NOTE(review): PARAM_DIVISOR is used unchanged as the mull operand, so a
C caller of this entry point presumably passes the divisor with its low
C zero bits already removed, matching what mpn_bdiv_q_1 stores there --
C confirm against callers.
C NOTE(review): size is presumably at least 1; the low src limb is loaded
C before any test of size.
C NOTE(review): the prototype returns mp_limb_t, but no instruction sets
C up a dedicated return value; at each ret eax still holds the last
C quotient limb stored.  Confirm what callers expect.
120	ALIGN(32)
121PROLOGUE(mpn_pi1_bdiv_q_1)
122deflit(`FRAME',0)
123
124	movl	PARAM_SHIFT, %ecx
125
126	pushl	%ebx		FRAME_pushl()
127	pushl	%ebp		FRAME_pushl()
128
129	movl	PARAM_SIZE, %ebx
130	movl	PARAM_INVERSE, %eax
131
132L(common):
	C eax	inverse
	C ebx	size
	C ecx	shift
	C ebx and ebp already pushed, by either entry point
133	pushl	%esi		FRAME_pushl()
134	push	%edi		FRAME_pushl()
135
136	movl	PARAM_SRC, %esi
137	movl	PARAM_DST, %edi		C fetch dst before its slot is reused
138	movl	%eax, VAR_INVERSE	C inverse now lives in the dst slot
139
140	leal	(%esi,%ebx,4), %esi	C src end
141	leal	(%edi,%ebx,4), %edi	C dst end
142
143	negl	%ebx			C -size
144
145	xorl	%ebp, %ebp		C initial carry bit
146
147	orl	%ecx, %ecx		C shift
148	movl	(%esi,%ebx,4), %eax	C src low limb
149	jz	L(odd_entry)		C flags still from orl: shift == 0, no shifting needed
150
151	xorl	%edx, %edx		C initial carry limb (for even, if one)
152	incl	%ebx			C counter runs one ahead in the even code
153	jz	L(one)			C size == 1, only the high limb to do
154
155	movl	(%esi,%ebx,4), %edx	C src second limb (for even)
156	shrdl(	%cl, %edx, %eax)
157
158	jmp	L(even_entry)
159
160
161	ALIGN(8)
162L(odd_top):
163	C eax	scratch
164	C ebx	counter, limbs, negative
165	C ecx	(shift, zero on this path, not used)
166	C edx	(scratch, high limb of the product after mull)
167	C esi	src end
168	C edi	dst end
169	C ebp	carry bit, 0 or -1
170
171	mull	PARAM_DIVISOR		C edx:eax = previous q * divisor
172
173	movl	(%esi,%ebx,4), %eax	C src limb
174	subl	%ebp, %edx		C carry limb plus carry bit (ebp is 0 or -1)
175
176	subl	%edx, %eax		C src limb minus carry limb
177
178	sbbl	%ebp, %ebp		C new carry bit from the borrow, 0 or -1
179
180L(odd_entry):
181	imull	VAR_INVERSE, %eax	C q
182
183	movl	%eax, (%edi,%ebx,4)	C store quotient limb
184
185	incl	%ebx
186	jnz	L(odd_top)
187
	C Restore in reverse order of the pushes.
188	popl	%edi
189	popl	%esi
190
191	popl	%ebp
192	popl	%ebx
193
194	ret
195
196L(even_top):
197	C eax	scratch
198	C ebx	counter, limbs, negative
199	C ecx	twos
200	C edx	(scratch, high limb of the product after mull)
201	C esi	src end
202	C edi	dst end
203	C ebp	carry bit, 0 or -1
204
205	mull	PARAM_DIVISOR		C edx:eax = previous q * divisor
206
207	subl	%ebp, %edx		C carry bit
208	movl	-4(%esi,%ebx,4), %eax	C src limb
209
210	movl	(%esi,%ebx,4), %ebp	C and one above it
					C (ebp is free here, its carry bit is already in edx)
211
212	shrdl(	%cl, %ebp, %eax)
213
214	subl	%edx, %eax		C carry limb
215
216	sbbl	%ebp, %ebp		C new carry bit from the borrow, 0 or -1
217
218L(even_entry):
219	imull	VAR_INVERSE, %eax	C q
220
221	movl	%eax, -4(%edi,%ebx,4)	C store quotient limb (counter is one ahead)
222	incl	%ebx
223
224	jnz	L(even_top)
225
	C Counter has reached zero: only the high limb is left, and it has
	C no limb above it to shift in, so it is handled separately below.
226	mull	PARAM_DIVISOR
227
228	movl	-4(%esi), %eax		C src high limb
229	subl	%ebp, %edx		C carry limb plus carry bit
230
231L(one):
	C eax	src high limb, not yet shifted
	C edx	carry limb (zero when entered directly for size == 1)
232	shrl	%cl, %eax
233
234	subl	%edx, %eax		C no carry if division is exact
235
236	imull	VAR_INVERSE, %eax
237
238	movl	%eax, -4(%edi)		C dst high limb
239	nop				C protect against cache bank clash
240
	C Restore in reverse order of the pushes.
241	popl	%edi
242	popl	%esi
243
244	popl	%ebp
245	popl	%ebx
246
247	ret
248
249EPILOGUE()
250