xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium/bdiv_q_1.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  Intel Pentium mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- mpn by limb exact division.
2
3dnl  Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
4
5dnl  Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35
36C         divisor
37C       odd   even
38C P54:  24.5  30.5   cycles/limb
39C P55:  23.0  28.0
40
41MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
42
43C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
44C expected.  On P54 in the even case the shrdl pairing nonsense (see
45C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
46C further 1.5 slowdown for both odd and even.
47
48defframe(PARAM_SHIFT,  24)	C mpn_pi1_bdiv_q_1 only
49defframe(PARAM_INVERSE,20)	C mpn_pi1_bdiv_q_1 only
50defframe(PARAM_DIVISOR,16)
51defframe(PARAM_SIZE,   12)
52defframe(PARAM_SRC,    8)
53defframe(PARAM_DST,    4)
54
55dnl  re-use parameter space: dst is copied to %edi at L(common), then its slot holds the inverse
56define(VAR_INVERSE,`PARAM_DST')
57
58	TEXT
59
60	ALIGN(32)
61C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
62C                           mp_limb_t divisor);
63C Strips the low zero bits of divisor, derives the inverse of the odd part, then joins L(common).
64PROLOGUE(mpn_bdiv_q_1)
65deflit(`FRAME',0)
66
67	movl	$-1, %ecx		C becomes the shift count, 0 after the first incl
68	movl	PARAM_DIVISOR, %eax
69
70L(strip_twos):				C never ends if the divisor is zero
71	ASSERT(nz, `orl %eax, %eax')
72	shrl	%eax			C low bit goes to the carry flag
73	incl	%ecx			C shift count
74
75	jnc	L(strip_twos)		C incl leaves CF alone, so this tests the bit shifted out
76
77	leal	1(%eax,%eax), %edx	C d = odd part of divisor, eax is d>>1 here
78	andl	$127, %eax		C d/2, 7 bits, used as table index
79
80	pushl	%ebx		FRAME_pushl()
81	pushl	%ebp		FRAME_pushl()
82
83ifdef(`PIC',`
84ifdef(`DARWIN',`
85	LEA(	binvert_limb_table, %ebp)
86	movzbl	(%eax,%ebp), %eax	C inv 8 bits
87',`
88	call	L(here)			C pushes the address of L(here)
89L(here):
90	popl	%ebp			C eip
91
92	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
93	C AGI
94	movl	binvert_limb_table@GOT(%ebp), %ebp
95	C AGI
96	movzbl	(%eax,%ebp), %eax	C inv 8 bits
97')
98',`
99
100dnl non-PIC
101	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
102')
103
104	movl	%eax, %ebp		C inv, first refinement step
105	addl	%eax, %eax		C 2*inv
106
107	imull	%ebp, %ebp		C inv*inv
108
109	imull	%edx, %ebp		C inv*inv*d
110
111	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
112	movl	PARAM_SIZE, %ebx	C size, expected in ebx at L(common)
113
114	movl	%eax, %ebp		C second refinement step, same formula
115	addl	%eax, %eax		C 2*inv
116
117	imull	%ebp, %ebp		C inv*inv
118
119	imull	%edx, %ebp		C inv*inv*d
120
121	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
122	movl	%edx, PARAM_DIVISOR	C d without twos, the loops multiply by this slot
123
124	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
125	pushl	%eax	FRAME_pushl()
126	imull	PARAM_DIVISOR, %eax
127	cmpl	$1, %eax
128	popl	%eax	FRAME_popl()')
129
130	jmp	L(common)		C eax = inverse, ebx = size, ecx = shift, two pushes done
131EPILOGUE()
132
133C mp_limb_t mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
134C                               mp_limb_t divisor, mp_limb_t inverse, int shift)
135C Entry with inverse and shift supplied; presumably divisor is already odd here, confirm in callers.
136	ALIGN(32)
137PROLOGUE(mpn_pi1_bdiv_q_1)
138deflit(`FRAME',0)
139
140	movl	PARAM_SHIFT, %ecx	C shift count
141
142	pushl	%ebx		FRAME_pushl()
143	pushl	%ebp		FRAME_pushl()
144
145	movl	PARAM_SIZE, %ebx	C size
146	movl	PARAM_INVERSE, %eax	C inverse
147
148L(common):			C eax = inverse, ebx = size, ecx = shift, also entered from mpn_bdiv_q_1
149	pushl	%esi		FRAME_pushl()
150	push	%edi		FRAME_pushl()
151
152	movl	PARAM_SRC, %esi
153	movl	PARAM_DST, %edi		C read dst before its slot is reused
154	movl	%eax, VAR_INVERSE	C inverse now lives in the dst argument slot
155
156	leal	(%esi,%ebx,4), %esi	C src end
157	leal	(%edi,%ebx,4), %edi	C dst end
158
159	negl	%ebx			C -size
160
161	xorl	%ebp, %ebp		C initial carry bit
162
163	orl	%ecx, %ecx		C shift, ZF set when it is zero
164	movl	(%esi,%ebx,4), %eax	C src low limb
165	jz	L(odd_entry)		C no shifting needed, the movl left the flags alone
166
167	xorl	%edx, %edx		C initial carry limb (for even, if one)
168	incl	%ebx			C even path runs one limb ahead, hence its -4 offsets
169	jz	L(one)			C size is 1
170
171	movl	(%esi,%ebx,4), %edx	C src second limb (for even)
172	shrdl(	%cl, %edx, %eax)
173
174	jmp	L(even_entry)
175
176
177	ALIGN(8)
178L(odd_top):
179	C eax	scratch
180	C ebx	counter, limbs, negative
181	C ecx	shift, zero on this path
182	C edx	scratch
183	C esi	src end
184	C edi	dst end
185	C ebp	carry bit, 0 or -1
186
187	mull	PARAM_DIVISOR		C edx:eax = previous quotient limb * d
188
189	movl	(%esi,%ebx,4), %eax	C next src limb
190	subl	%ebp, %edx		C carry limb = high limb plus carry bit
191
192	subl	%edx, %eax		C src limb - carry limb
193
194	sbbl	%ebp, %ebp		C new carry bit, 0 or -1
195
196L(odd_entry):
197	imull	VAR_INVERSE, %eax	C quotient limb
198
199	movl	%eax, (%edi,%ebx,4)
200
201	incl	%ebx
202	jnz	L(odd_top)
203
204	popl	%edi			C restore in reverse order of the pushes
205	popl	%esi
206
207	popl	%ebp
208	popl	%ebx
209
210	ret
211
212L(even_top):
213	C eax	scratch
214	C ebx	counter, limbs, negative
215	C ecx	twos
216	C edx	scratch
217	C esi	src end
218	C edi	dst end
219	C ebp	carry bit, 0 or -1
220
221	mull	PARAM_DIVISOR		C edx:eax = previous quotient limb * d
222
223	subl	%ebp, %edx		C carry bit
224	movl	-4(%esi,%ebx,4), %eax	C src limb
225
226	movl	(%esi,%ebx,4), %ebp	C and one above it
227
228	shrdl(	%cl, %ebp, %eax)
229
230	subl	%edx, %eax		C carry limb
231
232	sbbl	%ebp, %ebp		C new carry bit, 0 or -1
233
234L(even_entry):
235	imull	VAR_INVERSE, %eax	C quotient limb
236
237	movl	%eax, -4(%edi,%ebx,4)
238	incl	%ebx
239
240	jnz	L(even_top)
241
242	mull	PARAM_DIVISOR		C only the high limb in edx is used below
243
244	movl	-4(%esi), %eax		C src high limb
245	subl	%ebp, %edx		C carry limb = high limb plus carry bit
246
247L(one):
248	shrl	%cl, %eax		C top limb, zeros shifted in from above
249
250	subl	%edx, %eax		C no carry if division is exact
251
252	imull	VAR_INVERSE, %eax	C quotient limb
253
254	movl	%eax, -4(%edi)		C dst high limb
255	nop				C protect against cache bank clash
256
257	popl	%edi			C restore in reverse order of the pushes
258	popl	%esi
259
260	popl	%ebp
261	popl	%ebx
262
263	ret
264
265EPILOGUE()
266ASM_END()
267