xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/p6/dive_1.asm (revision 2f62cc9c12bc202c40224f32c879f81443fee079)
1dnl  Intel P6 mpn_modexact_1_odd -- exact division style remainder.
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C       odd  even  divisor
35C P6:  10.0  12.0  cycles/limb
36
37
38C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
39C                      mp_limb_t divisor);
40C
41C The odd case is basically the same as mpn_modexact_1_odd, just with an
42C extra store, and it runs at the same 10 cycles which is the dependent
43C chain.
44C
45C The shifts for the even case aren't on the dependent chain so in principle
46C it could run the same too, but nothing running at 10 has been found.
47C Perhaps there's too many uops (an extra 4 over the odd case).
48
49defframe(PARAM_DIVISOR,16)
50defframe(PARAM_SIZE,   12)
51defframe(PARAM_SRC,     8)
52defframe(PARAM_DST,     4)
53
54defframe(SAVE_EBX,     -4)
55defframe(SAVE_ESI,     -8)
56defframe(SAVE_EDI,    -12)
57defframe(SAVE_EBP,    -16)
58defframe(VAR_INVERSE, -20)
59deflit(STACK_SPACE, 20)
60
61	TEXT
62
63	ALIGN(16)
64PROLOGUE(mpn_divexact_1)
65deflit(`FRAME',0)
66
67	movl	PARAM_DIVISOR, %eax
68	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
69
70	movl	%esi, SAVE_ESI
71	movl	PARAM_SRC, %esi
72
73	movl	%ebx, SAVE_EBX
74	movl	PARAM_SIZE, %ebx
75
76	bsfl	%eax, %ecx		C trailing twos
77
78	movl	%ebp, SAVE_EBP
79
80	shrl	%cl, %eax		C d without twos
81
82	movl	%eax, %edx
83	shrl	%eax			C d/2 without twos
84
85	movl	%edx, PARAM_DIVISOR
86	andl	$127, %eax
87
88ifdef(`PIC',`
89	LEA(	binvert_limb_table, %ebp)
90	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
91',`
92	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
93')
94
95	leal	(%ebp,%ebp), %eax	C 2*inv
96
97	imull	%ebp, %ebp		C inv*inv
98
99	movl	%edi, SAVE_EDI
100	movl	PARAM_DST, %edi
101
102	leal	(%esi,%ebx,4), %esi	C src end
103
104	imull	PARAM_DIVISOR, %ebp	C inv*inv*d
105
106	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
107	leal	(%eax,%eax), %ebp	C 2*inv
108
109	imull	%eax, %eax		C inv*inv
110
111	leal	(%edi,%ebx,4), %edi	C dst end
112	negl	%ebx			C -size
113
114	movl	%edi, PARAM_DST
115
116	imull	PARAM_DIVISOR, %eax	C inv*inv*d
117
118	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
119
120	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
121	movl	PARAM_DIVISOR, %eax
122	imull	%ebp, %eax
123	cmpl	$1, %eax')
124
125	movl	%ebp, VAR_INVERSE
126	movl	(%esi,%ebx,4), %eax	C src[0]
127
128	orl	%ecx, %ecx
129	jnz	L(even)
130
131	C ecx initial carry is zero
132	jmp	L(odd_entry)
133
134
135C The dependent chain here is
136C
137C	subl	%edx, %eax       1
138C	imull	%ebp, %eax       4
139C	mull	PARAM_DIVISOR    5
140C			       ----
141C	total			10
142C
143C and this is the measured speed.  No special scheduling is necessary, out
144C of order execution hides the load latency.
145
146L(odd_top):
147	C eax	scratch (src limb)
148	C ebx	counter, limbs, negative
149	C ecx	carry bit
150	C edx	carry limb, high of last product
151	C esi	&src[size]
152	C edi	&dst[size]
153	C ebp
154
155	mull	PARAM_DIVISOR
156
157	movl	(%esi,%ebx,4), %eax
158	subl	%ecx, %eax
159
160	sbbl	%ecx, %ecx
161	subl	%edx, %eax
162
163	sbbl	$0, %ecx
164
165L(odd_entry):
166	imull	VAR_INVERSE, %eax
167
168	movl	%eax, (%edi,%ebx,4)
169	negl	%ecx
170
171	incl	%ebx
172	jnz	L(odd_top)
173
174
175	movl	SAVE_ESI, %esi
176
177	movl	SAVE_EDI, %edi
178
179	movl	SAVE_EBP, %ebp
180
181	movl	SAVE_EBX, %ebx
182	addl	$STACK_SPACE, %esp
183
184	ret
185
186
187L(even):
188	C eax	src[0]
189	C ebx	counter, limbs, negative
190	C ecx	shift
191	C edx
192	C esi
193	C edi
194	C ebp
195
196	xorl	%ebp, %ebp		C initial carry bit
197	xorl	%edx, %edx		C initial carry limb (for size==1)
198
199	incl	%ebx
200	jz	L(even_one)
201
202	movl	(%esi,%ebx,4), %edi	C src[1]
203
204	shrdl(	%cl, %edi, %eax)
205
206	jmp	L(even_entry)
207
208
209L(even_top):
210	C eax	scratch
211	C ebx	counter, limbs, negative
212	C ecx	shift
213	C edx	scratch
214	C esi	&src[size]
215	C edi	&dst[size] and scratch
216	C ebp	carry bit
217
218	movl	(%esi,%ebx,4), %edi
219
220	mull	PARAM_DIVISOR
221
222	movl	-4(%esi,%ebx,4), %eax
223	shrdl(	%cl, %edi, %eax)
224
225	subl	%ebp, %eax
226
227	sbbl	%ebp, %ebp
228	subl	%edx, %eax
229
230	sbbl	$0, %ebp
231
232L(even_entry):
233	imull	VAR_INVERSE, %eax
234
235	movl	PARAM_DST, %edi
236	negl	%ebp
237
238	movl	%eax, -4(%edi,%ebx,4)
239	incl	%ebx
240	jnz	L(even_top)
241
242
243
244	mull	PARAM_DIVISOR
245
246	movl	-4(%esi), %eax
247
248L(even_one):
249	shrl	%cl, %eax
250	movl	SAVE_ESI, %esi
251
252	subl	%ebp, %eax
253	movl	SAVE_EBP, %ebp
254
255	subl	%edx, %eax
256	movl	SAVE_EBX, %ebx
257
258	imull	VAR_INVERSE, %eax
259
260	movl	%eax, -4(%edi)
261	movl	SAVE_EDI, %edi
262	addl	$STACK_SPACE, %esp
263
264	ret
265
266EPILOGUE()
267ASM_END()
268