xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mod_1_4.asm (revision 51c5f9b7c2b2cc93506078d2cab158634a65201f)
dnl  x86-32 mpn_mod_1s_4p, requiring cmov.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                           cycles/limb
C P5:
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)		 6.0
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)	15.5
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C K6:
C K7:                            4.75
C K8:


C This inner loop was manually written; it ought to be loopmixed.
C Presumably, we could get to 4 c/l for K7.

C The cps function was compiler generated.  It can clearly be optimized.

ASM_START()
	TEXT

C mp_limb_t mpn_mod_1s_4p (mp_srcptr up, mp_size_t n, mp_limb_t b,
C                          const mp_limb_t cps[7])
C
C Division-free remainder: reduces the n-limb operand at up modulo b,
C using the table precomputed by mpn_mod_1s_4p_cps below.
C
C Stack argument offsets, valid after the four pushes and the 28-byte
C local frame established in the prologue:
C   48(%esp) = up,  52(%esp) = n,  56(%esp) = b,  60(%esp) = cps
C
C cps fields read here (B = 2^32):
C   (cps)   = precomputed reciprocal of the divisor (used at L(end))
C   4(cps)  = cnt, the normalization shift count
C   8..24(cps) = B^1..B^5 mod b
C
C Register roles in the main loop:
C   %ebp:%edi = pl:ph, the two-limb partial remainder (lo:hi)
C   %esi      = pointer into up[], walked downward four limbs per pass
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	sub	$28, %esp
	mov	60(%esp), %edi		C cps
	mov	8(%edi), %eax		C B1modb
	mov	12(%edi), %edx		C B2modb
	mov	16(%edi), %ecx		C B3modb
	mov	20(%edi), %esi		C B4modb
	mov	24(%edi), %edi		C B5modb
C Cache the five table entries in the local frame at 4..20(%esp),
C so the loop can use them as memory operands to mull.
	mov	%eax, 4(%esp)
	mov	%edx, 8(%esp)
	mov	%ecx, 12(%esp)
	mov	%esi, 16(%esp)
	mov	%edi, 20(%esp)
	mov	52(%esp), %eax		C n
	xor	%edi, %edi		C ph = 0
	mov	48(%esp), %esi		C up
	lea	-12(%esi,%eax,4), %esi	C esi = &up[n-3], top 4-limb window
	and	$3, %eax		C dispatch on n mod 4 to prime pl:ph
	je	L(b0)
	cmp	$2, %eax
	jc	L(b1)
	je	L(b2)

C n = 3 (mod 4): pl:ph = up[n-3] + up[n-2]*B1modb, with up[n-1]*B2modb
C left pending in edx:eax and folded in at L(m0).
L(b3):	mov	4(%esi), %eax
	mull	4(%esp)
	mov	(%esi), %ebp
	add	%eax, %ebp
	adc	%edx, %edi
	mov	8(%esi), %eax
	mull	8(%esp)
	lea	-12(%esi), %esi
	jmp	L(m0)

C n = 0 (mod 4): fold the top four limbs,
C pl:ph = up[n-4] + up[n-3]*B1modb + up[n-2]*B2modb (+ up[n-1]*B3modb at L(m0)).
L(b0):	mov	(%esi), %eax
	mull	4(%esp)
	mov	-4(%esi), %ebp
	add	%eax, %ebp
	adc	%edx, %edi
	mov	4(%esi), %eax
	mull	8(%esp)
	add	%eax, %ebp
	adc	%edx, %edi
	mov	8(%esi), %eax
	mull	12(%esp)
	lea	-16(%esi), %esi
	jmp	L(m0)

C n = 1 (mod 4): partial remainder is just the top limb, ph stays 0.
L(b1):	mov	8(%esi), %ebp
	lea	-4(%esi), %esi
	jmp	L(m1)

C n = 2 (mod 4): pl = up[n-2], with up[n-1]*B1modb pending for L(m0).
L(b2):	mov	8(%esi), %eax
	mull	4(%esp)
	mov	4(%esi), %ebp
	lea	-8(%esi), %esi
	jmp	L(m0)

C Main loop: each pass folds four more limbs (call them u0..u3, low to
C high, at -4..8(%esi)) into the partial remainder:
C   pl:ph <- u0 + u1*B1modb + u2*B2modb + u3*B3modb + pl*B4modb + ph*B5modb
C The sums are accumulated in ebx:ecx, with the last product (ph*B5modb)
C still in edx:eax when control reaches L(m0).
	ALIGN(16)
L(top):	mov	(%esi), %eax
	mull	4(%esp)			C u1 * B1modb
	mov	-4(%esi), %ebx		C u0
	xor	%ecx, %ecx
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	4(%esi), %eax
	mull	8(%esp)			C u2 * B2modb
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	8(%esi), %eax
	mull	12(%esp)		C u3 * B3modb
	add	%eax, %ebx
	adc	%edx, %ecx
	lea	-16(%esi), %esi		C step down four limbs
	mov	16(%esp), %eax
	mul	%ebp			C pl * B4modb
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	20(%esp), %eax
	mul	%edi			C ph * B5modb (folded in at L(m0))
	mov	%ebx, %ebp
	mov	%ecx, %edi
L(m0):	add	%eax, %ebp		C add the pending product in edx:eax
	adc	%edx, %edi
L(m1):	sub	$4, 52(%esp)		C n -= 4; iterate while limbs remain
	ja	L(top)

C Final reduction.  First fold ph into pl via B1modb, leaving a two-limb
C value in edx:ebp.  Then normalize it left by cnt = 4(cps) and divide by
C the divisor using the precomputed reciprocal (cps), in the classic
C multiply-and-correct (udiv_qrnnd_preinv-style) sequence, undoing the
C shift at the very end.
L(end):	mov	4(%esp), %eax		C B1modb
	mul	%edi			C ph * B1modb
	mov	60(%esp), %edi		C cps
	add	%eax, %ebp
	adc	$0, %edx
	mov	4(%edi), %ecx		C cnt
	mov	%edx, %esi
	mov	%ebp, %eax
	sal	%cl, %esi
	mov	%ecx, %ebx		C save cnt across the neg below
	neg	%ecx			C shr masks the count mod 32, so
	shr	%cl, %eax		C   this shifts by 32-cnt
	or	%esi, %eax		C eax = high limb of value << cnt
	lea	1(%eax), %esi		C n2 + 1 term for the quotient estimate
	mull	(%edi)			C multiply by the stored reciprocal
	mov	%ebx, %ecx		C restore cnt
	mov	%eax, %ebx		C low product, kept for the correction test
	mov	%ebp, %eax
	sal	%cl, %eax		C low limb of value << cnt
	add	%eax, %ebx
	adc	%esi, %edx		C edx = quotient estimate
	imul	56(%esp), %edx		C q * b
	mov	56(%esp), %esi		C b
	sub	%edx, %eax		C candidate remainder
	lea	(%eax,%esi), %edx	C candidate + b, ready for correction
	cmp	%eax, %ebx
	cmovb(	%edx, %eax)		C first correction: add b back if the
					C   estimate over-reduced
	mov	%eax, %edx
	sub	%esi, %eax		C second correction: try remainder - b,
	cmovb(	%edx, %eax)		C   keep the original if that borrows
	add	$28, %esp
	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	shr	%cl, %eax		C undo the normalization shift
	ret
EPILOGUE()
177
C void mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
C
C Precompute the table consumed by mpn_mod_1s_4p:
C   cps[0] = reciprocal of the normalized divisor (from the div below)
C   cps[1] = cnt, the normalization shift count (leading zeros of b)
C   cps[2..6] = B^k mod (b << cnt) for k = 1..5, each stored >> cnt
C     (B = 2^32)
C
C Compiler generated (see the file comment), hence the redundant spills
C and reloads.  Stack offsets after the 56-byte frame:
C   60(%esp) = cps,  64(%esp) = b
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
	sub	$56, %esp
	mov	%esi, 44(%esp)		C save callee-saved regs in the frame
	mov	64(%esp), %esi		C b
	mov	%edi, 48(%esp)
	mov	%ebx, 40(%esp)
	mov	$-1, %ebx
	mov	%ebp, 52(%esp)
	bsr	%esi, %eax		C index of b's highest set bit
	xor	$31, %eax		C cnt = 31 - bsr = leading zeros of b
	mov	%eax, %ecx
	mov	%eax, 24(%esp)		C cnt
	mov	%ebx, %eax		C 0xffffffff
	sal	%cl, %esi		C esi = b << cnt, the normalized divisor
	mov	%esi, %ecx
	mov	%esi, %edi
	mov	%esi, %ebp
	neg	%ecx
	not	%edi
	mov	%ecx, 20(%esp)		C -b (normalized)
	mov	$32, %ecx
	sub	24(%esp), %ecx		C ecx = 32 - cnt
	mov	%edi, %edx		C ~b
	mov	%edi, 16(%esp)
	mov	20(%esp), %edi		C -b
	div	%esi			C bi = (~b:0xffffffff) / b, the
					C   reciprocal of the normalized b
	mov	%eax, %ebx		C bi
	shr	%cl, %eax		C bi >> (32-cnt)
	movzbl	24(%esp), %ecx
	mov	%eax, 12(%esp)
	mov	$1, %eax
	sal	%cl, %eax		C 1 << cnt
	or	%eax, 12(%esp)		C (bi >> (32-cnt)) | (1 << cnt)
	imul	12(%esp), %edi		C B1modb = -b * that (still shifted)
	mov	%edi, %eax
	mov	%edi, 20(%esp)		C B1modb
C Each subsequent power is B(k+1)modb = (Bkmodb * B) mod b, computed
C with the reciprocal bi: take the high product, form the candidate
C remainder -(hi + Bkmodb + 1) * b, and cmov-correct by b when the
C quotient estimate was off by one.
	mul	%ebx			C Bkmodb * bi
	mov	%eax, %ecx
	lea	1(%edx,%edi), %eax	C quotient estimate + 1
	neg	%eax
	imul	%eax, %ebp		C candidate remainder
	lea	(%ebp,%esi), %eax	C candidate + b
	cmp	%ebp, %ecx
	cmovb(	%eax, %ebp)		C ebp = B2modb
	mov	%ebp, %eax
	mul	%ebx
	lea	1(%ebp,%edx), %edi
	mov	%eax, %ecx
	neg	%edi
	mov	%edi, 8(%esp)
	imul	%esi, %edi
	mov	%edi, %eax
	add	%esi, %eax
	cmp	%edi, %ecx
	cmovae(	%edi, %eax)
	mov	%eax, 32(%esp)		C B3modb
	mov	32(%esp), %edi
	mul	%ebx
	mov	%eax, 36(%esp)
	lea	1(%edi,%edx), %eax
	negl	%eax
	imul	%esi, %eax
	mov	%eax, %ecx
	add	%esi, %ecx
	cmp	%eax, 36(%esp)
	cmovae(	%eax, %ecx)
	mov	%ecx, (%esp)		C B4modb
	mov	%ecx, %eax
	mul	%ebx
	mov	%eax, %edi
	mov	(%esp), %eax
	lea	1(%eax,%edx), %ecx
	mov	60(%esp), %edx		C cps
	neg	%ecx
	imul	%esi, %ecx
	mov	%ebx, (%edx)		C cps[0] = bi
	add	%ecx, %esi
	cmp	%ecx, %edi
	cmovae(	%ecx, %esi)		C esi = B5modb
	mov	24(%esp), %ecx		C cnt
	shrl	%cl, 20(%esp)		C stored B values are shifted right by cnt
	mov	20(%esp), %edi
	mov	%esi, 4(%esp)
	mov	%ecx, 4(%edx)		C cps[1] = cnt
	movzbl	24(%esp), %ecx
	mov	%edi, 8(%edx)		C cps[2] = B1modb >> cnt
	shr	%cl, %ebp
	shr	%cl, %eax
	mov	%ebp, 12(%edx)		C cps[3] = B2modb >> cnt
	shrl	%cl, 32(%esp)
	mov	32(%esp), %edi
	shrl	%cl, 4(%esp)
	mov	%eax, 20(%edx)		C cps[5] = B4modb >> cnt
	mov	%edi, 16(%edx)		C cps[4] = B3modb >> cnt
	mov	4(%esp), %edi
	mov	%edi, 24(%edx)		C cps[6] = B5modb >> cnt
	mov	40(%esp), %ebx		C restore callee-saved regs
	mov	44(%esp), %esi
	mov	48(%esp), %edi
	mov	52(%esp), %ebp
	add	$56, %esp
	ret
EPILOGUE()
282