xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mod_1_4.asm (revision c3ab26950fe8540fb553d1d1dcae454bc98e5a25)
1dnl  x86-32 mpn_mod_1s_4p, requiring cmov.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4dnl
5dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
6dnl
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13dnl
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18dnl
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C			    cycles/limb
25C P5				 ?
26C P6 model 0-8,10-12		 ?
27C P6 model 9  (Banias)		 ?
28C P6 model 13 (Dothan)		 6
29C P4 model 0  (Willamette)	 ?
30C P4 model 1  (?)		 ?
31C P4 model 2  (Northwood)	15.5
32C P4 model 3  (Prescott)	 ?
33C P4 model 4  (Nocona)		 ?
34C AMD K6			 ?
35C AMD K7			 4.75
36C AMD K8			 ?
37
38ASM_START()
39	TEXT
40	ALIGN(16)
C mpn_mod_1s_4p
C
C Folds the n limbs at up[] into a two-limb accumulator, four limbs per loop
C iteration, using the constants in cps[], then reduces the accumulator to a
C single limb which is left in %eax.
C
C Stack arguments as addressed below, i.e. after the four pushes and the
C 28-byte local frame (16 + 28 = 44 bytes, return address at 44(%esp)):
C   48(%esp)  up     pointer to the source limbs
C   52(%esp)  n      limb count; this slot is decremented in place by the loop
C   56(%esp)  third argument, used as the divisor in the final reduction
C             NOTE(review): presumably already shifted left by cps[1] by the
C             caller, since the result is shifted right by that count at the
C             very end -- confirm against the callers.
C   60(%esp)  cps[]  table of precomputed constants
C
C cps[] byte offsets as read here (names are the ones used by the store
C comments in mpn_mod_1s_4p_cps later in this file):
C   0 bi, 4 cnt, 8 B1modb, 12 B2modb, 16 B3modb, 20 B4modb, 24 B5modb
C
C Local frame: 4(%esp)=B1modb, 8(%esp)=B2modb, 12(%esp)=B3modb,
C 16(%esp)=B4modb, 20(%esp)=B5modb.
C
C Register roles in the loop:
C   %esi       pointer into up[], stepping down 16 bytes per iteration
C   %edi:%ebp  running accumulator, high:low
C   %ecx:%ebx  accumulator being built for the current iteration
C   %eax,%edx  mul operands and products
41PROLOGUE(mpn_mod_1s_4p)
42	push	%ebp
43	push	%edi
44	push	%esi
45	push	%ebx
46	sub	$28, %esp
C Copy cps[2..6] into the local frame; %edi is the table pointer and is
C itself overwritten by the last load.
47	mov	60(%esp), %edi		C cps[]
48	mov	8(%edi), %eax
49	mov	12(%edi), %edx
50	mov	16(%edi), %ecx
51	mov	20(%edi), %esi
52	mov	24(%edi), %edi
53	mov	%eax, 4(%esp)
54	mov	%edx, 8(%esp)
55	mov	%ecx, 12(%esp)
56	mov	%esi, 16(%esp)
57	mov	%edi, 20(%esp)
C %edi = 0 (accumulator high limb), %esi = up + 4*n - 12, i.e. the address
C of up[n-3].  Then dispatch on n mod 4: 0 -> b0, 1 -> b1, 2 -> b2, 3 -> b3.
58	mov	52(%esp), %eax		C n
59	xor	%edi, %edi
60	mov	48(%esp), %esi		C up
61	lea	-12(%esi,%eax,4), %esi
62	and	$3, %eax
63	je	L(b0)
64	cmp	$2, %eax
65	jc	L(b1)
66	je	L(b2)
67
C n mod 4 = 3: accumulator = up[n-3] + up[n-2]*B1modb; the product
C up[n-1]*B2modb is left in %edx:%eax and added at L(m0).
68L(b3):	mov	4(%esi), %eax
69	mull	4(%esp)
70	mov	(%esi), %ebp
71	add	%eax, %ebp
72	adc	%edx, %edi
73	mov	8(%esi), %eax
74	mull	8(%esp)
75	lea	-12(%esi), %esi
76	jmp	L(m0)
77
C n mod 4 = 0: accumulator = up[n-4] + up[n-3]*B1modb + up[n-2]*B2modb; the
C product up[n-1]*B3modb is left in %edx:%eax and added at L(m0).
78L(b0):	mov	(%esi), %eax
79	mull	4(%esp)
80	mov	-4(%esi), %ebp
81	add	%eax, %ebp
82	adc	%edx, %edi
83	mov	4(%esi), %eax
84	mull	8(%esp)
85	add	%eax, %ebp
86	adc	%edx, %edi
87	mov	8(%esi), %eax
88	mull	12(%esp)
89	lea	-16(%esi), %esi
90	jmp	L(m0)
91
C n mod 4 = 1: accumulator = up[n-1], high limb stays 0.
92L(b1):	mov	8(%esi), %ebp
93	lea	-4(%esi), %esi
94	jmp	L(m1)
95
C n mod 4 = 2: accumulator = up[n-1]:up[n-2] (high:low).
96L(b2):	mov	8(%esi), %edi
97	mov	4(%esi), %ebp
98	lea	-8(%esi), %esi
99	jmp	L(m1)
100
C Main loop.  With the four limbs at -4(%esi), (%esi), 4(%esi), 8(%esi)
C called u0..u3, and the old accumulator %edi:%ebp called hi:lo, this builds
C   %ecx:%ebx = u0 + u1*B1modb + u2*B2modb + u3*B3modb + lo*B4modb
C and leaves hi*B5modb in %edx:%eax, which L(m0) adds after the new value has
C been moved into %edi:%ebp.
C NOTE(review): carries out of the high limb are not tracked; presumably the
C ranges of the precomputed constants rule them out -- confirm.
101	ALIGN(16)
102L(top):	mov	(%esi), %eax
103	mull	4(%esp)
104	mov	-4(%esi), %ebx
105	xor	%ecx, %ecx
106	add	%eax, %ebx
107	adc	%edx, %ecx
108	mov	4(%esi), %eax
109	mull	8(%esp)
110	add	%eax, %ebx
111	adc	%edx, %ecx
112	mov	8(%esi), %eax
113	mull	12(%esp)
114	add	%eax, %ebx
115	adc	%edx, %ecx
116	lea	-16(%esi), %esi
117	mov	16(%esp), %eax
118	mul	%ebp
119	add	%eax, %ebx
120	adc	%edx, %ecx
121	mov	20(%esp), %eax
122	mul	%edi
123	mov	%ebx, %ebp
124	mov	%ecx, %edi
C L(m0): add the pending product in %edx:%eax into the accumulator.
C L(m1): n -= 4 in its stack slot; ja loops only while the result is nonzero
C and the subtraction did not borrow.
125L(m0):	add	%eax, %ebp
126	adc	%edx, %edi
127L(m1):	sub	$4, 52(%esp)
128	ja	L(top)
129
C Final reduction.  First fold the high limb: %edx:%ebp = lo + hi*B1modb.
C Then, with cnt = cps[1] loaded into %ecx, form the high limb of
C (%edx:%ebp << cnt) in %eax: the neg/shr pair shifts %ebp right by
C (-cnt mod 32), relying on the hardware using only the low 5 bits of %cl.
C NOTE(review): for cnt = 0 that right shift would be by 0, not 32;
C presumably callers guarantee cnt > 0 -- confirm.
130L(end):	mov	4(%esp), %eax
131	mul	%edi
132	mov	60(%esp), %edi
133	add	%eax, %ebp
134	adc	$0, %edx
135	mov	4(%edi), %ecx
136	mov	%edx, %esi
137	mov	%ebp, %eax
138	sal	%cl, %esi
139	mov	%ecx, %ebx
140	neg	%ecx
141	shr	%cl, %eax
142	or	%esi, %eax
C %esi = high limb + 1.  Multiply the high limb by cps[0] (bi), add the
C shifted low limb (%ebp << cnt) to the low product word and %esi plus carry
C to the high product word; %edx is then the quotient estimate.
143	lea	1(%eax), %esi
144	mull	(%edi)
145	mov	%ebx, %ecx
146	mov	%eax, %ebx
147	mov	%ebp, %eax
148	mov	56(%esp), %ebp
149	sal	%cl, %eax
150	add	%eax, %ebx
151	adc	%esi, %edx
C %eax = shifted low limb - estimate * divisor (mod 2^32).  Two conditional
C corrections follow: add the divisor back when the low product word in
C %ebx is below the candidate remainder (unsigned), then subtract the
C divisor once and keep the difference only if it did not borrow.
152	imul	%ebp, %edx
153	sub	%edx, %eax
154	lea	(%eax,%ebp), %edx
155	cmp	%eax, %ebx
156	cmovc(	%edx, %eax)
157	mov	%eax, %edx
158	sub	%ebp, %eax
159	cmovc(	%edx, %eax)
C Tear down the frame and restore the saved registers.  %ecx still holds cnt
C (restored from %ebx above), so the remainder is shifted back down by cnt
C just before returning.
160	add	$28, %esp
161	pop	%ebx
162	pop	%esi
163	pop	%edi
164	pop	%ebp
165	shr	%cl, %eax
166	ret
167EPILOGUE()
168
169	ALIGN(16)
C mpn_mod_1s_4p_cps
C
C Fills the seven-entry 32-bit table consumed by mpn_mod_1s_4p.
C
C Stack arguments as addressed below, i.e. after the four pushes
C (return address at 16(%esp)):
C   20(%esp)  pointer to the table to fill, kept in %ebp
C   24(%esp)  b, the divisor, kept in %ebx and shifted left by cnt
C
C Table byte offsets written:
C   0       bi      quotient of the div instruction below
C   4       cnt     31 xor bsr(b), i.e. the number of leading zero bits of b
C   8..24   B1modb .. B5modb, each shifted right by cnt before being stored
C
C Register roles: %ecx = cnt throughout, %ebx = b << cnt, %edi = -(b << cnt)
C at first and bi afterwards, %esi = the current unshifted Bk value.
C
C NOTE(review): for b = 0 the bsr result is undefined and the div would
C fault; presumably callers never pass b = 0 -- confirm.
170PROLOGUE(mpn_mod_1s_4p_cps)
171C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
172	push	%ebp
173	push	%edi
174	push	%esi
175	push	%ebx
176	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
177	mov	24(%esp), %ebx
178	bsr	%ebx, %ecx
179	xor	$31, %ecx
180	sal	%cl, %ebx		C b << cnt
C Divide the 64-bit value (~%ebx):0xffffffff by %ebx; the quotient is bi.
C After the shift the top bit of %ebx is set, so %edx = ~%ebx is below %ebx
C and the quotient fits in 32 bits.
181	mov	%ebx, %edx
182	not	%edx
183	mov	$-1, %eax
184	div	%ebx
C %edi = -(b << cnt).  %esi = (1 << cnt) with the top cnt bits of bi shifted
C in below it, then multiplied by %edi (mod 2^32): the unshifted B1modb.
185	xor	%edi, %edi
186	sub	%ebx, %edi
187	mov	$1, %esi
188	mov	%eax, (%ebp)		C store bi
189	mov	%ecx, 4(%ebp)		C store cnt
190	shld	%cl, %eax, %esi
191	imul	%edi, %esi
192	mov	%eax, %edi
193	mul	%esi
194
C Reduction step, repeated for each further constant.  On entry %esi is the
C current value r and %edx:%eax = bi * r.  %edx += r is done before r is
C shifted right by cnt and stored.  Then %edx = ~%edx * %ebx (mod 2^32), and
C the next value is %edx if %eax >= %edx (unsigned), otherwise %edx + %ebx.
195	add	%esi, %edx
196	shr	%cl, %esi
197	mov	%esi, 8(%ebp)		C store B1modb
198
199	not	%edx
200	imul	%ebx, %edx
201	lea	(%edx,%ebx), %esi
202	cmp	%edx, %eax
203	cmovnc(	%edx, %esi)
204	mov	%edi, %eax
205	mul	%esi
206
C Same step: store B2modb, derive the next value.
207	add	%esi, %edx
208	shr	%cl, %esi
209	mov	%esi, 12(%ebp)		C store B2modb
210
211	not	%edx
212	imul	%ebx, %edx
213	lea	(%edx,%ebx), %esi
214	cmp	%edx, %eax
215	cmovnc(	%edx, %esi)
216	mov	%edi, %eax
217	mul	%esi
218
C Same step: store B3modb, derive the next value.
219	add	%esi, %edx
220	shr	%cl, %esi
221	mov	%esi, 16(%ebp)		C store B3modb
222
223	not	%edx
224	imul	%ebx, %edx
225	lea	(%edx,%ebx), %esi
226	cmp	%edx, %eax
227	cmovnc(	%edx, %esi)
228	mov	%edi, %eax
229	mul	%esi
230
C Same step: store B4modb.  The last value is built directly in %ebx, since
C the shifted divisor is not needed afterwards.
231	add	%esi, %edx
232	shr	%cl, %esi
233	mov	%esi, 20(%ebp)		C store B4modb
234
235	not	%edx
236	imul	%ebx, %edx
237	add	%edx, %ebx
238	cmp	%edx, %eax
239	cmovnc(	%edx, %ebx)
240
241	shr	%cl, %ebx
242	mov	%ebx, 24(%ebp)		C store B5modb
243
244	pop	%ebx
245	pop	%esi
246	pop	%edi
247	pop	%ebp
248	ret
249EPILOGUE()
250