xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/mul_basecase.asm (revision 1897181a7231d5fc7ab48994d1447fcbc4e13a49)
1dnl  mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3dnl  Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C TODO:
23C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
24C    scheduling could improve things by several cycles per outer iteration.
25C  * In code for un <= 3, try keeping accumulation operands in registers,
26C    without storing intermediates to rp.
27C  * We might want to keep 32 in a free mm register, since the register form is
28C    3 bytes and the immediate form is 4 bytes.  About 70 bytes to save.
29C  * Look into different loop alignment, we now expand the code about 50 bytes
30C    with possibly needless alignment.
31C  * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
32C  * Use OSP, should solve feed-in latency problems.
33C  * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
34C  * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
35C    so that they can share feed-in code, and changing the branch targets from
36C    L<n> to Lm<nn>.
37
38C                           cycles/limb
39C P6 model 9   (Banias)         ?
40C P6 model 13  (Dothan)         5.24
41C P6 model 14  (Yonah)          ?
42C P4 model 0-1 (Willamette):    5
43C P4 model 2   (Northwood):     4.60 at 32 limbs
44C P4 model 3-4 (Prescott):      4.94 at 32 limbs
45
46C INPUT PARAMETERS
47C rp		sp + 4
48C up		sp + 8
49C un		sp + 12
50C vp		sp + 16
51C vn		sp + 20
52
53	TEXT
54	ALIGN(16)
55PROLOGUE(mpn_mul_basecase)
C Entry point plus the straight-line code for un <= 3.
C Multiplies the un limbs at up by the vn limbs at vp and stores the product
C limbs at rp.  Register roles set up below:
C   edx = rp, eax = up, ecx = un, esi = vp, ebx = vn (rows still to do)
C   mm7 = v limb of the current row
C   mm6 = 64-bit running sum; its low half is stored as a result limb and
C         it is then shifted right 32 bits so the high half becomes the carry
C esi and ebx are pushed here and popped again before every ret.
56	push	%esi
57	push	%ebx
C Two registers were pushed, so the arguments now sit 8 bytes higher than
C the offsets listed under INPUT PARAMETERS.
58	mov	12(%esp), %edx		C rp
59	mov	16(%esp), %eax		C up
60	mov	20(%esp), %ecx		C un
61	mov	24(%esp), %esi		C vp
62	mov	28(%esp), %ebx		C vn
63	movd	(%esi), %mm7		C
C un > 3 is handled by the general code at L(big).  The jz L(un3) below still
C tests the flags of this cmp: movd and pmuludq do not modify EFLAGS.
64L(ent):	cmp	$3, %ecx
65	ja	L(big)
66	movd	(%eax), %mm6
67	pmuludq	%mm7, %mm6
68	jz	L(un3)
69	cmp	$2, %ecx
70	jz	L(un2)
71
C un = 1: store the two halves of up[0] * vp[0] to rp[0] and rp[1].
C vn is not examined on this path; presumably callers guarantee
C un >= vn >= 1 so that vn = 1 here - TODO confirm against the documented
C mpn_mul_basecase contract.
72L(un1):	movd	%mm6, (%edx)		C				un=1
73	psrlq	$32, %mm6		C				un=1
74	movd	%mm6, 4(%edx)		C				un=1
75	jmp	L(rtr)			C				un=1
76
C un = 2, first row: rp[0..2] = up[0..1] * vp[0].
77L(un2):	movd	4(%eax), %mm1		C				un=2
78	pmuludq	%mm7, %mm1		C				un=2
79	movd	%mm6, (%edx)		C				un=2
80	psrlq	$32, %mm6		C				un=2
81	paddq	%mm1, %mm6		C				un=2
82	movd	%mm6, 4(%edx)		C				un=2
83	psrlq	$32, %mm6		C				un=2
84	movd	%mm6, 8(%edx)		C				un=2
C Return if that was the only row (vn = 1).
85      dec	%ebx			C				un=2
86      jz	L(rtr)			C				un=2
C un = 2, second row: add up[0..1] * vp[1] into rp[1..2] and store the final
C carry to rp[3].
87	movd	4(%esi), %mm7		C				un=2
88	movd	(%eax), %mm6		C				un=2
89	pmuludq	%mm7, %mm6		C				un=2
90	movd	4(%eax), %mm1		C				un=2
91	movd	4(%edx), %mm4		C				un=2
92	pmuludq	%mm7, %mm1		C				un=2
93	movd	8(%edx), %mm5		C				un=2
94	paddq	%mm4, %mm6		C				un=2
95	paddq	%mm1, %mm5		C				un=2
96	movd	%mm6, 4(%edx)		C				un=2
97	psrlq	$32, %mm6		C				un=2
98	paddq	%mm5, %mm6		C				un=2
99	movd	%mm6, 8(%edx)		C				un=2
100	psrlq	$32, %mm6		C				un=2
101	movd	%mm6, 12(%edx)		C				un=2
C Common return for the un <= 3 paths: emms, then restore ebx and esi in the
C reverse order of the pushes at entry.
102L(rtr):	emms
103	pop	%ebx
104	pop	%esi
105	ret
106
C un = 3, first row: rp[0..3] = up[0..2] * vp[0].
107L(un3):	movd	4(%eax), %mm1		C				un=3
108	pmuludq	%mm7, %mm1		C				un=3
109	movd	8(%eax), %mm2		C				un=3
110	pmuludq	%mm7, %mm2		C				un=3
111	movd	%mm6, (%edx)		C				un=3
112	psrlq	$32, %mm6		C				un=3
113	paddq	%mm1, %mm6		C				un=3
114	movd	%mm6, 4(%edx)		C				un=3
115	psrlq	$32, %mm6		C				un=3
116	paddq	%mm2, %mm6		C				un=3
117	movd	%mm6, 8(%edx)		C				un=3
118	psrlq	$32, %mm6		C				un=3
119	movd	%mm6, 12(%edx)		C				un=3
C Return if that was the only row (vn = 1).
120      dec	%ebx			C				un=3
121      jz	L(rtr)			C				un=3
C un = 3, second row: add up[0..2] * vp[1] into rp[1..3] and store the final
C carry to rp[4].
122	movd	4(%esi), %mm7		C				un=3
123	movd	(%eax), %mm6		C				un=3
124	pmuludq	%mm7, %mm6		C				un=3
125	movd	4(%eax), %mm1		C				un=3
126	movd	4(%edx), %mm4		C				un=3
127	pmuludq	%mm7, %mm1		C				un=3
128	movd	8(%eax), %mm2		C				un=3
129	movd	8(%edx), %mm5		C				un=3
130	pmuludq	%mm7, %mm2		C				un=3
131	paddq	%mm4, %mm6		C				un=3
132	paddq	%mm1, %mm5		C				un=3
133	movd	12(%edx), %mm4		C				un=3
134	movd	%mm6, 4(%edx)		C				un=3
135	psrlq	$32, %mm6		C				un=3
136	paddq	%mm5, %mm6		C				un=3
137	paddq	%mm2, %mm4		C				un=3
138	movd	%mm6, 8(%edx)		C				un=3
139	psrlq	$32, %mm6		C				un=3
140	paddq	%mm4, %mm6		C				un=3
141	movd	%mm6, 12(%edx)		C				un=3
142	psrlq	$32, %mm6		C				un=3
143	movd	%mm6, 16(%edx)		C				un=3
C Return if there is no third row (vn = 2).
144      dec	%ebx			C				un=3
145      jz	L(rtr)			C				un=3
C un = 3, third row: add up[0..2] * vp[2] into rp[2..4] and store the final
C carry to rp[5].
146	movd	8(%esi), %mm7		C				un=3
147	movd	(%eax), %mm6		C				un=3
148	pmuludq	%mm7, %mm6		C				un=3
149	movd	4(%eax), %mm1		C				un=3
150	movd	8(%edx), %mm4		C				un=3
151	pmuludq	%mm7, %mm1		C				un=3
152	movd	8(%eax), %mm2		C				un=3
153	movd	12(%edx), %mm5		C				un=3
154	pmuludq	%mm7, %mm2		C				un=3
155	paddq	%mm4, %mm6		C				un=3
156	paddq	%mm1, %mm5		C				un=3
157	movd	16(%edx), %mm4		C				un=3
158	movd	%mm6, 8(%edx)		C				un=3
159	psrlq	$32, %mm6		C				un=3
160	paddq	%mm5, %mm6		C				un=3
161	paddq	%mm2, %mm4		C				un=3
162	movd	%mm6, 12(%edx)		C				un=3
163	psrlq	$32, %mm6		C				un=3
164	paddq	%mm4, %mm6		C				un=3
165	movd	%mm6, 16(%edx)		C				un=3
166	psrlq	$32, %mm6		C				un=3
167	movd	%mm6, 20(%edx)		C				un=3
C No further ebx test: this path never processes more than three rows.
168	jmp	L(rtr)
169
170
C General case, reached when un > 3.  edi is pushed as a third saved
C register, so from here on the stack slots are 16(%esp) rp, 20(%esp) up,
C 24(%esp) un, 28(%esp) vp and 32(%esp) vn.
C mm6 (the running sum) is cleared, esi is stepped to vp + 4 (vp[0] is
C already in mm7), and control is dispatched on un mod 4 to one of the four
C code copies L(0), L(1), L(2), L(3).
170L(big):	push	%edi
171	pxor	%mm6, %mm6
172	lea	4(%esi), %esi
C ecx = un mod 4.  After cmp $2: carry set means ecx = 1, zero set means
C ecx = 2, anything else left is ecx = 3.
173	and	$3, %ecx
174	jz	L(0)
175	cmp	$2, %ecx
176	jc	L(1)
177	jz	L(2)
178	jmp	L(3)			C FIXME: one case should fall through
180
181
C Code copy for un mod 4 = 0 (ecx is 0 on entry).
C First row: rp[] = up[] * mm7.  This loop only stores to rp, it never reads
C it.  ecx becomes (un mod 4) - un and is written over the un stack slot at
C 24(%esp) so that each later row can reload the same loop count.
C The loop handles four limbs per pass, stepping eax and edx by 16 bytes and
C ecx by 4, and ends when ecx reaches zero.  It is entered in the middle, at
C L(m00).
181L(0):	movd	(%eax), %mm3		C				m 0
182	sub	24(%esp), %ecx		C inner loop count		m 0
183	mov	%ecx, 24(%esp)		C update loop count for later	m 0
184	pmuludq	%mm7, %mm3		C				m 0
185	movd	4(%eax), %mm0		C				m 0
186	pmuludq	%mm7, %mm0		C				m 0
187	movd	8(%eax), %mm1		C				m 0
188	jmp	L(m00)			C				m 0
189	ALIGN(16)			C				m 0
190L(lpm0):
191	pmuludq	%mm7, %mm4		C				m 0
192	paddq	%mm0, %mm6		C				m 0
193	movd	(%eax), %mm3		C				m 0
194	movd	%mm6, -12(%edx)		C				m 0
195	psrlq	$32, %mm6		C				m 0
196	pmuludq	%mm7, %mm3		C				m 0
197	paddq	%mm1, %mm6		C				m 0
198	movd	4(%eax), %mm0		C				m 0
199	movd	%mm6, -8(%edx)		C				m 0
200	psrlq	$32, %mm6		C				m 0
201	pmuludq	%mm7, %mm0		C				m 0
202	paddq	%mm4, %mm6		C				m 0
203	movd	8(%eax), %mm1		C				m 0
204	movd	%mm6, -4(%edx)		C				m 0
205	psrlq	$32, %mm6		C				m 0
206L(m00):	pmuludq	%mm7, %mm1		C				m 0
207	paddq	%mm3, %mm6		C				m 0
208	movd	12(%eax), %mm4		C				m 0
209	movd	%mm6, (%edx)		C				m 0
210	psrlq	$32, %mm6		C				m 0
211	lea	16(%eax), %eax		C				m 0
212	lea	16(%edx), %edx		C				m 0
213	add	$4, %ecx		C				m 0
214	ja	L(lpm0)			C				m 0
C First row wind-down.  edi is loaded with rp as the row base for the outer
C loop, then the last stores are shared with the addmul rows via L(x0).
215	pmuludq	%mm7, %mm4		C				m 0
216	paddq	%mm0, %mm6		C				m 0
217	movd	%mm6, -12(%edx)		C				m 0
218	psrlq	$32, %mm6		C				m 0
219	paddq	%mm1, %mm6		C				m 0
220	mov	16(%esp), %edi		C rp				  0
221	jmp	L(x0)
222
C Outer loop, one pass per remaining v limb.  Each pass steps the row base
C edi and the v pointer esi by one limb, reloads up and the loop count from
C the stack, clears the running sum mm6, and then adds up[] * mm7 into the
C limbs already stored at edx.
223L(olp0):
224	lea	4(%edi), %edi		C				am 0
225	movd	(%esi), %mm7		C				am 0
226	lea	4(%esi), %esi		C				am 0
227	mov	%edi, %edx		C rp				am 0
228	mov	20(%esp), %eax		C up				am 0
229	movd	(%eax), %mm3		C				am 0
230	mov	24(%esp), %ecx		C inner loop count		am 0
231	pxor	%mm6, %mm6		C				am 0
232	pmuludq	%mm7, %mm3		C				am 0
233	movd	4(%eax), %mm0		C				am 0
234	movd	(%edx), %mm5		C				am 0
235	pmuludq	%mm7, %mm0		C				am 0
236	movd	8(%eax), %mm1		C				am 0
237	paddq	%mm3, %mm5		C				am 0
238	movd	4(%edx), %mm4		C				am 0
239	jmp	L(am00)			C				am 0
240	ALIGN(16)			C				am 0
C Addmul inner loop, four limbs per pass, entered in the middle at L(am00).
C mm4 and mm5 alternate as old rp limb plus product; mm6 carries the sum.
241L(lam0):
242	pmuludq	%mm7, %mm2		C				am 0
243	paddq	%mm4, %mm6		C				am 0
244	movd	(%eax), %mm3		C				am 0
245	paddq	%mm1, %mm5		C				am 0
246	movd	-4(%edx), %mm4		C				am 0
247	movd	%mm6, -12(%edx)		C				am 0
248	psrlq	$32, %mm6		C				am 0
249	pmuludq	%mm7, %mm3		C				am 0
250	paddq	%mm5, %mm6		C				am 0
251	movd	4(%eax), %mm0		C				am 0
252	paddq	%mm2, %mm4		C				am 0
253	movd	(%edx), %mm5		C				am 0
254	movd	%mm6, -8(%edx)		C				am 0
255	psrlq	$32, %mm6		C				am 0
256	pmuludq	%mm7, %mm0		C				am 0
257	paddq	%mm4, %mm6		C				am 0
258	movd	8(%eax), %mm1		C				am 0
259	paddq	%mm3, %mm5		C				am 0
260	movd	4(%edx), %mm4		C				am 0
261	movd	%mm6, -4(%edx)		C				am 0
262	psrlq	$32, %mm6		C				am 0
263L(am00):
264	pmuludq	%mm7, %mm1		C				am 0
265	paddq	%mm5, %mm6		C				am 0
266	movd	12(%eax), %mm2		C				am 0
267	paddq	%mm0, %mm4		C				am 0
268	movd	8(%edx), %mm5		C				am 0
269	movd	%mm6, (%edx)		C				am 0
270	psrlq	$32, %mm6		C				am 0
271	lea	16(%eax), %eax		C				am 0
272	lea	16(%edx), %edx		C				am 0
273	add	$4, %ecx		C				am 0
274	jnz	L(lam0)			C				am 0
275	pmuludq	%mm7, %mm2		C				am 0
276	paddq	%mm4, %mm6		C				am 0
277	paddq	%mm1, %mm5		C				am 0
278	movd	-4(%edx), %mm4		C				am 0
279	movd	%mm6, -12(%edx)		C				am 0
280	psrlq	$32, %mm6		C				am 0
281	paddq	%mm5, %mm6		C				am 0
282	paddq	%mm2, %mm4		C				am 0
C Row tail, reached from the first row by jmp and from the addmul loop by
C fall-through: store the last two sum limbs and the final carry limb, then
C count the finished row off in ebx and loop while rows remain.
283L(x0):	movd	%mm6, -8(%edx)		C				am 0
284	psrlq	$32, %mm6		C				am 0
285	paddq	%mm4, %mm6		C				am 0
286	movd	%mm6, -4(%edx)		C				am 0
287	psrlq	$32, %mm6		C				am 0
288	movd	%mm6, (%edx)		C				am 0
289	dec	%ebx			C				am 0
290	jnz	L(olp0)			C				am 0
C All rows done: emms, then pop the three saved registers in the reverse
C order of their pushes and return.
291L(oel0):
292	emms				C				   0
293	pop	%edi			C				   0
294	pop	%ebx			C				   0
295	pop	%esi			C				   0
296	ret				C				   0
298
299
C Code copy for un mod 4 = 1 (ecx is 1 on entry).
C First row: rp[] = up[] * mm7.  This loop only stores to rp, it never reads
C it.  ecx becomes (un mod 4) - un and is written over the un stack slot at
C 24(%esp) so that each later row can reload the same loop count.
C The loop handles four limbs per pass, stepping eax and edx by 16 bytes and
C ecx by 4, and ends when ecx reaches zero.  It is entered in the middle, at
C L(m01).
299L(1):	movd	(%eax), %mm4		C				m 1
300	sub	24(%esp), %ecx		C				m 1
301	mov	%ecx, 24(%esp)		C update loop count for later	m 1
302	pmuludq	%mm7, %mm4		C				m 1
303	movd	4(%eax), %mm3		C				m 1
304	pmuludq	%mm7, %mm3		C				m 1
305	movd	8(%eax), %mm0		C				m 1
306	jmp	L(m01)			C				m 1
307	ALIGN(16)			C				m 1
308L(lpm1):
309	pmuludq	%mm7, %mm4		C				m 1
310	paddq	%mm0, %mm6		C				m 1
311	movd	4(%eax), %mm3		C				m 1
312	movd	%mm6, -8(%edx)		C				m 1
313	psrlq	$32, %mm6		C				m 1
314	pmuludq	%mm7, %mm3		C				m 1
315	paddq	%mm1, %mm6		C				m 1
316	movd	8(%eax), %mm0		C				m 1
317	movd	%mm6, -4(%edx)		C				m 1
318	psrlq	$32, %mm6		C				m 1
319L(m01):	pmuludq	%mm7, %mm0		C				m 1
320	paddq	%mm4, %mm6		C				m 1
321	movd	12(%eax), %mm1		C				m 1
322	movd	%mm6, (%edx)		C				m 1
323	psrlq	$32, %mm6		C				m 1
324	pmuludq	%mm7, %mm1		C				m 1
325	paddq	%mm3, %mm6		C				m 1
326	movd	16(%eax), %mm4		C				m 1
327	movd	%mm6, 4(%edx)		C				m 1
328	psrlq	$32, %mm6		C				m 1
329	lea	16(%eax), %eax		C				m 1
330	lea	16(%edx), %edx		C				m 1
331	add	$4, %ecx		C				m 1
332	ja	L(lpm1)			C				m 1
C First row wind-down.  edi is loaded with rp as the row base for the outer
C loop, then the last stores are shared with the addmul rows via L(x1).
333	pmuludq	%mm7, %mm4		C				m 1
334	paddq	%mm0, %mm6		C				m 1
335	movd	%mm6, -8(%edx)		C				m 1
336	psrlq	$32, %mm6		C				m 1
337	paddq	%mm1, %mm6		C				m 1
338	mov	16(%esp), %edi		C rp				  1
339	jmp	L(x1)
340
C Outer loop, one pass per remaining v limb.  Each pass steps the row base
C edi and the v pointer esi by one limb, reloads up and the loop count from
C the stack, clears the running sum mm6, and then adds up[] * mm7 into the
C limbs already stored at edx.
341L(olp1):
342	lea	4(%edi), %edi		C				am 1
343	movd	(%esi), %mm7		C				am 1
344	lea	4(%esi), %esi		C				am 1
345	mov	%edi, %edx		C rp				am 1
346	mov	20(%esp), %eax		C up				am 1
347	movd	(%eax), %mm2		C				am 1
348	mov	24(%esp), %ecx		C inner loop count		am 1
349	pxor	%mm6, %mm6		C				am 1
350	pmuludq	%mm7, %mm2		C				am 1
351	movd	4(%eax), %mm3		C				am 1
352	movd	(%edx), %mm4		C				am 1
353	pmuludq	%mm7, %mm3		C				am 1
354	movd	8(%eax), %mm0		C				am 1
355	paddq	%mm2, %mm4		C				am 1
356	movd	4(%edx), %mm5		C				am 1
357	jmp	L(am01)			C				am 1
358	ALIGN(16)			C				am 1
C Addmul inner loop, four limbs per pass, entered in the middle at L(am01).
C mm4 and mm5 alternate as old rp limb plus product; mm6 carries the sum.
359L(lam1):
360	pmuludq	%mm7, %mm2		C				am 1
361	paddq	%mm4, %mm6		C				am 1
362	movd	4(%eax), %mm3		C				am 1
363	paddq	%mm1, %mm5		C				am 1
364	movd	(%edx), %mm4		C				am 1
365	movd	%mm6, -8(%edx)		C				am 1
366	psrlq	$32, %mm6		C				am 1
367	pmuludq	%mm7, %mm3		C				am 1
368	paddq	%mm5, %mm6		C				am 1
369	movd	8(%eax), %mm0		C				am 1
370	paddq	%mm2, %mm4		C				am 1
371	movd	4(%edx), %mm5		C				am 1
372	movd	%mm6, -4(%edx)		C				am 1
373	psrlq	$32, %mm6		C				am 1
374L(am01):
375	pmuludq	%mm7, %mm0		C				am 1
376	paddq	%mm4, %mm6		C				am 1
377	movd	12(%eax), %mm1		C				am 1
378	paddq	%mm3, %mm5		C				am 1
379	movd	8(%edx), %mm4		C				am 1
380	movd	%mm6, (%edx)		C				am 1
381	psrlq	$32, %mm6		C				am 1
382	pmuludq	%mm7, %mm1		C				am 1
383	paddq	%mm5, %mm6		C				am 1
384	movd	16(%eax), %mm2		C				am 1
385	paddq	%mm0, %mm4		C				am 1
386	movd	12(%edx), %mm5		C				am 1
387	movd	%mm6, 4(%edx)		C				am 1
388	psrlq	$32, %mm6		C				am 1
389	lea	16(%eax), %eax		C				am 1
390	lea	16(%edx), %edx		C				am 1
391	add	$4, %ecx		C				am 1
392	jnz	L(lam1)			C				am 1
393	pmuludq	%mm7, %mm2		C				am 1
394	paddq	%mm4, %mm6		C				am 1
395	paddq	%mm1, %mm5		C				am 1
396	movd	(%edx), %mm4		C				am 1
397	movd	%mm6, -8(%edx)		C				am 1
398	psrlq	$32, %mm6		C				am 1
399	paddq	%mm5, %mm6		C				am 1
400	paddq	%mm2, %mm4		C				am 1
C Row tail, reached from the first row by jmp and from the addmul loop by
C fall-through: store the last two sum limbs and the final carry limb, then
C count the finished row off in ebx and loop while rows remain.
401L(x1):	movd	%mm6, -4(%edx)		C				am 1
402	psrlq	$32, %mm6		C				am 1
403	paddq	%mm4, %mm6		C				am 1
404	movd	%mm6, (%edx)		C				am 1
405	psrlq	$32, %mm6		C				am 1
406	movd	%mm6, 4(%edx)		C				am 1
407	dec	%ebx			C				am 1
408	jnz	L(olp1)			C				am 1
C All rows done: emms, then pop the three saved registers in the reverse
C order of their pushes and return.
409L(oel1):
410	emms				C				   1
411	pop	%edi			C				   1
412	pop	%ebx			C				   1
413	pop	%esi			C				   1
414	ret				C				   1
416
417
C Code copy for un mod 4 = 2 (ecx is 2 on entry).
C First row: rp[] = up[] * mm7.  This loop only stores to rp, it never reads
C it.  ecx becomes (un mod 4) - un and is written over the un stack slot at
C 24(%esp) so that each later row can reload the same loop count.
C The loop handles four limbs per pass, stepping eax and edx by 16 bytes and
C ecx by 4, and ends when ecx reaches zero.  It is entered in the middle, at
C L(m10).
417L(2):	movd	(%eax), %mm1		C				m 2
418	sub	24(%esp), %ecx		C				m 2
419	mov	%ecx, 24(%esp)		C update loop count for later	m 2
420	pmuludq	%mm7, %mm1		C				m 2
421	movd	4(%eax), %mm4		C				m 2
422	pmuludq	%mm7, %mm4		C				m 2
423	movd	8(%eax), %mm3		C				m 2
424	jmp	L(m10)			C				m 2
425	ALIGN(16)			C				m 2
426L(lpm2):
427	pmuludq	%mm7, %mm4		C				m 2
428	paddq	%mm0, %mm6		C				m 2
429	movd	8(%eax), %mm3		C				m 2
430	movd	%mm6, -4(%edx)		C				m 2
431	psrlq	$32, %mm6		C				m 2
432L(m10):	pmuludq	%mm7, %mm3		C				m 2
433	paddq	%mm1, %mm6		C				m 2
434	movd	12(%eax), %mm0		C				m 2
435	movd	%mm6, (%edx)		C				m 2
436	psrlq	$32, %mm6		C				m 2
437	pmuludq	%mm7, %mm0		C				m 2
438	paddq	%mm4, %mm6		C				m 2
439	movd	16(%eax), %mm1		C				m 2
440	movd	%mm6, 4(%edx)		C				m 2
441	psrlq	$32, %mm6		C				m 2
442	pmuludq	%mm7, %mm1		C				m 2
443	paddq	%mm3, %mm6		C				m 2
444	movd	20(%eax), %mm4		C				m 2
445	movd	%mm6, 8(%edx)		C				m 2
446	psrlq	$32, %mm6		C				m 2
447	lea	16(%eax), %eax		C				m 2
448	lea	16(%edx), %edx		C				m 2
449	add	$4, %ecx		C				m 2
450	ja	L(lpm2)			C				m 2
C First row wind-down.  edi is loaded with rp as the row base for the outer
C loop, then the last stores are shared with the addmul rows via L(x2).
451	pmuludq	%mm7, %mm4		C				m 2
452	paddq	%mm0, %mm6		C				m 2
453	movd	%mm6, -4(%edx)		C				m 2
454	psrlq	$32, %mm6		C				m 2
455	paddq	%mm1, %mm6		C				m 2
456	mov	16(%esp), %edi		C rp				  2
457	jmp	L(x2)
458
C Outer loop, one pass per remaining v limb.  Each pass steps the row base
C edi and the v pointer esi by one limb, reloads up and the loop count from
C the stack, clears the running sum mm6, and then adds up[] * mm7 into the
C limbs already stored at edx.
459L(olp2):
460	lea	4(%edi), %edi		C				am 2
461	movd	(%esi), %mm7		C				am 2
462	lea	4(%esi), %esi		C				am 2
463	mov	%edi, %edx		C rp				am 2
464	mov	20(%esp), %eax		C up				am 2
465	movd	(%eax), %mm1		C				am 2
466	mov	24(%esp), %ecx		C inner loop count		am 2
467	pxor	%mm6, %mm6		C				am 2
468	pmuludq	%mm7, %mm1		C				am 2
469	movd	4(%eax), %mm2		C				am 2
470	movd	(%edx), %mm5		C				am 2
471	pmuludq	%mm7, %mm2		C				am 2
472	movd	8(%eax), %mm3		C				am 2
473	paddq	%mm1, %mm5		C				am 2
474	movd	4(%edx), %mm4		C				am 2
475	jmp	L(am10)			C				am 2
476	ALIGN(16)			C				am 2
C Addmul inner loop, four limbs per pass, entered in the middle at L(am10).
C mm4 and mm5 alternate as old rp limb plus product; mm6 carries the sum.
477L(lam2):
478	pmuludq	%mm7, %mm2		C				am 2
479	paddq	%mm4, %mm6		C				am 2
480	movd	8(%eax), %mm3		C				am 2
481	paddq	%mm1, %mm5		C				am 2
482	movd	4(%edx), %mm4		C				am 2
483	movd	%mm6, -4(%edx)		C				am 2
484	psrlq	$32, %mm6		C				am 2
485L(am10):
486	pmuludq	%mm7, %mm3		C				am 2
487	paddq	%mm5, %mm6		C				am 2
488	movd	12(%eax), %mm0		C				am 2
489	paddq	%mm2, %mm4		C				am 2
490	movd	8(%edx), %mm5		C				am 2
491	movd	%mm6, (%edx)		C				am 2
492	psrlq	$32, %mm6		C				am 2
493	pmuludq	%mm7, %mm0		C				am 2
494	paddq	%mm4, %mm6		C				am 2
495	movd	16(%eax), %mm1		C				am 2
496	paddq	%mm3, %mm5		C				am 2
497	movd	12(%edx), %mm4		C				am 2
498	movd	%mm6, 4(%edx)		C				am 2
499	psrlq	$32, %mm6		C				am 2
500	pmuludq	%mm7, %mm1		C				am 2
501	paddq	%mm5, %mm6		C				am 2
502	movd	20(%eax), %mm2		C				am 2
503	paddq	%mm0, %mm4		C				am 2
504	movd	16(%edx), %mm5		C				am 2
505	movd	%mm6, 8(%edx)		C				am 2
506	psrlq	$32, %mm6		C				am 2
507	lea	16(%eax), %eax		C				am 2
508	lea	16(%edx), %edx		C				am 2
509	add	$4, %ecx		C				am 2
510	jnz	L(lam2)			C				am 2
511	pmuludq	%mm7, %mm2		C				am 2
512	paddq	%mm4, %mm6		C				am 2
513	paddq	%mm1, %mm5		C				am 2
514	movd	4(%edx), %mm4		C				am 2
515	movd	%mm6, -4(%edx)		C				am 2
516	psrlq	$32, %mm6		C				am 2
517	paddq	%mm5, %mm6		C				am 2
518	paddq	%mm2, %mm4		C				am 2
C Row tail, reached from the first row by jmp and from the addmul loop by
C fall-through: store the last two sum limbs and the final carry limb, then
C count the finished row off in ebx and loop while rows remain.
519L(x2):	movd	%mm6, (%edx)		C				am 2
520	psrlq	$32, %mm6		C				am 2
521	paddq	%mm4, %mm6		C				am 2
522	movd	%mm6, 4(%edx)		C				am 2
523	psrlq	$32, %mm6		C				am 2
524	movd	%mm6, 8(%edx)		C				am 2
525	dec	%ebx			C				am 2
526	jnz	L(olp2)			C				am 2
C All rows done: emms, then pop the three saved registers in the reverse
C order of their pushes and return.
527L(oel2):
528	emms				C				   2
529	pop	%edi			C				   2
530	pop	%ebx			C				   2
531	pop	%esi			C				   2
532	ret				C				   2
534
535
C Code copy for un mod 4 = 3 (ecx is 3 on entry).
C First row: rp[] = up[] * mm7.  This loop only stores to rp, it never reads
C it.  ecx becomes (un mod 4) - un and is written over the un stack slot at
C 24(%esp) so that each later row can reload the same loop count.
C The loop handles four limbs per pass, stepping eax and edx by 16 bytes and
C ecx by 4, and ends when ecx reaches zero.  Unlike the other three copies
C this loop is entered at its top label; the jmp only steps over whatever
C ALIGN places between it and L(lpm3).
535L(3):	movd	(%eax), %mm0		C				m 3
536	sub	24(%esp), %ecx		C				m 3
537	mov	%ecx, 24(%esp)		C update loop count for later	m 3
538	pmuludq	%mm7, %mm0		C				m 3
539	movd	4(%eax), %mm1		C				m 3
540	pmuludq	%mm7, %mm1		C				m 3
541	movd	8(%eax), %mm4		C				m 3
542	jmp	L(lpm3)			C				m 3
543	ALIGN(16)			C				m 3
544L(lpm3):
545	pmuludq	%mm7, %mm4		C				m 3
546	paddq	%mm0, %mm6		C				m 3
547	movd	12(%eax), %mm3		C				m 3
548	movd	%mm6, (%edx)		C				m 3
549	psrlq	$32, %mm6		C				m 3
550	pmuludq	%mm7, %mm3		C				m 3
551	paddq	%mm1, %mm6		C				m 3
552	movd	16(%eax), %mm0		C				m 3
553	movd	%mm6, 4(%edx)		C				m 3
554	psrlq	$32, %mm6		C				m 3
555	pmuludq	%mm7, %mm0		C				m 3
556	paddq	%mm4, %mm6		C				m 3
557	movd	20(%eax), %mm1		C				m 3
558	movd	%mm6, 8(%edx)		C				m 3
559	psrlq	$32, %mm6		C				m 3
560	pmuludq	%mm7, %mm1		C				m 3
561	paddq	%mm3, %mm6		C				m 3
562	movd	24(%eax), %mm4		C				m 3
563	movd	%mm6, 12(%edx)		C				m 3
564	psrlq	$32, %mm6		C				m 3
565	lea	16(%eax), %eax		C				m 3
566	lea	16(%edx), %edx		C				m 3
567	add	$4, %ecx		C				m 3
568	ja	L(lpm3)			C				m 3
C First row wind-down.  edi is loaded with rp as the row base for the outer
C loop, then the last stores are shared with the addmul rows via L(x3).
569	pmuludq	%mm7, %mm4		C				m 3
570	paddq	%mm0, %mm6		C				m 3
571	movd	%mm6, (%edx)		C				m 3
572	psrlq	$32, %mm6		C				m 3
573	paddq	%mm1, %mm6		C				m 3
574	mov	16(%esp), %edi		C rp				  3
575	jmp	L(x3)
576
C Outer loop, one pass per remaining v limb.  Each pass steps the row base
C edi and the v pointer esi by one limb, reloads up and the loop count from
C the stack, clears the running sum mm6, and then adds up[] * mm7 into the
C limbs already stored at edx.
577L(olp3):
578	lea	4(%edi), %edi		C				am 3
579	movd	(%esi), %mm7		C				am 3
580	lea	4(%esi), %esi		C				am 3
581	mov	%edi, %edx		C rp				am 3
582	mov	20(%esp), %eax		C up				am 3
583	movd	(%eax), %mm0		C				am 3
584	mov	24(%esp), %ecx		C inner loop count		am 3
585	pxor	%mm6, %mm6		C				am 3
586	pmuludq	%mm7, %mm0		C				am 3
587	movd	4(%eax), %mm1		C				am 3
588	movd	(%edx), %mm4		C				am 3
589	pmuludq	%mm7, %mm1		C				am 3
590	movd	8(%eax), %mm2		C				am 3
591	paddq	%mm0, %mm4		C				am 3
592	movd	4(%edx), %mm5		C				am 3
593	jmp	L(lam3)			C				am 3
594	ALIGN(16)			C				am 3
C Addmul inner loop, four limbs per pass, entered at its top label.
C mm4 and mm5 alternate as old rp limb plus product; mm6 carries the sum.
595L(lam3):
596	pmuludq	%mm7, %mm2		C				am 3
597	paddq	%mm4, %mm6		C				am 3
598	movd	12(%eax), %mm3		C				am 3
599	paddq	%mm1, %mm5		C				am 3
600	movd	8(%edx), %mm4		C				am 3
601	movd	%mm6, (%edx)		C				am 3
602	psrlq	$32, %mm6		C				am 3
603	pmuludq	%mm7, %mm3		C				am 3
604	paddq	%mm5, %mm6		C				am 3
605	movd	16(%eax), %mm0		C				am 3
606	paddq	%mm2, %mm4		C				am 3
607	movd	12(%edx), %mm5		C				am 3
608	movd	%mm6, 4(%edx)		C				am 3
609	psrlq	$32, %mm6		C				am 3
610	pmuludq	%mm7, %mm0		C				am 3
611	paddq	%mm4, %mm6		C				am 3
612	movd	20(%eax), %mm1		C				am 3
613	paddq	%mm3, %mm5		C				am 3
614	movd	16(%edx), %mm4		C				am 3
615	movd	%mm6, 8(%edx)		C				am 3
616	psrlq	$32, %mm6		C				am 3
617	pmuludq	%mm7, %mm1		C				am 3
618	paddq	%mm5, %mm6		C				am 3
619	movd	24(%eax), %mm2		C				am 3
620	paddq	%mm0, %mm4		C				am 3
621	movd	20(%edx), %mm5		C				am 3
622	movd	%mm6, 12(%edx)		C				am 3
623	psrlq	$32, %mm6		C				am 3
624	lea	16(%eax), %eax		C				am 3
625	lea	16(%edx), %edx		C				am 3
626	add	$4, %ecx		C				am 3
627	jnz	L(lam3)			C				am 3
628	pmuludq	%mm7, %mm2		C				am 3
629	paddq	%mm4, %mm6		C				am 3
630	paddq	%mm1, %mm5		C				am 3
631	movd	8(%edx), %mm4		C				am 3
632	movd	%mm6, (%edx)		C				am 3
633	psrlq	$32, %mm6		C				am 3
634	paddq	%mm5, %mm6		C				am 3
635	paddq	%mm2, %mm4		C				am 3
C Row tail, reached from the first row by jmp and from the addmul loop by
C fall-through: store the last two sum limbs and the final carry limb, then
C count the finished row off in ebx and loop while rows remain.
636L(x3):	movd	%mm6, 4(%edx)		C				am 3
637	psrlq	$32, %mm6		C				am 3
638	paddq	%mm4, %mm6		C				am 3
639	movd	%mm6, 8(%edx)		C				am 3
640	psrlq	$32, %mm6		C				am 3
641	movd	%mm6, 12(%edx)		C				am 3
642	dec	%ebx			C				am 3
643	jnz	L(olp3)			C				am 3
C All rows done: emms, then pop the three saved registers in the reverse
C order of their pushes and return.
644L(oel3):
645	emms				C				   3
646	pop	%edi			C				   3
647	pop	%ebx			C				   3
648	pop	%esi			C				   3
649	ret				C				   3
651EPILOGUE()
652