xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/atom/sse2/mul_basecase.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
dnl  a third limb vector.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

34include(`../config.m4')
35
36C TODO
37C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
38C    4 large loops into one; we could use it for the outer loop branch.
39C  * Optimise code outside of inner loops.
40C  * Write combined addmul_1 feed-in a wind-down code, and use when iterating
41C    outer each loop.  ("Overlapping software pipelining")
42C  * Postpone push of ebx until we know vn > 1.  Perhaps use caller-saves regs
43C    for inlined mul_1, allowing us to postpone all pushes.
44C  * Perhaps write special code for vn <= un < M, for some small M.
45
46C void mpn_mul_basecase (mp_ptr wp,
47C                        mp_srcptr xp, mp_size_t xn,
48C                        mp_srcptr yp, mp_size_t yn);
49C
50
51define(`rp',  `%edi')
52define(`up',  `%esi')
53define(`un',  `%ecx')
54define(`vp',  `%ebp')
55define(`vn',  `36(%esp)')
56
57	TEXT
58	ALIGN(16)
59PROLOGUE(mpn_mul_basecase)
60	push	%edi
61	push	%esi
62	push	%ebx
63	push	%ebp
64	mov	20(%esp), rp
65	mov	24(%esp), up
66	mov	28(%esp), un
67	mov	32(%esp), vp
68
69	movd	(up), %mm0
70	movd	(vp), %mm7
71	pmuludq	%mm7, %mm0
72	pxor	%mm6, %mm6
73
74	mov	un, %eax
75	and	$3, %eax
76	jz	L(of0)
77	cmp	$2, %eax
78	jc	L(of1)
79	jz	L(of2)
80
81C ================================================================
82	jmp	L(m3)
83	ALIGN(16)
84L(lm3):	movd	-4(up), %mm0
85	pmuludq	%mm7, %mm0
86	psrlq	$32, %mm6
87	lea	16(rp), rp
88	paddq	%mm0, %mm6
89	movd	(up), %mm0
90	pmuludq	%mm7, %mm0
91	movd	%mm6, -4(rp)
92	psrlq	$32, %mm6
93L(m3):	paddq	%mm0, %mm6
94	movd	4(up), %mm0
95	pmuludq	%mm7, %mm0
96	movd	%mm6, (rp)
97	psrlq	$32, %mm6
98	paddq	%mm0, %mm6
99	movd	8(up), %mm0
100	pmuludq	%mm7, %mm0
101	movd	%mm6, 4(rp)
102	psrlq	$32, %mm6
103	paddq	%mm0, %mm6
104	sub	$4, un
105	movd	%mm6, 8(rp)
106	lea	16(up), up
107	ja	L(lm3)
108
109	psrlq	$32, %mm6
110	movd	%mm6, 12(rp)
111
112	decl	vn
113	jz	L(done)
114	lea	-8(rp), rp
115
116L(ol3):	mov	28(%esp), un
117	neg	un
118	lea	4(vp), vp
119	movd	(vp), %mm7	C read next V limb
120	mov	24(%esp), up
121	lea	16(rp,un,4), rp
122
123	movd	(up), %mm0
124	pmuludq	%mm7, %mm0
125	sar	$2, un
126	movd	4(up), %mm1
127	movd	%mm0, %ebx
128	pmuludq	%mm7, %mm1
129	lea	-8(up), up
130	xor	%edx, %edx	C zero edx and CF
131	jmp	L(a3)
132
133L(la3):	movd	4(up), %mm1
134	adc	$0, %edx
135	add	%eax, 12(rp)
136	movd	%mm0, %ebx
137	pmuludq	%mm7, %mm1
138	lea	16(rp), rp
139	psrlq	$32, %mm0
140	adc	%edx, %ebx
141	movd	%mm0, %edx
142	movd	%mm1, %eax
143	movd	8(up), %mm0
144	pmuludq	%mm7, %mm0
145	adc	$0, %edx
146	add	%ebx, (rp)
147	psrlq	$32, %mm1
148	adc	%edx, %eax
149	movd	%mm1, %edx
150	movd	%mm0, %ebx
151	movd	12(up), %mm1
152	pmuludq	%mm7, %mm1
153	adc	$0, %edx
154	add	%eax, 4(rp)
155L(a3):	psrlq	$32, %mm0
156	adc	%edx, %ebx
157	movd	%mm0, %edx
158	movd	%mm1, %eax
159	lea	16(up), up
160	movd	(up), %mm0
161	adc	$0, %edx
162	add	%ebx, 8(rp)
163	psrlq	$32, %mm1
164	adc	%edx, %eax
165	movd	%mm1, %edx
166	pmuludq	%mm7, %mm0
167	inc	un
168	jnz	L(la3)
169
170	adc	un, %edx	C un is zero here
171	add	%eax, 12(rp)
172	movd	%mm0, %ebx
173	psrlq	$32, %mm0
174	adc	%edx, %ebx
175	movd	%mm0, %eax
176	adc	un, %eax
177	add	%ebx, 16(rp)
178	adc	un, %eax
179	mov	%eax, 20(rp)
180
181	decl	vn
182	jnz	L(ol3)
183	jmp	L(done)
184
185C ================================================================
186	ALIGN(16)
187L(lm0):	movd	(up), %mm0
188	pmuludq	%mm7, %mm0
189	psrlq	$32, %mm6
190	lea	16(rp), rp
191L(of0):	paddq	%mm0, %mm6
192	movd	4(up), %mm0
193	pmuludq	%mm7, %mm0
194	movd	%mm6, (rp)
195	psrlq	$32, %mm6
196	paddq	%mm0, %mm6
197	movd	8(up), %mm0
198	pmuludq	%mm7, %mm0
199	movd	%mm6, 4(rp)
200	psrlq	$32, %mm6
201	paddq	%mm0, %mm6
202	movd	12(up), %mm0
203	pmuludq	%mm7, %mm0
204	movd	%mm6, 8(rp)
205	psrlq	$32, %mm6
206	paddq	%mm0, %mm6
207	sub	$4, un
208	movd	%mm6, 12(rp)
209	lea	16(up), up
210	ja	L(lm0)
211
212	psrlq	$32, %mm6
213	movd	%mm6, 16(rp)
214
215	decl	vn
216	jz	L(done)
217	lea	-4(rp), rp
218
219L(ol0):	mov	28(%esp), un
220	neg	un
221	lea	4(vp), vp
222	movd	(vp), %mm7	C read next V limb
223	mov	24(%esp), up
224	lea	20(rp,un,4), rp
225
226	movd	(up), %mm1
227	pmuludq	%mm7, %mm1
228	sar	$2, un
229	movd	4(up), %mm0
230	lea	-4(up), up
231	movd	%mm1, %eax
232	pmuludq	%mm7, %mm0
233	xor	%edx, %edx	C zero edx and CF
234	jmp	L(a0)
235
236L(la0):	movd	4(up), %mm1
237	adc	$0, %edx
238	add	%eax, 12(rp)
239	movd	%mm0, %ebx
240	pmuludq	%mm7, %mm1
241	lea	16(rp), rp
242	psrlq	$32, %mm0
243	adc	%edx, %ebx
244	movd	%mm0, %edx
245	movd	%mm1, %eax
246	movd	8(up), %mm0
247	pmuludq	%mm7, %mm0
248	adc	$0, %edx
249	add	%ebx, (rp)
250L(a0):	psrlq	$32, %mm1
251	adc	%edx, %eax
252	movd	%mm1, %edx
253	movd	%mm0, %ebx
254	movd	12(up), %mm1
255	pmuludq	%mm7, %mm1
256	adc	$0, %edx
257	add	%eax, 4(rp)
258	psrlq	$32, %mm0
259	adc	%edx, %ebx
260	movd	%mm0, %edx
261	movd	%mm1, %eax
262	lea	16(up), up
263	movd	(up), %mm0
264	adc	$0, %edx
265	add	%ebx, 8(rp)
266	psrlq	$32, %mm1
267	adc	%edx, %eax
268	movd	%mm1, %edx
269	pmuludq	%mm7, %mm0
270	inc	un
271	jnz	L(la0)
272
273	adc	un, %edx	C un is zero here
274	add	%eax, 12(rp)
275	movd	%mm0, %ebx
276	psrlq	$32, %mm0
277	adc	%edx, %ebx
278	movd	%mm0, %eax
279	adc	un, %eax
280	add	%ebx, 16(rp)
281	adc	un, %eax
282	mov	%eax, 20(rp)
283
284	decl	vn
285	jnz	L(ol0)
286	jmp	L(done)
287
288C ================================================================
289	ALIGN(16)
290L(lm1):	movd	-12(up), %mm0
291	pmuludq	%mm7, %mm0
292	psrlq	$32, %mm6
293	lea	16(rp), rp
294	paddq	%mm0, %mm6
295	movd	-8(up), %mm0
296	pmuludq	%mm7, %mm0
297	movd	%mm6, -12(rp)
298	psrlq	$32, %mm6
299	paddq	%mm0, %mm6
300	movd	-4(up), %mm0
301	pmuludq	%mm7, %mm0
302	movd	%mm6, -8(rp)
303	psrlq	$32, %mm6
304	paddq	%mm0, %mm6
305	movd	(up), %mm0
306	pmuludq	%mm7, %mm0
307	movd	%mm6, -4(rp)
308	psrlq	$32, %mm6
309L(of1):	paddq	%mm0, %mm6
310	sub	$4, un
311	movd	%mm6, (rp)
312	lea	16(up), up
313	ja	L(lm1)
314
315	psrlq	$32, %mm6
316	movd	%mm6, 4(rp)
317
318	decl	vn
319	jz	L(done)
320	lea	-16(rp), rp
321
322L(ol1):	mov	28(%esp), un
323	neg	un
324	lea	4(vp), vp
325	movd	(vp), %mm7	C read next V limb
326	mov	24(%esp), up
327	lea	24(rp,un,4), rp
328
329	movd	(up), %mm0
330	pmuludq	%mm7, %mm0
331	sar	$2, un
332	movd	%mm0, %ebx
333	movd	4(up), %mm1
334	pmuludq	%mm7, %mm1
335	xor	%edx, %edx	C zero edx and CF
336	inc	un
337	jmp	L(a1)
338
339L(la1):	movd	4(up), %mm1
340	adc	$0, %edx
341	add	%eax, 12(rp)
342	movd	%mm0, %ebx
343	pmuludq	%mm7, %mm1
344	lea	16(rp), rp
345L(a1):	psrlq	$32, %mm0
346	adc	%edx, %ebx
347	movd	%mm0, %edx
348	movd	%mm1, %eax
349	movd	8(up), %mm0
350	pmuludq	%mm7, %mm0
351	adc	$0, %edx
352	add	%ebx, (rp)
353	psrlq	$32, %mm1
354	adc	%edx, %eax
355	movd	%mm1, %edx
356	movd	%mm0, %ebx
357	movd	12(up), %mm1
358	pmuludq	%mm7, %mm1
359	adc	$0, %edx
360	add	%eax, 4(rp)
361	psrlq	$32, %mm0
362	adc	%edx, %ebx
363	movd	%mm0, %edx
364	movd	%mm1, %eax
365	lea	16(up), up
366	movd	(up), %mm0
367	adc	$0, %edx
368	add	%ebx, 8(rp)
369	psrlq	$32, %mm1
370	adc	%edx, %eax
371	movd	%mm1, %edx
372	pmuludq	%mm7, %mm0
373	inc	un
374	jnz	L(la1)
375
376	adc	un, %edx	C un is zero here
377	add	%eax, 12(rp)
378	movd	%mm0, %ebx
379	psrlq	$32, %mm0
380	adc	%edx, %ebx
381	movd	%mm0, %eax
382	adc	un, %eax
383	add	%ebx, 16(rp)
384	adc	un, %eax
385	mov	%eax, 20(rp)
386
387	decl	vn
388	jnz	L(ol1)
389	jmp	L(done)
390
391C ================================================================
392	ALIGN(16)
393L(lm2):	movd	-8(up), %mm0
394	pmuludq	%mm7, %mm0
395	psrlq	$32, %mm6
396	lea	16(rp), rp
397	paddq	%mm0, %mm6
398	movd	-4(up), %mm0
399	pmuludq	%mm7, %mm0
400	movd	%mm6, -8(rp)
401	psrlq	$32, %mm6
402	paddq	%mm0, %mm6
403	movd	(up), %mm0
404	pmuludq	%mm7, %mm0
405	movd	%mm6, -4(rp)
406	psrlq	$32, %mm6
407L(of2):	paddq	%mm0, %mm6
408	movd	4(up), %mm0
409	pmuludq	%mm7, %mm0
410	movd	%mm6, (rp)
411	psrlq	$32, %mm6
412	paddq	%mm0, %mm6
413	sub	$4, un
414	movd	%mm6, 4(rp)
415	lea	16(up), up
416	ja	L(lm2)
417
418	psrlq	$32, %mm6
419	movd	%mm6, 8(rp)
420
421	decl	vn
422	jz	L(done)
423	lea	-12(rp), rp
424
425L(ol2):	mov	28(%esp), un
426	neg	un
427	lea	4(vp), vp
428	movd	(vp), %mm7	C read next V limb
429	mov	24(%esp), up
430	lea	12(rp,un,4), rp
431
432	movd	(up), %mm1
433	pmuludq	%mm7, %mm1
434	sar	$2, un
435	movd	4(up), %mm0
436	lea	4(up), up
437	movd	%mm1, %eax
438	xor	%edx, %edx	C zero edx and CF
439	jmp	L(lo2)
440
441L(la2):	movd	4(up), %mm1
442	adc	$0, %edx
443	add	%eax, 12(rp)
444	movd	%mm0, %ebx
445	pmuludq	%mm7, %mm1
446	lea	16(rp), rp
447	psrlq	$32, %mm0
448	adc	%edx, %ebx
449	movd	%mm0, %edx
450	movd	%mm1, %eax
451	movd	8(up), %mm0
452	pmuludq	%mm7, %mm0
453	adc	$0, %edx
454	add	%ebx, (rp)
455	psrlq	$32, %mm1
456	adc	%edx, %eax
457	movd	%mm1, %edx
458	movd	%mm0, %ebx
459	movd	12(up), %mm1
460	pmuludq	%mm7, %mm1
461	adc	$0, %edx
462	add	%eax, 4(rp)
463	psrlq	$32, %mm0
464	adc	%edx, %ebx
465	movd	%mm0, %edx
466	movd	%mm1, %eax
467	lea	16(up), up
468	movd	(up), %mm0
469	adc	$0, %edx
470	add	%ebx, 8(rp)
471L(lo2):	psrlq	$32, %mm1
472	adc	%edx, %eax
473	movd	%mm1, %edx
474	pmuludq	%mm7, %mm0
475	inc	un
476	jnz	L(la2)
477
478	adc	un, %edx	C un is zero here
479	add	%eax, 12(rp)
480	movd	%mm0, %ebx
481	psrlq	$32, %mm0
482	adc	%edx, %ebx
483	movd	%mm0, %eax
484	adc	un, %eax
485	add	%ebx, 16(rp)
486	adc	un, %eax
487	mov	%eax, 20(rp)
488
489	decl	vn
490	jnz	L(ol2)
491C	jmp	L(done)
492
493C ================================================================
494L(done):
495	emms
496	pop	%ebp
497	pop	%ebx
498	pop	%esi
499	pop	%edi
500	ret
501EPILOGUE()
502