xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/atom/sse2/sqr_basecase.asm (revision 9573673d78c64ea1eac42d7f2e9521be89932ae5)
dnl  x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
dnl
dnl  Copyright 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
C    4 large loops into one; we could use it for the outer loop branch.
C  * Optimise code outside of inner loops.
C  * Write combined addmul_1 feed-in and wind-down code, and use when iterating
C    each outer loop.  ("Overlapping software pipelining")
C  * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
C    all pushes.
C  * Perhaps write special code for n < M, for some small M.
C  * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
C    with even less pipelined code.
C  * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
C    Consider breaking out earlier, saving the high cost of short loops.

C void mpn_sqr_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xn);

C Register roles used throughout the function body.
define(`rp',  `%edi')	C result pointer (wp, advanced as we go)
define(`up',  `%esi')	C source pointer (xp, advanced as we go)
define(`n',   `%ecx')	C (negated) loop count / outer-loop index

define(`un',  `%ebp')	C inner-loop counter, counts up towards zero

	TEXT
	ALIGN(16)
C Squares {xp,xn} into {wp,2*xn}.  Structure, as visible below:
C   1. mul_1 pass: wp[1..xn-1] = xp[1..xn-1] * xp[0], via four 4-way unrolled
C      loops (L(lm0)..L(lm3)), one per xn mod 4 residue class.
C   2. addmul_1 outer loops L(ol0)..L(ol3): each pass loads the next invariant
C      limb into %mm7 and accumulates its products into the triangular result.
C   3. L(done) onward: add the diagonal squares xp[i]^2 while doubling the
C      triangular product in the same pass; the shifted-out/carry bit is kept
C      in n across iterations via the rcr/adc chain.
PROLOGUE(mpn_sqr_basecase)
	push	%edi
	push	%esi
	mov	12(%esp), rp	C wp
	mov	16(%esp), up	C xp
	mov	20(%esp), n	C xn

	lea	4(rp), rp	C write triangular product starting at rp[1]
	dec	n
	movd	(up), %mm7	C %mm7 = xp[0], invariant for the mul_1 pass

	jz	L(one)		C xn = 1: just square the single limb
	lea	4(up), up
	push	%ebx
	push	%ebp
	mov	n, %eax

	movd	(up), %mm0
	neg	n		C use negated count so inner loops count up
	pmuludq	%mm7, %mm0
	pxor	%mm6, %mm6	C %mm6 = running carry/accumulator
	mov	n, un

	and	$3, %eax	C dispatch on (xn-1) mod 4
	jz	L(of0)
	cmp	$2, %eax
	jc	L(of1)
	jz	L(of2)

C ================================================================
C mul_1, residue 3: enter mid-loop at L(m3).
	jmp	L(m3)
	ALIGN(16)
L(lm3):	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(m3):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 8(rp)
	lea	16(up), up
	js	L(lm3)

	psrlq	$32, %mm6
	movd	%mm6, 12(rp)	C store final high limb of the mul_1 row

	inc	n
C	jz	L(done)
  lea	-12(up), up
  lea	4(rp), rp
	jmp	L(ol2)

C ================================================================
C mul_1, residue 0: enter at loop top, L(of0).
	ALIGN(16)
L(lm0):	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
L(of0):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	12(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 12(rp)
	lea	16(up), up
	js	L(lm0)

	psrlq	$32, %mm6
	movd	%mm6, 16(rp)

	inc	n
C	jz	L(done)
  lea	-8(up), up
  lea	8(rp), rp
	jmp	L(ol3)

C ================================================================
C mul_1, residue 1: enter near loop bottom at L(of1).
	ALIGN(16)
L(lm1):	movd	-12(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -12(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of1):	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, (rp)
	lea	16(up), up
	js	L(lm1)

	psrlq	$32, %mm6
	movd	%mm6, 4(rp)

	inc	n
	jz	L(done)		C goes away when we add special n=2 code
  lea	-20(up), up
  lea	-4(rp), rp
	jmp	L(ol0)

C ================================================================
C mul_1, residue 2: enter mid-loop at L(of2).
	ALIGN(16)
L(lm2):	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of2):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 4(rp)
	lea	16(up), up
	js	L(lm2)

	psrlq	$32, %mm6
	movd	%mm6, 8(rp)

	inc	n
C	jz	L(done)
  lea	-16(up), up
C  lea	(rp), rp
C	jmp	L(ol1)		C fall through into L(ol1)

C ================================================================
C Inlined addmul_1 outer loops.  The four copies L(ol1)/L(ol0)/L(ol3)/L(ol2)
C fall through into each other (with jmp L(ol1) closing the cycle), rotating
C the mod-4 alignment as n grows towards zero by one each pass.

L(ol1):	lea	4(up,n,4), up
	movd	(up), %mm7	C read next U invariant limb
	lea	8(rp,n,4), rp
	mov	n, un

	movd	4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	inc	un
	jz	L(re1)		C only a 1-limb addmul left: wind down

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a1)

L(la1):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
L(a1):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la1)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol0):	lea	(up,n,4), up
	movd	4(up), %mm7	C read next U invariant limb
	lea	4(rp,n,4), rp
	mov	n, un

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	sar	$2, un
	movd	12(up), %mm1
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a0)

L(la0):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
L(a0):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la0)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol3):	lea	12(up,n,4), up
	movd	-8(up), %mm7	C read next U invariant limb
	lea	(rp,n,4), rp	C put rp back
	mov	n, un

	movd	-4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	movd	(up), %mm0
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a3)

L(la3):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
L(a3):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la3)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================

L(ol2):	lea	8(up,n,4), up
	movd	-4(up), %mm7	C read next U invariant limb
	lea	12(rp,n,4), rp
	mov	n, un

	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx
	sar	$2, un
	movd	4(up), %mm1
	test	un, un		C clear carry
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	inc	un
	jnz	L(a2)
	jmp	L(re2)		C only a 2-limb addmul left: wind down

L(la2):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
L(a2):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la2)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n
	jmp	L(ol1)

C ================================================================
C Wind-down for the final 2x1 (L(re2)) and 1x1 (L(re1)) addmul_1 steps.
L(re2):	psrlq	$32, %mm0
	movd	(up), %mm7	C read next U invariant limb
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	lea	4(rp), rp
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	movd	4(up), %mm1
	adc	un, %eax
	add	%ebx, (rp)
	pmuludq	%mm7, %mm1
	adc	un, %eax
	mov	%eax, 4(rp)
	movd	%mm1, %ebx

L(re1):	psrlq	$32, %mm1
	add	%ebx, 4(rp)
	movd	%mm1, %eax
	adc	un, %eax
	xor	n, n		C make n zeroness assumption below true
	mov	%eax, 8(rp)

C Final pass: rp[2i..2i+1] = 2*rp[2i..2i+1] + up[i]^2, carry kept in n.
L(done):			C n is zero here
	mov	24(%esp), up	C reload xp (stack offsets account for 4 pushes)
	mov	28(%esp), %eax	C reload xn

	movd	(up), %mm0
	inc	%eax
	pmuludq	%mm0, %mm0	C up[0]^2
	lea	4(up), up
	mov	20(%esp), rp	C reload wp
	shr	%eax		C iterations = (xn+1)/2; CF = xn even/odd parity
	movd	%mm0, (rp)
	psrlq	$32, %mm0
	lea	-12(rp), rp
	mov	%eax, 28(%esp)	C reuse the xn stack slot as loop counter
	jnc	L(odd)

	C even xn: do one half-width step before entering the main loop
	movd	%mm0, %ebp
	movd	(up), %mm0
	lea	8(rp), rp
	pmuludq	%mm0, %mm0
	lea	-4(up), up
	add	8(rp), %ebp
	movd	%mm0, %edx
	adc	12(rp), %edx
	rcr	n		C save CF in n's top bit for the next iteration
	jmp	L(ent)

C	ALIGN(16)		C alignment seems irrelevant
L(top):	movd	(up), %mm1
	adc	n, n		C restore CF saved by rcr below
	movd	%mm0, %eax
	pmuludq	%mm1, %mm1	C up[i]^2
	movd	4(up), %mm0
	adc	(rp), %eax
	movd	%mm1, %ebx
	pmuludq	%mm0, %mm0	C up[i+1]^2
	psrlq	$32, %mm1
	adc	4(rp), %ebx
	movd	%mm1, %ebp
	movd	%mm0, %edx
	adc	8(rp), %ebp
	adc	12(rp), %edx
	rcr	n		C FIXME: isn't this awfully slow on atom???
	adc	%eax, (rp)	C second rp add doubles the triangular product
	adc	%ebx, 4(rp)
L(ent):	lea	8(up), up
	adc	%ebp, 8(rp)
	psrlq	$32, %mm0
	adc	%edx, 12(rp)
L(odd):	decl	28(%esp)
	lea	16(rp), rp
	jnz	L(top)

L(end):	adc	n, n
	movd	%mm0, %eax
	adc	n, %eax
	mov	%eax, (rp)	C most significant result limb

L(rtn):	emms
	pop	%ebp
	pop	%ebx
	pop	%esi
	pop	%edi
	ret

L(one):	pmuludq	%mm7, %mm7	C xn = 1: result is the 2-limb square of xp[0]
	movq	%mm7, -4(rp)
	emms
	pop	%esi
	pop	%edi
	ret
EPILOGUE()
