xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/mul_basecase.asm (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1dnl  PowerPC-64 mpn_mul_basecase.
2
3dnl  Copyright 1999-2001, 2003-2006, 2008 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C                  cycles/limb
34C POWER3/PPC630         6-18
35C POWER4/PPC970          8
36C POWER5                 8
37C POWER6                24
38
39C INPUT PARAMETERS
40define(`rp', `r3')	C result (product) pointer
41define(`up', `r4')	C first source pointer
42define(`un', `r5')	C first source size in limbs
43define(`vp', `r6')	C second source pointer
44define(`vn', `r7')	C second source size in limbs
45
C Registers used by the un > 2 code (all callee-saved, spilled at un_gt2).
46define(`v0',	   `r25')	C current limb of {vp,vn} for the inner loops
47define(`outer_rp', `r22')	C original rp, stepped one limb per outer pass
48define(`outer_up', `r23')	C original up, re-derived each outer pass
49
50ASM_START()
51PROLOGUE(mpn_mul_basecase)
52
53C Special code for un <= 2, for efficiency of these important cases,
54C and since it simplifies the default code.
C NOTE(review): assumes the usual mpn_mul_basecase contract un >= vn >= 1
C (not checked here) -- confirm against the generic C implementation.
55	cmpdi	cr0, un, 2
56	bgt	cr0, L(un_gt2)
57	cmpdi	cr6, vn, 1	C cr6 = (vn ? 1), consumed at L(2x)
58	ld	r7, 0(vp)	C r7 = vp[0]; clobbers vn's reg (cr6 keeps vn)
59	ld	r5, 0(up)	C r5 = up[0]; clobbers un's reg (cr0 keeps un)
60	mulld	r8, r5, r7	C weight 0
61	mulhdu	r9, r5, r7	C weight 1
62	std	r8, 0(rp)
63	beq	cr0, L(2x)	C un == 2?
C un == 1 (hence vn == 1): store the high limb and return.
64	std	r9, 8(rp)
65	blr
66	ALIGN(16)
C un == 2: accumulate up[1]*vp[0] into weights 1 and 2.
67L(2x):	ld	r0, 8(up)
68	mulld	r8, r0, r7	C weight 1
69	mulhdu	r10, r0, r7	C weight 2
70	addc	r9, r9, r8
71	addze	r10, r10
72	bne	cr6, L(2x2)	C vn == 2?
C 2x1 case: full product now in r9:r10.
73	std	r9, 8(rp)
74	std	r10, 16(rp)
75	blr
76	ALIGN(16)
C 2x2 case: add up[0..1]*vp[1] at weights 1..3.
77L(2x2):	ld	r6, 8(vp)
78	nop
79	mulld	r8, r5, r6	C weight 1
80	mulhdu	r11, r5, r6	C weight 2
81	addc	r9, r9, r8
82	std	r9, 8(rp)
83	adde	r11, r11, r10
84	mulld	r12, r0, r6	C weight 2
85	mulhdu	r0, r0, r6	C weight 3
86	addze	r0, r0	C fold CA from the weight-2 adde above
87	addc	r11, r11, r12
88	addze	r0, r0	C fold the second carry into the top limb
89	std	r11, 16(rp)
90	std	r0, 24(rp)
91	blr
92
93L(un_gt2):
C Spill callee-saved r22..r31 at negative offsets below r1 (no frame is
C built; relies on the PPC64 ELF ABI's protected area under the SP).
94	std	r31, -8(r1)
95	std	r30, -16(r1)
96	std	r29, -24(r1)
97	std	r28, -32(r1)
98	std	r27, -40(r1)
99	std	r26, -48(r1)
100	std	r25, -56(r1)
101	std	r24, -64(r1)
102	std	r23, -72(r1)
103	std	r22, -80(r1)
104
C Keep the original rp/up so each outer pass can recompute its pointers.
105	mr	outer_rp, rp
106	mr	outer_up, up
107
108	ld	v0, 0(vp)	C new v limb
109	addi	vp, vp, 8
110	ld	r26, 0(up)
111
C Dispatch on un mod 4; each residue has its own software-pipelined entry
C and wind-down around a common 4-way-unrolled inner loop.
112	rldicl.	r0, un, 0,62	C r0 = n & 3, set cr0
113	cmpdi	cr6, r0, 2
114	addi	un, un, 1	C compute count...
115	srdi	un, un, 2	C ...for ctr
116	mtctr	un		C copy inner loop count into ctr
117	beq	cr0, L(b0)	C un mod 4 == 0
118	blt	cr6, L(b1)	C un mod 4 == 1
119	beq	cr6, L(b2)	C un mod 4 == 2
C Fall through: un mod 4 == 3.
121
C un mod 4 == 3 entry.  First pass: plain mul of {up,un} by v limb 0 into
C rp (no addend).  One limb is peeled here; the loop does 4 limbs per
C iteration and L(end_m_3) finishes the last two.
122	ALIGN(16)
123L(b3):	mulld	r0, r26, v0
124	mulhdu	r12, r26, v0
125	addic	r0, r0, 0	C CA := 0, priming the adde chain below
126	std	r0, 0(rp)
127	ld	r26, 8(up)
128	ld	r27, 16(up)
129	bdz	L(end_m_3)
130
C First-pass inner loop: the carry propagates across iterations solely
C through CA (adde only), so no flag-clobbering ops may intervene.
131	ALIGN(16)
132L(lo_m_3):
133	mulld	r0, r26, v0
134	mulhdu	r31, r26, v0
135	ld	r26, 24(up)
136	nop
137	mulld	r24, r27, v0
138	mulhdu	r8, r27, v0
139	ld	r27, 32(up)
140	nop
141	adde	r0, r0, r12
142	adde	r24, r24, r31
143	mulld	r9, r26, v0
144	mulhdu	r10, r26, v0
145	ld	r26, 40(up)
146	nop
147	mulld	r11, r27, v0
148	mulhdu	r12, r27, v0
149	ld	r27, 48(up)
150	std	r0, 8(rp)
151	adde	r9, r9, r8
152	std	r24, 16(rp)
153	adde	r11, r11, r10
154	std	r9, 24(rp)
155	addi	up, up, 32
156	std	r11, 32(rp)
157	addi	rp, rp, 32
158	bdnz	L(lo_m_3)
159
C First-pass wind-down: last two limbs, store the top limb, then either
C return (vn exhausted) or fall into the outer addmul loop.
160	ALIGN(16)
161L(end_m_3):
162	mulld	r0, r26, v0
163	mulhdu	r31, r26, v0
164
165	mulld	r24, r27, v0
166	mulhdu	r8, r27, v0
167
168	adde	r0, r0, r12
169	adde	r24, r24, r31
170
171	std	r0, 8(rp)
172	std	r24, 16(rp)
173	addze	r8, r8
174	std	r8, 24(rp)
175	addic.	vn, vn, -1
176	beq	L(ret)
177
C Outer loop: one pass per remaining v limb -- addmul {up,un} by the new
C v0 into {rp,...}; rp advances one limb each pass, one limb peeled.
178	ALIGN(16)
179L(outer_lo_3):
180	mtctr	un		C copy inner loop count into ctr
181	addi	rp, outer_rp, 8
182	mr	up, outer_up
183	addi	outer_rp, outer_rp, 8
184	ld	v0, 0(vp)	C new v limb
185	addi	vp, vp, 8
186	ld	r26, 0(up)
187	ld	r28, 0(rp)
188	mulld	r0, r26, v0
189	mulhdu	r12, r26, v0
190	addc	r0, r0, r28
191	std	r0, 0(rp)
192	ld	r26, 8(up)
193	ld	r27, 16(up)
194	bdz	L(end_3)
195
C Addmul inner loop: accumulate the product chain (adde), flush its final
C carry with addze, then add the four existing rp limbs (addc/adde).
196	ALIGN(16)		C registers dying
197L(lo_3):
198	mulld	r0, r26, v0	C
199	mulhdu	r10, r26, v0	C 26
200	ld	r26, 24(up)	C
201	ld	r28, 8(rp)	C
202	mulld	r24, r27, v0	C
203	mulhdu	r8, r27, v0	C 27
204	ld	r27, 32(up)	C
205	ld	r29, 16(rp)	C
206	adde	r0, r0, r12	C 0 12
207	adde	r24, r24, r10	C 24 10
208	mulld	r9, r26, v0	C
209	mulhdu	r10, r26, v0	C 26
210	ld	r26, 40(up)	C
211	ld	r30, 24(rp)	C
212	mulld	r11, r27, v0	C
213	mulhdu	r12, r27, v0	C 27
214	ld	r27, 48(up)	C
215	ld	r31, 32(rp)	C
216	adde	r9, r9, r8	C 8 9
217	adde	r11, r11, r10	C 10 11
218	addze	r12, r12	C 12
219	addc	r0, r0, r28	C 0 28
220	std	r0, 8(rp)	C 0
221	adde	r24, r24, r29	C 7 29
222	std	r24, 16(rp)	C 7
223	adde	r9, r9, r30	C 9 30
224	std	r9, 24(rp)	C 9
225	adde	r11, r11, r31	C 11 31
226	std	r11, 32(rp)	C 11
227	addi	up, up, 32	C
228	addi	rp, rp, 32	C
229	bdnz	L(lo_3)	C
230
C Addmul wind-down: last two limbs plus rp addends; loop while v remains.
231	ALIGN(16)
232L(end_3):
233	mulld	r0, r26, v0
234	mulhdu	r10, r26, v0
235	ld	r28, 8(rp)
236	nop
237	mulld	r24, r27, v0
238	mulhdu	r8, r27, v0
239	ld	r29, 16(rp)
240	nop
241	adde	r0, r0, r12
242	adde	r24, r24, r10
243	addze	r8, r8
244	addc	r0, r0, r28
245	std	r0, 8(rp)
246	adde	r24, r24, r29
247	std	r24, 16(rp)
248	addze	r8, r8
249	std	r8, 24(rp)
250
251	addic.	vn, vn, -1
252	bne	L(outer_lo_3)
253	b	L(ret)
254
255
C un mod 4 == 0 entry.  First pass: peel two limbs (addc/addze starts the
C carry chain), then the common 4-way loop; structure parallels L(b3).
256	ALIGN(16)
257L(b0):	ld	r27, 8(up)
258	addi	up, up, 8
259	mulld	r0, r26, v0
260	mulhdu	r10, r26, v0
261	mulld	r24, r27, v0
262	mulhdu	r8, r27, v0
263	addc	r24, r24, r10
264	addze	r12, r8	C r12 = high limb carried into the loop's adde chain
265	std	r0, 0(rp)
266	std	r24, 8(rp)
267	addi	rp, rp, 8
268	ld	r26, 8(up)
269	ld	r27, 16(up)
270	bdz	L(end_m_0)
271
C First-pass inner loop (identical schedule to L(lo_m_3)).
272	ALIGN(16)
273L(lo_m_0):
274	mulld	r0, r26, v0
275	mulhdu	r31, r26, v0
276	ld	r26, 24(up)
277	nop
278	mulld	r24, r27, v0
279	mulhdu	r8, r27, v0
280	ld	r27, 32(up)
281	nop
282	adde	r0, r0, r12
283	adde	r24, r24, r31
284	mulld	r9, r26, v0
285	mulhdu	r10, r26, v0
286	ld	r26, 40(up)
287	nop
288	mulld	r11, r27, v0
289	mulhdu	r12, r27, v0
290	ld	r27, 48(up)
291	std	r0, 8(rp)
292	adde	r9, r9, r8
293	std	r24, 16(rp)
294	adde	r11, r11, r10
295	std	r9, 24(rp)
296	addi	up, up, 32
297	std	r11, 32(rp)
298	addi	rp, rp, 32
299	bdnz	L(lo_m_0)
300
C First-pass wind-down: final two limbs; return if vn is exhausted.
301	ALIGN(16)
302L(end_m_0):
303	mulld	r0, r26, v0
304	mulhdu	r31, r26, v0
305
306	mulld	r24, r27, v0
307	mulhdu	r8, r27, v0
308
309	adde	r0, r0, r12
310	adde	r24, r24, r31
311
312	std	r0, 8(rp)
313	addze	r8, r8
314	std	r24, 16(rp)
315	addic.	vn, vn, -1
316	std	r8, 24(rp)
317	nop
318	beq	L(ret)
319
C Outer addmul pass: rp/up biased (+16/+8) so the peeled two limbs use
C -8/0 offsets and the loop's fixed 8..48 offsets line up.
320	ALIGN(16)
321L(outer_lo_0):
322	mtctr	un		C copy inner loop count into ctr
323	addi	rp, outer_rp, 16
324	addi	up, outer_up, 8
325	addi	outer_rp, outer_rp, 8
326	ld	v0, 0(vp)	C new v limb
327	addi	vp, vp, 8
328	ld	r26, -8(up)
329	ld	r27, 0(up)
330	ld	r28, -8(rp)
331	ld	r29, 0(rp)
332	nop
333	nop
334	mulld	r0, r26, v0
335	mulhdu	r10, r26, v0
336	mulld	r24, r27, v0
337	mulhdu	r8, r27, v0
338	addc	r24, r24, r10
339	addze	r12, r8
340	addc	r0, r0, r28
341	std	r0, -8(rp)
342	adde	r24, r24, r29
343	std	r24, 0(rp)
344	ld	r26, 8(up)
345	ld	r27, 16(up)
346	bdz	L(end_0)
347
C Addmul inner loop (identical schedule to L(lo_3)).
348	ALIGN(16)		C registers dying
349L(lo_0):
350	mulld	r0, r26, v0	C
351	mulhdu	r10, r26, v0	C 26
352	ld	r26, 24(up)	C
353	ld	r28, 8(rp)	C
354	mulld	r24, r27, v0	C
355	mulhdu	r8, r27, v0	C 27
356	ld	r27, 32(up)	C
357	ld	r29, 16(rp)	C
358	adde	r0, r0, r12	C 0 12
359	adde	r24, r24, r10	C 24 10
360	mulld	r9, r26, v0	C
361	mulhdu	r10, r26, v0	C 26
362	ld	r26, 40(up)	C
363	ld	r30, 24(rp)	C
364	mulld	r11, r27, v0	C
365	mulhdu	r12, r27, v0	C 27
366	ld	r27, 48(up)	C
367	ld	r31, 32(rp)	C
368	adde	r9, r9, r8	C 8 9
369	adde	r11, r11, r10	C 10 11
370	addze	r12, r12	C 12
371	addc	r0, r0, r28	C 0 28
372	std	r0, 8(rp)	C 0
373	adde	r24, r24, r29	C 7 29
374	std	r24, 16(rp)	C 7
375	adde	r9, r9, r30	C 9 30
376	std	r9, 24(rp)	C 9
377	adde	r11, r11, r31	C 11 31
378	std	r11, 32(rp)	C 11
379	addi	up, up, 32	C
380	addi	rp, rp, 32	C
381	bdnz	L(lo_0)	C
382
C Addmul wind-down; loop back while v limbs remain.
383	ALIGN(16)
384L(end_0):
385	mulld	r0, r26, v0
386	mulhdu	r10, r26, v0
387	ld	r28, 8(rp)
388	nop
389	mulld	r24, r27, v0
390	mulhdu	r8, r27, v0
391	ld	r29, 16(rp)
392	nop
393	adde	r0, r0, r12
394	adde	r24, r24, r10
395	addze	r8, r8
396	addic.	vn, vn, -1
397	addc	r0, r0, r28
398	std	r0, 8(rp)
399	adde	r24, r24, r29
400	std	r24, 16(rp)
401	addze	r8, r8
402	std	r8, 24(rp)
403	bne	L(outer_lo_0)
404	b	L(ret)
405
406
C un mod 4 == 1 entry.  First pass: peel three limbs, then the common
C 4-way loop; structure parallels L(b3)/L(b0).
407	ALIGN(16)
408L(b1):	ld	r27, 8(up)
409	nop
410	mulld	r0, r26, v0
411	mulhdu	r31, r26, v0
412	ld	r26, 16(up)
413	mulld	r24, r27, v0
414	mulhdu	r8, r27, v0
415	mulld	r9, r26, v0
416	mulhdu	r10, r26, v0
417	addc	r24, r24, r31
418	adde	r9, r9, r8
419	addze	r12, r10	C r12 = high limb carried into the loop's adde chain
420	std	r0, 0(rp)
421	std	r24, 8(rp)
422	std	r9, 16(rp)
423	addi	up, up, 16
424	addi	rp, rp, 16
425	ld	r26, 8(up)
426	ld	r27, 16(up)
427	bdz	L(end_m_1)
428
C First-pass inner loop (identical schedule to L(lo_m_3)).
429	ALIGN(16)
430L(lo_m_1):
431	mulld	r0, r26, v0
432	mulhdu	r31, r26, v0
433	ld	r26, 24(up)
434	nop
435	mulld	r24, r27, v0
436	mulhdu	r8, r27, v0
437	ld	r27, 32(up)
438	nop
439	adde	r0, r0, r12
440	adde	r24, r24, r31
441	mulld	r9, r26, v0
442	mulhdu	r10, r26, v0
443	ld	r26, 40(up)
444	nop
445	mulld	r11, r27, v0
446	mulhdu	r12, r27, v0
447	ld	r27, 48(up)
448	std	r0, 8(rp)
449	adde	r9, r9, r8
450	std	r24, 16(rp)
451	adde	r11, r11, r10
452	std	r9, 24(rp)
453	addi	up, up, 32
454	std	r11, 32(rp)
455	addi	rp, rp, 32
456	bdnz	L(lo_m_1)
457
C First-pass wind-down: final two limbs; return if vn is exhausted.
458	ALIGN(16)
459L(end_m_1):
460	mulld	r0, r26, v0
461	mulhdu	r31, r26, v0
462
463	mulld	r24, r27, v0
464	mulhdu	r8, r27, v0
465
466	adde	r0, r0, r12
467	adde	r24, r24, r31
468
469	std	r0, 8(rp)
470	addze	r8, r8
471	std	r24, 16(rp)
472	addic.	vn, vn, -1
473	std	r8, 24(rp)
474	nop
475	beq	L(ret)
476
C Outer addmul pass: rp/up biased (+24/+16) so the three peeled limbs use
C -16/-8/0 offsets and the loop's fixed 8..48 offsets line up.
477	ALIGN(16)
478L(outer_lo_1):
479	mtctr	un		C copy inner loop count into ctr
480	addi	rp, outer_rp, 24
481	addi	up, outer_up, 16
482	addi	outer_rp, outer_rp, 8
483	ld	v0, 0(vp)	C new v limb
484	addi	vp, vp, 8
485	ld	r26, -16(up)
486	ld	r27, -8(up)
487	mulld	r0, r26, v0
488	mulhdu	r31, r26, v0
489	ld	r26, 0(up)
490	ld	r28, -16(rp)
491	mulld	r24, r27, v0
492	mulhdu	r8, r27, v0
493	ld	r29, -8(rp)
494	ld	r30, 0(rp)
495	mulld	r9, r26, v0
496	mulhdu	r10, r26, v0
497	addc	r24, r24, r31
498	adde	r9, r9, r8
499	addze	r12, r10
500	addc	r0, r0, r28
501	std	r0, -16(rp)
502	adde	r24, r24, r29
503	std	r24, -8(rp)
504	adde	r9, r9, r30
505	std	r9, 0(rp)
506	ld	r26, 8(up)
507	ld	r27, 16(up)
508	bdz	L(end_1)
509
C Addmul inner loop (identical schedule to L(lo_3)).
510	ALIGN(16)		C registers dying
511L(lo_1):
512	mulld	r0, r26, v0	C
513	mulhdu	r10, r26, v0	C 26
514	ld	r26, 24(up)	C
515	ld	r28, 8(rp)	C
516	mulld	r24, r27, v0	C
517	mulhdu	r8, r27, v0	C 27
518	ld	r27, 32(up)	C
519	ld	r29, 16(rp)	C
520	adde	r0, r0, r12	C 0 12
521	adde	r24, r24, r10	C 24 10
522	mulld	r9, r26, v0	C
523	mulhdu	r10, r26, v0	C 26
524	ld	r26, 40(up)	C
525	ld	r30, 24(rp)	C
526	mulld	r11, r27, v0	C
527	mulhdu	r12, r27, v0	C 27
528	ld	r27, 48(up)	C
529	ld	r31, 32(rp)	C
530	adde	r9, r9, r8	C 8 9
531	adde	r11, r11, r10	C 10 11
532	addze	r12, r12	C 12
533	addc	r0, r0, r28	C 0 28
534	std	r0, 8(rp)	C 0
535	adde	r24, r24, r29	C 7 29
536	std	r24, 16(rp)	C 7
537	adde	r9, r9, r30	C 9 30
538	std	r9, 24(rp)	C 9
539	adde	r11, r11, r31	C 11 31
540	std	r11, 32(rp)	C 11
541	addi	up, up, 32	C
542	addi	rp, rp, 32	C
543	bdnz	L(lo_1)	C
544
C Addmul wind-down; loop back while v limbs remain.
545	ALIGN(16)
546L(end_1):
547	mulld	r0, r26, v0
548	mulhdu	r10, r26, v0
549	ld	r28, 8(rp)
550	nop
551	mulld	r24, r27, v0
552	mulhdu	r8, r27, v0
553	ld	r29, 16(rp)
554	nop
555	adde	r0, r0, r12
556	adde	r24, r24, r10
557	addze	r8, r8
558	addic.	vn, vn, -1
559	addc	r0, r0, r28
560	std	r0, 8(rp)
561	adde	r24, r24, r29
562	std	r24, 16(rp)
563	addze	r8, r8
564	std	r8, 24(rp)
565	bne	L(outer_lo_1)
566	b	L(ret)
567
568
C un mod 4 == 2 entry.  No limbs are peeled: up/rp are biased down one
C limb so the loop's fixed 8..48 offsets line up, and the loop is entered
C with r12 = 0 as the incoming high limb and CA cleared.
569	ALIGN(16)
570L(b2):	ld	r27, 8(up)
571	addi	up, up, -8
572	addi	rp, rp, -8
573	li	r12, 0
574	addic	r12, r12, 0	C CA := 0, priming the adde chain
575
C First-pass inner loop (identical schedule to L(lo_m_3)); also serves as
C the entry iteration since nothing was peeled.
576	ALIGN(16)
577L(lo_m_2):
578	mulld	r0, r26, v0
579	mulhdu	r31, r26, v0
580	ld	r26, 24(up)
581	nop
582	mulld	r24, r27, v0
583	mulhdu	r8, r27, v0
584	ld	r27, 32(up)
585	nop
586	adde	r0, r0, r12
587	adde	r24, r24, r31
588	mulld	r9, r26, v0
589	mulhdu	r10, r26, v0
590	ld	r26, 40(up)
591	nop
592	mulld	r11, r27, v0
593	mulhdu	r12, r27, v0
594	ld	r27, 48(up)
595	std	r0, 8(rp)
596	adde	r9, r9, r8
597	std	r24, 16(rp)
598	adde	r11, r11, r10
599	std	r9, 24(rp)
600	addi	up, up, 32
601	std	r11, 32(rp)
602
603	addi	rp, rp, 32
604	bdnz	L(lo_m_2)
605
C First-pass wind-down: final two limbs; return if vn is exhausted.
606	ALIGN(16)
607L(end_m_2):
608	mulld	r0, r26, v0
609	mulhdu	r31, r26, v0
610
611	mulld	r24, r27, v0
612	mulhdu	r8, r27, v0
613
614	adde	r0, r0, r12
615	adde	r24, r24, r31
616
617	std	r0, 8(rp)
618	addze	r8, r8
619	std	r24, 16(rp)
620	addic.	vn, vn, -1
621	std	r8, 24(rp)
622	nop
623	beq	L(ret)
624
C Outer addmul pass: same down-by-one-limb bias (rp+0, up-8) and the same
C r12 = 0 / CA = 0 entry; falls straight into L(lo_2).
625	ALIGN(16)
626L(outer_lo_2):
627	mtctr	un		C copy inner loop count into ctr
628	addi	rp, outer_rp, 0
629	addi	up, outer_up, -8
630	addi	outer_rp, outer_rp, 8
631	ld	v0, 0(vp)	C new v limb
632	addi	vp, vp, 8
633	ld	r26, 8(up)
634	ld	r27, 16(up)
635	li	r12, 0
636	addic	r12, r12, 0	C CA := 0, priming the adde chain
637
C Addmul inner loop (identical schedule to L(lo_3)).
638	ALIGN(16)		C registers dying
639L(lo_2):
640	mulld	r0, r26, v0	C
641	mulhdu	r10, r26, v0	C 26
642	ld	r26, 24(up)	C
643	ld	r28, 8(rp)	C
644	mulld	r24, r27, v0	C
645	mulhdu	r8, r27, v0	C 27
646	ld	r27, 32(up)	C
647	ld	r29, 16(rp)	C
648	adde	r0, r0, r12	C 0 12
649	adde	r24, r24, r10	C 24 10
650	mulld	r9, r26, v0	C
651	mulhdu	r10, r26, v0	C 26
652	ld	r26, 40(up)	C
653	ld	r30, 24(rp)	C
654	mulld	r11, r27, v0	C
655	mulhdu	r12, r27, v0	C 27
656	ld	r27, 48(up)	C
657	ld	r31, 32(rp)	C
658	adde	r9, r9, r8	C 8 9
659	adde	r11, r11, r10	C 10 11
660	addze	r12, r12	C 12
661	addc	r0, r0, r28	C 0 28
662	std	r0, 8(rp)	C 0
663	adde	r24, r24, r29	C 7 29
664	std	r24, 16(rp)	C 7
665	adde	r9, r9, r30	C 9 30
666	std	r9, 24(rp)	C 9
667	adde	r11, r11, r31	C 11 31
668	std	r11, 32(rp)	C 11
669	addi	up, up, 32	C
670	addi	rp, rp, 32	C
671	bdnz	L(lo_2)	C
672
C Addmul wind-down; loop back while v limbs remain.
673	ALIGN(16)
674L(end_2):
675	mulld	r0, r26, v0
676	mulhdu	r10, r26, v0
677	ld	r28, 8(rp)
678	nop
679	mulld	r24, r27, v0
680	mulhdu	r8, r27, v0
681	ld	r29, 16(rp)
682	nop
683	adde	r0, r0, r12
684	adde	r24, r24, r10
685	addze	r8, r8
686	addic.	vn, vn, -1
687	addc	r0, r0, r28
688	std	r0, 8(rp)
689	adde	r24, r24, r29
690	std	r24, 16(rp)
691	addze	r8, r8
692	std	r8, 24(rp)
693	bne	L(outer_lo_2)
694	b	L(ret)
695
696
C Common exit: restore callee-saved r22..r31 from below the SP and return.
697L(ret):	ld	r31, -8(r1)
698	ld	r30, -16(r1)
699	ld	r29, -24(r1)
700	ld	r28, -32(r1)
701	ld	r27, -40(r1)
702	ld	r26, -48(r1)
703	ld	r25, -56(r1)
704	ld	r24, -64(r1)
705	ld	r23, -72(r1)
706	ld	r22, -80(r1)
707	blr
708EPILOGUE()
709