xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/mul_1.asm (revision 3587d6f89c746bbb4f886219ddacd41ace480ecf)
1dnl  PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
2dnl  the result in a second limb vector.
3
4dnl  Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C               cycles/limb
35C POWER3/PPC630     6-18
36C POWER4/PPC970     7.25?  not updated for last file revision
37C POWER5            7.25
38C POWER6           14
39C POWER7            2.9
40
41C TODO
42C  * Try to reduce the number of needed live registers (at least r5 and r10
43C    could be combined)
44C  * Optimize feed-in code, for speed and size.
45C  * Clean up r12/r7 usage in feed-in code.
46
47C INPUT PARAMETERS
C Symbolic names for the argument registers.  The multiplier alias vl is
C defined here, but the code below spells that register as r6 directly.  The
C carry-in limb of mpn_mul_1c arrives in r7 and has no alias.
48define(`rp', `r3')	C result pointer: product limbs are stored through it
49define(`up', `r4')	C source pointer: limbs are loaded through it
50define(`n', `r5')	C limb count; r5 is reused as a temporary once ctr is set
51define(`vl', `r6')	C multiplier limb
52
53ASM_START()
C mpn_mul_1c -- entry point with a carry-in limb.  It performs the same
C register saves as mpn_mul_1, seeds the carry limb r12 from r7 instead of
C zero, and then continues in mpn_mul_1 at L(ent).
54PROLOGUE(mpn_mul_1c)
55	std	r27, -40(r1)	C save r27 at a negative offset from r1
56	std	r26, -48(r1)	C save r26 likewise; both are reloaded at L(ret)
57	mr	r12, r7		C cy_limb = carry-in passed in r7
58	b	L(ent)		C join mpn_mul_1 just past its own save/clear code
59EPILOGUE()
60PROLOGUE(mpn_mul_1)
C Multiply the n limbs at up by the limb in r6, store the n low-order result
C limbs at rp, and return the final carry limb in r3.  mpn_mul_1c joins at
C L(ent) with its carry-in already in r12.
C
C Register roles in this routine:
C   r12       carry limb: the carry-in at first, later the high product half
C             that still has to be added into the next result limb
C   r26, r27  source limbs currently being multiplied (saved and restored)
C   r0, r7, r9, r11   low product halves, turned into result limbs
C   r5, r8, r10       high product halves (r5 no longer holds n by then)
C   ctr       (n + 3) >> 2, counted down to decide when L(top) is left
C   CA        carry bit chained through addc/adde/addze from limb to limb
C
C Layout: the feed-in code handles 1 limb (L(b11)), 2 limbs (L(b00)) or
C 1 or 3 limbs (L(b01)/L(gt1)); L(b10) needs no feed-in.  After that L(top)
C handles four limbs per iteration and L(end) handles the final two.
C NOTE(review): a limb is loaded from 0(up) unconditionally, so n = 0 is not
C handled here; presumably callers guarantee n >= 1 -- confirm.
61	std	r27, -40(r1)	C save r27 at a negative offset from r1
62	std	r26, -48(r1)	C save r26 likewise; both are reloaded at L(ret)
63	li	r12, 0		C cy_limb = 0
64L(ent):	ld	r26, 0(up)	C first source limb, needed by every path
65
66	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
67	cmpdi	cr6, r0, 2	C cr6 = (n & 3) compared with 2
C addic rather than addi: it also writes CA.  Unless n + 3 wraps, CA ends up
C clear, which the L(b10) path depends on because it reaches its first adde
C without executing an addc.
68	addic	n, n, 3		C compute count...
69	srdi	n, n, 2		C ...for ctr
70	mtctr	n		C copy count into ctr
71	beq	cr0, L(b00)	C n & 3 == 0
72	blt	cr6, L(b01)	C n & 3 == 1
73	beq	cr6, L(b10)	C n & 3 == 2
74
C n & 3 == 3: handle one limb here, then join the common path at L(fic).
75L(b11):	mr	r7, r12		C keep the carry-in; r12 is overwritten next
76	mulld	r0, r26, r6	C low half of limb * r6
77	mulhdu	r12, r26, r6	C high half, becomes the new carry limb
78	addi	up, up, 8
79	addc	r0, r0, r7	C add carry-in; CA stays pending for the next adde
80	std	r0, 0(rp)
81	addi	rp, rp, 8
82	b	L(fic)
83
C n & 3 == 0: handle two limbs here, then join the common path at L(fic).
84L(b00):	ld	r27, 8(up)	C second source limb
85	addi	up, up, 16
86	mulld	r0, r26, r6
87	mulhdu	r5, r26, r6
88	mulld	r7, r27, r6
89	mulhdu	r8, r27, r6
90	addc	r0, r0, r12	C add the carry-in
91	adde	r7, r7, r5
C A product high half is at most 2^64-2, so adding CA to it cannot carry
C out: r12 absorbs the carry and CA is clear afterwards.
92	addze	r12, r8		C new carry limb = last high half + CA
93	std	r0, 0(rp)
94	std	r7, 8(rp)
95	addi	rp, rp, 16
96	b	L(fic)
97
98	nop			C alignment
C n & 3 == 1: bdnz decrements ctr; it was 1 only when n == 1.
99L(b01):	bdnz	L(gt1)		C more than one limb: go handle three
100	mulld	r0, r26, r6
101	mulhdu	r8, r26, r6	C high half goes to r8, which L(ret) adds to CA
102	addc	r0, r0, r12	C add the carry-in
103	std	r0, 0(rp)
104	b	L(ret)
C n & 3 == 1 and n > 1: handle three limbs, then join the common path.
105L(gt1):	ld	r27, 8(up)	C second source limb
106	nop			C presumably padding, like the nop before L(b01)
107	mulld	r0, r26, r6
108	mulhdu	r5, r26, r6
109	ld	r26, 16(up)	C third source limb (first one is consumed)
110	mulld	r7, r27, r6
111	mulhdu	r8, r27, r6
112	mulld	r9, r26, r6
113	mulhdu	r10, r26, r6
114	addc	r0, r0, r12	C add the carry-in
115	adde	r7, r7, r5
116	adde	r9, r9, r8
117	addze	r12, r10	C new carry limb = last high half + CA
118	std	r0, 0(rp)
119	std	r7, 8(rp)
120	std	r9, 16(rp)
121	addi	up, up, 24
122	addi	rp, rp, 24
123	b	L(fic)
124
125	nop			C presumably padding, like the nop before L(b01)
C Common path: fetch the next pair of source limbs.  L(b10) is entered
C directly when n & 3 == 2, with r26 still holding the limb loaded at L(ent).
126L(fic):	ld	r26, 0(up)
127L(b10):	ld	r27, 8(up)
128	addi	up, up, 16
129	bdz	L(end)		C ctr reached 0: only this pair remains
130
C Main loop, four limbs per iteration.  On entry r26/r27 hold the first two
C limbs of the group and up already points past them.  The carry chain runs
C r12 -> r0, r5 -> r7, r8 -> r9, r10 -> r11, with CA linking the adde steps.
131L(top):	mulld	r0, r26, r6
132	mulhdu	r5, r26, r6
133	mulld	r7, r27, r6
134	mulhdu	r8, r27, r6
135	ld	r26, 0(up)	C third limb of this group
136	ld	r27, 8(up)	C fourth limb of this group
137	adde	r0, r0, r12	C add previous high half plus pending CA
138	adde	r7, r7, r5
139	mulld	r9, r26, r6
140	mulhdu	r10, r26, r6
141	mulld	r11, r27, r6
142	mulhdu	r12, r27, r6	C high half carried into the next group
143	ld	r26, 16(up)	C preload the following pair for the next
144	ld	r27, 24(up)	C iteration or for L(end)
145	std	r0, 0(rp)
146	adde	r9, r9, r8
147	std	r7, 8(rp)
148	adde	r11, r11, r10	C CA from here feeds the next adde into r0
149	std	r9, 16(rp)
150	addi	up, up, 32
151	std	r11, 24(rp)
152
153	addi	rp, rp, 32
154	bdnz	L(top)
155
C Wind-down: the final two limbs are already in r26/r27.
156L(end):	mulld	r0, r26, r6
157	mulhdu	r5, r26, r6
158	mulld	r7, r27, r6
159	mulhdu	r8, r27, r6
160	adde	r0, r0, r12	C add previous high half plus pending CA
161	adde	r7, r7, r5
162	std	r0, 0(rp)
163	std	r7, 8(rp)
164L(ret):	addze	r3, r8		C return value = last high half + CA
165	ld	r27, -40(r1)	C restore the registers saved on entry
166	ld	r26, -48(r1)
167	blr
168EPILOGUE()
169