xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/p9/addmul_2.asm (revision 9fb66d812c00ebfb445c0b47dea128f32aa6fe96)
1dnl  Power9 mpn_addmul_2.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2018 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C         cycles/limb
25C power9:    1.62
26
27C STATUS
28C  * Not written with any power9 pipeline understanding.
29C  * The 4x unrolling was not motivated by any timing tests.
30C  * No local scheduling for performance tweaking has been done.
31C  * Decrease load scheduling!
32
C  Register names.  Parameters arrive in r3..r6 (ELFv2 argument order);
C  vp is only needed until both multiplier limbs have been loaded, after
C  which r6 is renamed v1.
33define(`rp', `r3')		C result/addend pointer (also return-value reg)
34define(`up', `r4')		C multiplicand pointer
35define(`n',  `r5')		C limb count; Note: Reused as scratch
36define(`vp', `r6')		C multiplier pointer; Note: Reused for v1
37
38define(`v0', `r7')		C multiplier limb vp[0]
39define(`v1', `r6')		C multiplier limb vp[1] (overlays vp)
41
C  mp_limb_t mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C
C  Add up[0..n-1] * {vp[0],vp[1]} into rp[], returning the most significant
C  limb in r3.  (Standard GMP addmul_2 contract; NOTE(review): the minimum
C  supported n appears to be 2, given the L(2) short-cut -- confirm against
C  gmp-impl.h.)
C
C  Two independent carry chains run through the whole function: one in CA
C  (consumed by adde/addze) and one in OV (consumed by addex, which with
C  CY=0 uses OV as a second carry bit, ISA 3.0).  maddld/maddhdu are the
C  ISA 3.0 64x64+64 fused multiply-add low/high-unsigned instructions.
C  The loop is unrolled 4x with four entry points selected by n mod 4.
42ASM_START()
43PROLOGUE(mpn_addmul_2)
C  Save callee-saved r26..r31 at negative offsets from r1; no stack frame
C  is established (leaf function).
44	std	r26, -48(r1)
45	std	r27, -40(r1)
46	std	r28, -32(r1)
47	std	r29, -24(r1)
48	std	r30, -16(r1)
49	std	r31, -8(r1)
50
C  Prime both carry chains to zero before any adde/addex executes.
51	subfic	r0, r1, 0	C clear CA (0 - r1 borrows since r1 != 0)
52	subfo	r0, r0, r0	C clear OV and r0
53
54	cmpdi	cr7, n, 4	C cr7 stays live: flags the short n < 4 cases
55
56	ld	v0, 0(vp)	C fetch both multiplier limbs; r6 is v1 from here on
57	ld	v1, 8(vp)
58
59	srdi	r10, n, 2	C ctr = floor(n/4) iterations of the unrolled loop
60	mtctr	r10
61
62	rldicl.	r9, n, 0, 63	C isolate n mod 2 into cr0
63	bne	cr0, L(bx1)	C odd n -> bx1 path
64
C  Even n.  Start the first column of products, then dispatch on bit 1 of n.
65L(bx0):	rldicl. r9, n, 63, 63	C cr0 := bit 1 of n (n = 0 vs 2 mod 4)
66
67	ld	r28, 0(rp)
68	ld	r8, 0(up)
69	ld	r11, 8(rp)
70	ld	r9, 8(up)
71	maddld(	r26, r8, v0, r28)	C lo(up[0]*v0 + rp[0])
72	maddhdu(r31, r8, v0, r28)	C hi(up[0]*v0 + rp[0])
73	blt	cr7, L(2)		C n == 2: skip the loop entirely
74	ld	r28, 16(rp)
75	mulld	r5, r8, v1		C up[0]*v1 (no addend in first column)
76	mulhdu	r10, r8, v1
77	bne	cr0, L(b10)
78
C  n = 0 (mod 4): bias pointers so the loop body can be entered at L(lo0).
79L(b00):	addi	up, up, -8
80	addi	rp, rp, -24
81	b	L(lo0)
82
C  n = 2 (mod 4), n >= 6: bias pointers for entry at L(lo2).
83L(b10):	addi	up, up, 8
84	addi	rp, rp, -8
85	b	L(lo2)
86
C  n == 2: set up the v1 products and finish via the common 2-limb tail.
87L(2):	addi	rp, rp, -8
88	mulld	r5, r8, v1
89	mulhdu	r10, r8, v1
90	b	L(cj2)
91
C  Odd n.  Mirror image of bx0 using the other half of the register set.
92L(bx1):	rldicl. r9, n, 63, 63	C cr0 := bit 1 of n (n = 1 vs 3 mod 4)
93
94	ld	r29, 0(rp)
95	ld	r9, 0(up)
96	ld	r10, 8(rp)
97	ld	r8, 8(up)
98	maddld(	r27, r9, v0, r29)	C lo(up[0]*v0 + rp[0])
99	maddhdu(r30, r9, v0, r29)	C hi(up[0]*v0 + rp[0])
100	ld	r29, 16(rp)
101	mulld	r12, r9, v1
102	mulhdu	r11, r9, v1
103	bne	cr0, L(b11)
104
C  n = 1 (mod 4): enter the loop at L(lo1).
105L(b01):	addi	rp, rp, -16
106	b	L(lo1)
C  n = 3 (mod 4): for n == 3 fall straight into the wind-down.
107L(b11):	addi	up, up, 16
108	blt	cr7, L(end)
109
C  Main loop, 4 limbs per iteration.  Each quarter computes one up[i] times
C  {v0,v1}, feeds the v0 product into the CA chain (adde) and the v1
C  product into the OV chain (addex), and stores one finished limb.  The
C  running sum threads through r0; r26/r27 and r5/r12 alternate as the
C  low-product holders, r30/r31 and r10/r11 as the high-product holders.
C  The numeric end-of-line annotations are the original author's
C  scheduling notes (presumably issue cycles -- not re-verified here).
110L(top):	ld	r9, 0(up)
111	maddld(	r26, r8, v0, r10)	C 0  4   -> adde
112	maddhdu(r31, r8, v0, r10)	C 1  5
113	adde	r0, r27, r0		C    7 11
114	ld	r28, 24(rp)
115	std	r0, 0(rp)
116	maddld(	r5, r8, v1, r29)	C 1  5   -> addex
117	maddhdu(r10, r8, v1, r29)	C 2  6
118	addex(	r0, r12, r30, 0)	C    8 12
119L(lo2):	ld	r8, 8(up)
120	maddld(	r27, r9, v0, r11)	C 1  5   -> adde
121	maddhdu(r30, r9, v0, r11)	C 2  6
122	adde	r0, r26, r0		C    8 12
123	ld	r29, 32(rp)
124	std	r0, 8(rp)
125	maddld(	r12, r9, v1, r28)	C 2  6   -> addex
126	maddhdu(r11, r9, v1, r28)	C 3  7
127	addex(	r0, r5, r31, 0)		C 5  9 13
128L(lo1):	ld	r9, 16(up)
129	maddld(	r26, r8, v0, r10)	C 2  6   -> adde
130	maddhdu(r31, r8, v0, r10)	C 3  7
131	adde	r0, r27, r0		C    5  9 13
132	ld	r28, 40(rp)
133	std	r0, 16(rp)
134	maddld(	r5, r8, v1, r29)	C 3  7   -> addex
135	maddhdu(r10, r8, v1, r29)	C 4  8
136	addex(	r0, r12, r30, 0)	C    6 10
137L(lo0):	ld	r8, 24(up)
138	maddld(	r27, r9, v0, r11)	C 3  7   -> adde
139	maddhdu(r30, r9, v0, r11)	C 4  8
140	adde	r0, r26, r0		C    6 10
141	ld	r29, 48(rp)
142	std	r0, 24(rp)
143	maddld(	r12, r9, v1, r28)	C 4  8   -> addex
144	maddhdu(r11, r9, v1, r28)	C 5  9
145	addex(	r0, r5, r31, 0)		C    7 11
146	addi	up, up, 32
147	addi	rp, rp, 32
148	bdnz	L(top)
149
C  Wind-down: the last two up[] limbs' products, then fold both carry
C  chains (CA and OV) into the high limb for the return value.
150L(end):	ld	r9, 0(up)
151	maddld(	r26, r8, v0, r10)	C 0  4
152	maddhdu(r31, r8, v0, r10)	C 1  5
153	adde	r0, r27, r0		C    7 11
154	std	r0, 0(rp)		C		-4
155	maddld(	r5, r8, v1, r29)	C 1  5
156	maddhdu(r10, r8, v1, r29)	C 2  6
157	addex(	r0, r12, r30, 0)	C    8 12
158L(cj2):	maddld(	r27, r9, v0, r11)	C 1  5		-2
159	maddhdu(r30, r9, v0, r11)	C 2  6		-1
160	adde	r0, r26, r0		C    8 12	-3
161	std	r0, 8(rp)		C		-3
162	mulld	r12, r9, v1		C 2  6		-1
163	mulhdu	r11, r9, v1		C 3  7		0 = return limb
164	addex(	r0, r5, r31, 0)		C 5  9 13
165	adde	r0, r27, r0		C    5  9 13	-2
166	std	r0, 16(rp)		C		-2
167	addex(	r0, r12, r30, 0)	C    6 10	-1
168	adde	r0, r0, r10		C		-1
169	std	r0, 24(rp)		C		-1
170	li	r4, 0			C zero source for the final addex
171	addze	r3, r11			C fold CA chain into the return limb
172	addex(	r3, r3, r4, 0)		C fold OV chain; r3 = return value
173
C  Restore callee-saved registers and return.
174L(ret):	ld	r26, -48(r1)
175	ld	r27, -40(r1)
176	ld	r28, -32(r1)
177	ld	r29, -24(r1)
178	ld	r30, -16(r1)
179	ld	r31, -8(r1)
180	blr
181EPILOGUE()
182ASM_END()
183