dnl  Power9 mpn_mul_basecase.

dnl  Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C POWER3/PPC630          -
C POWER4/PPC970          -
C POWER5                 -
C POWER6                 -
C POWER7                 -
C POWER8                 -
C POWER9                 1.62

C TODO
C  * Check if (inner) loop alignment affects performance.
C  * Could we schedule loads less aggressively in addmul_2/mul_2? That would
C    save some registers and make the tail code more manageable.
C  * Postpone some register saves to the main loop.
C  * Perhaps write more code for small operands (3x1, 3x2, 3x3).
C  * Consider restoring rp,up after the loop using arithmetic, eliminating
C    rp2, up2.  On the other hand, the current rp,up restore registers are
C    useful for OSP.
C  * Do OSP. This should save a lot with the current deep addmul_2 pipeline.

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`un', `r5')
define(`vp', `r6')
define(`vn', `r7')

define(`v0', `r0')
define(`v1', `r7')
define(`rp2', `r24')
define(`up2', `r25')

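C For reference, mpn_mul_basecase computes {rp, un+vn} = {up, un} * {vp, vn},
C requiring un >= vn >= 1 and a result area disjoint from the operands.  A
C minimal C sketch of the schoolbook scheme this file implements (types and
C the mpn_mul_1/mpn_addmul_1 prototypes as in gmp.h; the function name is
C illustrative only):
C
C   void ref_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
C                          mp_srcptr vp, mp_size_t vn)
C   {
C     rp[un] = mpn_mul_1 (rp, up, un, vp[0]);      /* first row: mul    */
C     for (mp_size_t i = 1; i < vn; i++)           /* later rows: addmul */
C       rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C   }
C
C The code below instead consumes two v limbs per outer pass (a mul_1 or
C mul_2 feed-in, then pipelined addmul_2 passes), which is what the block
C structure and tail cases below reflect.
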
ASM_START()
PROLOGUE(mpn_mul_basecase)
	cmpdi	cr0, un, 2
	bgt	cr0, L(un_gt2)
	cmpdi	cr6, vn, 1
	ld	r7, 0(vp)
	ld	r5, 0(up)
	mulld	r8, r5, r7	C weight 0
	mulhdu	r9, r5, r7	C weight 1
	std	r8, 0(rp)
	beq	cr0, L(2x)
	std	r9, 8(rp)
	blr
	ALIGN(16)
L(2x):	ld	r0, 8(up)
	mulld	r8, r0, r7	C weight 1
	mulhdu	r10, r0, r7	C weight 2
	addc	r9, r9, r8
	addze	r10, r10
	bne	cr6, L(2x2)
	std	r9, 8(rp)
	std	r10, 16(rp)
	blr
	ALIGN(16)
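C The 2x2 case below gathers four 64x64-bit partial products by weight,
C with B = 2^64:
C   {u1,u0} * {v1,v0} = u0*v0 + (u0*v1 + u1*v0)*B + u1*v1*B^2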
L(2x2):	ld	r6, 8(vp)
	mulld	r8, r5, r6	C weight 1
	mulhdu	r11, r5, r6	C weight 2
	addc	r9, r9, r8
	std	r9, 8(rp)
	adde	r11, r11, r10
	mulld	r12, r0, r6	C weight 2
	mulhdu	r0, r0, r6	C weight 3
	addze	r0, r0
	addc	r11, r11, r12
	addze	r0, r0
	std	r11, 16(rp)
	std	r0, 24(rp)
	blr

L(un_gt2):
	std	r22, -80(r1)
	std	r23, -72(r1)
	std	r24, -64(r1)
	std	r25, -56(r1)
	std	r26, -48(r1)
	std	r27, -40(r1)
	std	r28, -32(r1)
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)
	mr	rp2, r3			C rp
	mr	up2, r4			C up
	srdi	r22, r5, 2		C un >> 2
	subfic	r23, r7, 0		C -vn, clear CA
	subfo	r0, r0, r0		C clear OV (and r0)
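
C Two independent carry chains are kept live below: the CA bit (cleared by
C the subfic above, then used by adde/addze) and the OV bit (cleared by the
C subfo above, then used by addex).  Presumably this is what lets the
C addmul_2 pipeline accumulate two partial-product streams without
C serializing on a single carry flag.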

	cmpdi	cr6, un, 3
	rldicl	r0, un, 0, 63		C r0 = un & 1
	cmpdi	cr7, r0, 0
	rldicl	r0, un, 63, 63		C FIXME: unused for vn = 1
	cmpdi	cr5, r0, 0		C FIXME: unused for vn = 1

	ld	v0, 0(vp)
	rldicl.	r9, vn, 0, 63
	beq	cr0, L(vn_evn)

L(vn_odd):
	addi	r10, un, -2
	ld	r5, 0(up)
	srdi	r10, r10, 1
	mtctr	r10
	bne	cr7, L(m1_b1)

L(m1_b0):
	ld	r10, 8(up)
	mulld	r9, r5, v0
	mulhdu	r11, r5, v0
	ld	r12, 16(up)
	mulld	r8, r10, v0
	mulhdu	r5, r10, v0
	addi	rp, rp, -8
	b	L(m1_mid)

L(m1_b1):
	ld	r12, 8(up)
	mulld	r8, r5, v0
	mulhdu	r5, r5, v0
	ld	r10, 16(up)
	mulld	r9, r12, v0
	mulhdu	r11, r12, v0
	addi	up, up, 8
	beq	cr6, L(m1_end)		C jump taken means un = 3, vn = {1,3}

	ALIGN(16)
L(m1_top):
	ld	r12, 16(up)
	std	r8, 0(rp)
	adde	r9, r5, r9
	mulld	r8, r10, v0
	mulhdu	r5, r10, v0
L(m1_mid):
	ld	r10, 24(up)
	std	r9, 8(rp)
	adde	r8, r11, r8
	mulld	r9, r12, v0
	mulhdu	r11, r12, v0
	addi	rp, rp, 16
	addi	up, up, 16
	bdnz	L(m1_top)

L(m1_end):
	std	r8, 0(rp)
	mulld	r8, r10, v0
	adde	r9, r5, r9
	mulhdu	r5, r10, v0
	std	r9, 8(rp)
	adde	r8, r11, r8
	std	r8, 16(rp)
	addze	r10, r5
	std	r10, 24(rp)

	addi	rp2, rp2, 8
	addi	vp, vp, 8
	addic.	r23, r23, 1
	b	L(do_outer)

L(vn_evn):
	ld	v1, 8(vp)
	addi	r23, r23, 2
	mtctr	r22
	bne	cr7, L(m2_bx1)

L(m2_bx0):
	ld	r8, 0(up)
	ld	r9, 8(up)
	li	r11, 0
	mulld	r28, r8, v0
	mulhdu	r31, r8, v0
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	li	r12, 0
	bne	cr5, L(m2_b10)

L(m2_b00):
	addi	up, up, -8
	addi	rp, rp, -24
	b	L(m2_lo0)

L(m2_b10):
	addi	up, up, 8
	addi	rp, rp, -8
	b	L(m2_lo2)

L(m2_bx1):
	ld	r9, 0(up)
	ld	r8, 8(up)
	li	r10, 0
	mulld	r29, r9, v0
	mulhdu	r30, r9, v0
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	li	r5, 0
	bne	cr5, L(m2_b11)

L(m2_b01):
	addi	rp, rp, -16
	b	L(m2_lo1)
L(m2_b11):
	addi	up, up, 16
	beq	cr6, L(m2_end)		C taken means un = 3, vn = 2. We're done.

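C The maddld/maddhdu/addex macros below map to POWER9 (ISA 3.0)
C instructions; their effect, as used here:
C   maddld( rt, ra, rb, rc)   rt = low  64 bits of ra * rb + rc
C   maddhdu(rt, ra, rb, rc)   rt = high 64 bits of ra * rb + rc (unsigned)
C   addex(  rt, ra, rb, 0)    rt = ra + rb + OV, carry-out back into OV
C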
L(m2_top):
	ld	r9, 0(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	std	r5, 0(rp)
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	addex(	r12, r12, r30, 0)
L(m2_lo2):
	ld	r8, 8(up)
	maddld(	r29, r9, v0, r11)
	maddhdu(r30, r9, v0, r11)
	adde	r12, r28, r12
	std	r12, 8(rp)
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	addex(	r5, r5, r31, 0)
L(m2_lo1):
	ld	r9, 16(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	std	r5, 16(rp)
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	addex(	r12, r12, r30, 0)
L(m2_lo0):
	ld	r8, 24(up)
	maddld(	r29, r9, v0, r11)
	maddhdu(r30, r9, v0, r11)
	adde	r12, r28, r12
	std	r12, 24(rp)
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	addex(	r5, r5, r31, 0)
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(m2_top)

L(m2_end):
	ld	r9, 0(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	std	r5, 0(rp)
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	b	L(cj)

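C Outer loop: each pass adds up[] times the next two v limbs into the
C result via a 4-way unrolled addmul_2; r23 counts upward from -vn in
C steps of 2, and rp2/vp advance by 16 bytes per pass.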
L(outer):
	ld	v0, 0(vp)
	ld	v1, 8(vp)
	addi	r23, r23, 2
	mtctr	r22
	bne	cr7, L(bx1)

L(bx0):	ld	r26, 0(rp2)
	ld	r8, 0(up2)
	ld	r11, 8(rp2)
	ld	r9, 8(up2)
	maddld(	r28, r8, v0, r26)
	maddhdu(r31, r8, v0, r26)
	ld	r26, 16(rp2)
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	li	r12, 0
	bne	cr5, L(b10)

L(b00):	addi	up, up2, -8
	addi	rp, rp2, -24
	b	L(lo0)

L(b10):	addi	up, up2, 8
	addi	rp, rp2, -8
	b	L(lo2)

L(bx1):	ld	r27, 0(rp2)
	ld	r9, 0(up2)
	ld	r10, 8(rp2)
	ld	r8, 8(up2)
	maddld(	r29, r9, v0, r27)
	maddhdu(r30, r9, v0, r27)
	ld	r27, 16(rp2)
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	li	r5, 0
	bne	cr5, L(b11)

L(b01):	addi	up, up2, 0
	addi	rp, rp2, -16
	b	L(lo1)
L(b11):	addi	up, up2, 16
	addi	rp, rp2, 0
	beq	cr6, L(end)		C taken means un = 3, vn = 3. We're done.

L(top):	ld	r9, 0(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	ld	r26, 24(rp)
	std	r5, 0(rp)
	maddld(	r5, r8, v1, r27)
	maddhdu(r10, r8, v1, r27)
	addex(	r12, r12, r30, 0)
L(lo2):	ld	r8, 8(up)
	maddld(	r29, r9, v0, r11)
	maddhdu(r30, r9, v0, r11)
	adde	r12, r28, r12
	ld	r27, 32(rp)
	std	r12, 8(rp)
	maddld(	r12, r9, v1, r26)
	maddhdu(r11, r9, v1, r26)
	addex(	r5, r5, r31, 0)
L(lo1):	ld	r9, 16(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	ld	r26, 40(rp)
	std	r5, 16(rp)
	maddld(	r5, r8, v1, r27)
	maddhdu(r10, r8, v1, r27)
	addex(	r12, r12, r30, 0)
L(lo0):	ld	r8, 24(up)
	maddld(	r29, r9, v0, r11)
	maddhdu(r30, r9, v0, r11)
	adde	r12, r28, r12
	ld	r27, 48(rp)
	std	r12, 24(rp)
	maddld(	r12, r9, v1, r26)
	maddhdu(r11, r9, v1, r26)
	addex(	r5, r5, r31, 0)
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(top)

L(end):	ld	r9, 0(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	std	r5, 0(rp)
	maddld(	r5, r8, v1, r27)
	maddhdu(r10, r8, v1, r27)
L(cj):	addex(	r12, r12, r30, 0)
	maddld(	r29, r9, v0, r11)
	maddhdu(r30, r9, v0, r11)
	adde	r12, r28, r12
	std	r12, 8(rp)
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	addex(	r5, r5, r31, 0)
	adde	r5, r29, r5
	std	r5, 16(rp)
	addex(	r12, r12, r30, 0)
	adde	r12, r12, r10
	std	r12, 24(rp)
	li	r4, 0
	addze	r5, r11
	addex(	r5, r5, r4, 0)
	std	r5, 32(rp)

	cmpdi	cr0, r23, 0
	addi	rp2, rp2, 16
	addi	vp, vp, 16
L(do_outer):
	bne	cr0, L(outer)
L(ret):
	ld	r22, -80(r1)
	ld	r23, -72(r1)
	ld	r24, -64(r1)
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr
EPILOGUE()
ASM_END()