dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/mul_1.asm (revision 96fc3e30a7c3f7bba53384bf41dad5f78306fac4)
dnl  PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C POWER3/PPC630:     6-18
C POWER4/PPC970:     7.25
C POWER5:            7.75

C TODO
C  * Try to reduce the number of needed live registers (at least r5 and r10
C    could be combined)
C  * Optimize feed-in code, for speed and size.
C  * Clean up r12/r7 usage in feed-in code.

C INPUT PARAMETERS
C  rp  r3  destination limb vector (result limbs are stored here)
C  up  r4  source limb vector (limbs are loaded from here)
C  n   r5  limb count; only used to derive n mod 4 and the ctr value, after
C          which r5 is reused as a scratch register by the multiply code
C  vl  r6  multiplier limb (the code below refers to it directly as r6)
C  The mpn_mul_1c entry point additionally reads a carry-in limb from r7.
define(`rp', `r3')
define(`up', `r4')
define(`n', `r5')
define(`vl', `r6')

ASM_START()
C mpn_mul_1c -- same operation as mpn_mul_1, but starting from a carry-in
C limb supplied in r7 instead of zero.
C
C This entry only saves r26/r27 (at negative offsets from r1, without moving
C r1), copies the carry-in into r12 where the shared code keeps cy_limb, and
C branches to L(ent) inside mpn_mul_1.  The shared code restores r26/r27 and
C returns to the caller, so control never comes back here.
PROLOGUE(mpn_mul_1c)
	std	r27, -40(r1)	C save r27; reloaded at L(ret)
	std	r26, -48(r1)	C save r26; reloaded at L(ret)
	mr	r12, r7		C cy_limb = carry-in argument
	b	L(ent)		C join the common code in mpn_mul_1
EPILOGUE()
C mpn_mul_1 -- multiply the n limbs at up by the limb in r6, store the n low
C result limbs at rp and return the final carry-out limb in r3.
C
C Register usage:
C   r26, r27          source limbs loaded from up; saved on entry at negative
C                     offsets from r1 and reloaded at L(ret)
C   r12               carry limb: cy_limb on entry, afterwards the high
C                     product word still to be added into the next result limb
C   r0, r7, r9, r11   low product words (mulld), then the limbs stored to rp
C   r5, r8, r10       high product words (mulhdu); r5 is free for this since
C                     n has already been moved to ctr
C   ctr               (n + 3) / 4
C   CA                carry bit chained from limb to limb by addc/adde; none
C                     of the interleaved mulld/mulhdu/ld/std/addi/branch
C                     instructions modify it
C
C Structure: the feed-in code (L(b00), L(b01), L(b10), L(b11)) handles 2, 1
C or 3, 0 and 1 limbs respectively according to n mod 4, so that what remains
C is two limbs plus a multiple of four.  L(top) handles four limbs per
C iteration and L(end) handles the last two.
C
C The first limb is loaded unconditionally, so n is assumed to be at least 1
C -- TODO confirm against the mpn interface contract.
PROLOGUE(mpn_mul_1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	li	r12, 0		C cy_limb = 0
C Common entry; mpn_mul_1c branches here with cy_limb already in r12.
L(ent):	ld	r26, 0(up)

C Dispatch on n mod 4 and set ctr = (n + 3) / 4.  Note that addic also writes
C CA.  On the L(b10) path nothing else sets CA before the first adde, so that
C adde sees the carry out of n + 3, which is zero unless n + 3 wraps around.
	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addic	n, n, 3		C compute count...
	srdi	n, n, 2		C ...for ctr
	mtctr	n		C copy count into ctr
	beq	cr0, L(b00)
	blt	cr6, L(b01)
	beq	cr6, L(b10)

C n mod 4 = 3: handle one limb here, then load the next two at L(fic).
C cy_limb is moved to r7 first because mulhdu overwrites r12 with the new
C high word; the CA from addc is consumed by the first adde that follows.
L(b11):	mr	r7, r12
	mulld	r0, r26, r6
	mulhdu	r12, r26, r6
	addi	up, up, 8
	addc	r0, r0, r7
	std	r0, 0(rp)
	addi	rp, rp, 8
	b	L(fic)

C n mod 4 = 0: handle two limbs here.  addze folds CA into the second high
C word, which becomes the carry limb in r12 for the code after L(fic).
L(b00):	ld	r27, 8(up)
	addi	up, up, 16
	mulld	r0, r26, r6
	mulhdu	r5, r26, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	addc	r0, r0, r12
	adde	r7, r7, r5
	addze	r12, r8
	std	r0, 0(rp)
	std	r7, 8(rp)
	addi	rp, rp, 16
	b	L(fic)

	nop			C alignment
C n mod 4 = 1: bdnz decrements ctr and branches to L(gt1) if it is still
C nonzero (n >= 5).  Otherwise n = 1: a single product is formed and L(ret)
C adds CA to its high word in r8 to produce the return value.
L(b01):	bdnz	L(gt1)
	mulld	r0, r26, r6
	mulhdu	r8, r26, r6
	addc	r0, r0, r12
	std	r0, 0(rp)
	b	L(ret)
C n mod 4 = 1 and n >= 5: handle three limbs here, leaving the new carry
C limb in r12 as in L(b00).
L(gt1):	ld	r27, 8(up)
	nop
	mulld	r0, r26, r6
	mulhdu	r5, r26, r6
	ld	r26, 16(up)
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	mulld	r9, r26, r6
	mulhdu	r10, r26, r6
	addc	r0, r0, r12
	adde	r7, r7, r5
	adde	r9, r9, r8
	addze	r12, r10
	std	r0, 0(rp)
	std	r7, 8(rp)
	std	r9, 16(rp)
	addi	up, up, 24
	addi	rp, rp, 24
	b	L(fic)

	nop
C Feed-in complete: load the next two source limbs into r26/r27.  The
C n mod 4 = 2 case enters at L(b10) because r26 was already loaded at L(ent).
C bdz decrements ctr and skips the loop when only the final two limbs remain.
L(fic):	ld	r26, 0(up)
L(b10):	ld	r27, 8(up)
	addi	up, up, 16
	bdz	L(end)

C Main loop, four limbs per iteration.  On entry r26/r27 hold two loaded
C limbs and r12 plus CA hold the pending carry.  Each limb register is
C reloaded right after its multiplies, so the loop exits with the last two
C limbs in r26/r27 and the pending carry again in r12 plus CA.  The nops are
C presumably there for instruction grouping -- TODO confirm.
L(top):	mulld	r0, r26, r6
	mulhdu	r5, r26, r6
	ld	r26, 0(up)
	nop

	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r27, 8(up)
	nop

	adde	r0, r0, r12
	adde	r7, r7, r5

	mulld	r9, r26, r6
	mulhdu	r10, r26, r6
	ld	r26, 16(up)
	nop

	mulld	r11, r27, r6
	mulhdu	r12, r27, r6
	ld	r27, 24(up)

	std	r0, 0(rp)
	adde	r9, r9, r8
	std	r7, 8(rp)
	adde	r11, r11, r10
	std	r9, 16(rp)
	addi	up, up, 32
	std	r11, 24(rp)

	addi	rp, rp, 32
	bdnz	L(top)

C Wind-down: multiply the last two limbs held in r26/r27 and store them.
L(end):	mulld	r0, r26, r6
	mulhdu	r5, r26, r6

	mulld	r7, r27, r6
	mulhdu	r8, r27, r6

	adde	r0, r0, r12
	adde	r7, r7, r5

	std	r0, 0(rp)
	std	r7, 8(rp)
C Return value = last high word + CA; then reload the saved r27/r26.
L(ret):	addze	r3, r8
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	blr
EPILOGUE()
