xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/alpha/ev6/nails/mul_1.asm (revision af56d1fe9956bd7c616e18c1b7f025f464618471)
1dnl  Alpha ev6 nails mpn_mul_1.
2
3dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C      cycles/limb
23C EV4:    42
24C EV5:    18
25C EV6:     3.25
26
27C TODO
28C  * Reroll loop for 3.0 c/l with current 4-way unrolling.
29C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
30C    umulh.
31C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
32C    and would work since the loop structure is really regular.
33
34C  INPUT PARAMETERS
35define(`rp',`r16')		C destination pointer (base of every stq)
36define(`up',`r17')		C source pointer (base of every ldq)
37define(`n', `r18')		C limb count, also used as the loop counter
38define(`vl0',`r19')		C multiplier limb
39
C  Set up at entry as all-ones shifted right by NAIL_BITS and ANDed with
C  every sum before it is stored.
40define(`numb_mask',`r6')
41
C  Product halves for the four unrolled slots.  The code writes mKa with
C  mulq (low 64 bits) and mKb with umulh (high 64 bits).  r0 (m0a) also
C  receives the return value at exit, and r21 (m2b) doubles as a scratch
C  flag during the initial dispatch.
42define(`m0a',`r0')
43define(`m0b',`r1')
44define(`m1a',`r2')
45define(`m1b',`r3')
46define(`m2a',`r20')
47define(`m2b',`r21')
48define(`m3a',`r22')
49define(`m3b',`r23')
50
C  Two running sums used alternately for consecutive result limbs.  r25
C  (acc0) also holds n mod 4 during the initial dispatch.
51define(`acc0',`r25')
52define(`acc1',`r27')
53
C  Source limbs loaded from up.  Only two registers back the four names:
C  ul2 shares r4 with ul0 and ul3 shares r5 with ul1.
54define(`ul0',`r4')
55define(`ul1',`r5')
56define(`ul2',`r4')
57define(`ul3',`r5')
58
C  rl0-rl3 all name r24.  None of them is referenced by the code in this
C  file.
59define(`rl0',`r24')
60define(`rl1',`r24')
61define(`rl2',`r24')
62define(`rl3',`r24')
63
C  t0 receives a low product half shifted right by NAIL_BITS; t1 receives a
C  running sum shifted right by NUMB_BITS (the carry into the next limb).
64define(`t0',`r7')
65define(`t1',`r8')
66
C  Short aliases.  GMP_NAIL_BITS and GMP_NUMB_BITS are not defined in this
C  file (presumably they come from config.m4 -- confirm).
67define(`NAIL_BITS',`GMP_NAIL_BITS')
68define(`NUMB_BITS',`GMP_NUMB_BITS')
69
C  NOTE(review): judging by the macro name and argument, the declaration
C  below advertises support for nail sizes of 1 to 63 bits; the macro is
C  defined outside this file.
70dnl  This declaration is munged by configure
71NAILS_SUPPORT(1-63)
72
73ASM_START()
C  mpn_mul_1 -- multiply the n limbs at up by the single limb vl0, store n
C  result limbs at rp and return the carry-out limb.
C
C  In:   rp (r16) = destination, up (r17) = source, n (r18) = limb count,
C        vl0 (r19) = multiplier limb.
C  Out:  r0 = high half of the last product plus the last carry.
C  Writes r0-r8, r20-r23, r25, r27 and r28; returns through r26.  The
C  instructions shown here never reference the stack.
C
C  Per-limb arithmetic (i = limb index):
C    lo_i  = mulq(vl0, up[i]) >> NAIL_BITS
C    hi_i  = umulh(vl0, up[i])
C    sum_i = lo_i + hi_(i-1) + (sum_(i-1) >> NUMB_BITS)
C    rp[i] = sum_i AND numb_mask
C  with hi and the carry taken as zero for the first limb.  vl0 is shifted
C  left by NAIL_BITS first, so lo/hi split the true product at bit
C  NUMB_BITS; this assumes NAIL_BITS + NUMB_BITS equals the 64-bit register
C  width (both constants are defined outside this file -- confirm).
C
C  Structure: dispatch on n mod 4 into one of four feed-in sequences, each
C  of which either falls into the wind-down code directly (small n) or
C  enters the 4-way unrolled loop at a matching entry point.
C
C  NOTE(review): n = 0 is not handled.  n mod 4 == 0 selects L(0m4), which
C  unconditionally loads and multiplies four limbs.  Callers presumably
C  guarantee n >= 1 -- confirm.
74PROLOGUE(mpn_mul_1)
C  Pre-shift the multiplier and build numb_mask = all-ones >> NAIL_BITS
C  (lda with the always-zero r31 as base yields -1).
75	sll	vl0, NAIL_BITS, vl0
76	lda	numb_mask, -1(r31)
77	srl	numb_mask, NAIL_BITS, numb_mask
78
C  Dispatch on n mod 4.  r25 and r21 are free here; they are reused later
C  as acc0 and m2b.  n mod 4 == 3 falls through.
79	and	n,	3,	r25
80	cmpeq	r25,	1,	r21
81	bne	r21,	L(1m4)
82	cmpeq	r25,	2,	r21
83	bne	r21,	L(2m4)
84	beq	r25,	L(0m4)
85
C  n mod 4 == 3.  Three limbs are loaded and rp is biased by -8, so for
C  n == 3 the wind-down stores at 8, 16 and 24(rp) land on the first three
C  result limbs.  After n -= 4, n >= 0 means at least 7 limbs.
86L(3m4):	ldq	ul3,	0(up)
87	lda	n,	-4(n)
88	ldq	ul0,	8(up)
89	mulq	vl0,	ul3,	m3a
90	umulh	vl0,	ul3,	m3b
91	ldq	ul1,	16(up)
92	lda	up,	24(up)
93	lda	rp,	-8(rp)
94	mulq	vl0,	ul0,	m0a
95	umulh	vl0,	ul0,	m0b
96	bge	n,	L(ge3)
97
C  n == 3: finish the multiplies and go straight to the wind-down.  The
C  addq with r31 is a register move: the first limb has no earlier high
C  half or carry to add.
98	mulq	vl0,	ul1,	m1a
99	umulh	vl0,	ul1,	m1b
100	srl	m3a,NAIL_BITS,	t0
101	addq	t0,	r31,	acc1
102	srl	m0a,NAIL_BITS,	t0
103	addq	t0,	m3b,	acc0
104	srl	acc1,NUMB_BITS,	t1
105	br	r31,	L(ta3)
106
C  n >= 7: load three more limbs, start their multiplies and enter the loop
C  at L(el3).
107L(ge3):	ldq	ul2,	0(up)
108	mulq	vl0,	ul1,	m1a
109	umulh	vl0,	ul1,	m1b
110	srl	m3a,NAIL_BITS,	t0
111	ldq	ul3,	8(up)
112	lda	n,	-4(n)
113	mulq	vl0,	ul2,	m2a
114	addq	t0,	r31,	acc1
115	umulh	vl0,	ul2,	m2b
116	srl	m0a,NAIL_BITS,	t0
117	ldq	ul0,	16(up)
118	mulq	vl0,	ul3,	m3a
119	addq	t0,	m3b,	acc0
120	srl	acc1,NUMB_BITS,	t1
121	br	r31,	L(el3)
122
C  n mod 4 == 0.  Four limbs are loaded and rp is left unbiased.  n is
C  reduced by 8 here, so n >= 0 means at least 8 limbs.
123L(0m4):	lda	n,	-8(n)
124	ldq	ul2,	0(up)
125	ldq	ul3,	8(up)
126	mulq	vl0,	ul2,	m2a
127	umulh	vl0,	ul2,	m2b
128	ldq	ul0,	16(up)
129	mulq	vl0,	ul3,	m3a
130	umulh	vl0,	ul3,	m3b
131	ldq	ul1,	24(up)
132	lda	up,	32(up)
133	mulq	vl0,	ul0,	m0a
134	umulh	vl0,	ul0,	m0b
135	bge	n,	L(ge4)
136
C  n == 4: finish the last multiply and go to the wind-down at L(ta4).
137	srl	m2a,NAIL_BITS,	t0
138	mulq	vl0,	ul1,	m1a
139	addq	t0,	r31,	acc0
140	umulh	vl0,	ul1,	m1b
141	srl	m3a,NAIL_BITS,	t0
142	addq	t0,	m2b,	acc1
143	srl	acc0,NUMB_BITS,	t1
144	br	r31,	L(ta4)
145
C  n >= 8: load two more limbs and enter the loop at L(el0).
146L(ge4):	srl	m2a,NAIL_BITS,	t0
147	ldq	ul2,	0(up)
148	mulq	vl0,	ul1,	m1a
149	addq	t0,	r31,	acc0
150	umulh	vl0,	ul1,	m1b
151	srl	m3a,NAIL_BITS,	t0
152	ldq	ul3,	8(up)
153	lda	n,	-4(n)
154	mulq	vl0,	ul2,	m2a
155	addq	t0,	m2b,	acc1
156	srl	acc0,NUMB_BITS,	t1
157	br	r31,	L(el0)
158
C  n mod 4 == 2.  Two limbs are loaded and rp is biased by -16, so for
C  n == 2 the wind-down stores at 16 and 24(rp) land on the two result
C  limbs.  After n -= 4, n >= 0 means at least 6 limbs.
159L(2m4):	lda	n,	-4(n)
160	ldq	ul0,	0(up)
161	ldq	ul1,	8(up)
162	lda	up,	16(up)
163	lda	rp,	-16(rp)
164	mulq	vl0,	ul0,	m0a
165	umulh	vl0,	ul0,	m0b
166	bge	n,	L(ge2)
167
C  n == 2: finish the second multiply and go to the wind-down at L(ta2).
168	mulq	vl0,	ul1,	m1a
169	umulh	vl0,	ul1,	m1b
170	srl	m0a,NAIL_BITS,	t0
171	addq	t0,	r31,	acc0
172	srl	m1a,NAIL_BITS,	t0
173	addq	t0,	m0b,	acc1
174	srl	acc0,NUMB_BITS,	t1
175	br	r31,	L(ta2)
176
C  n >= 6: load four more limbs and advance up and rp by 32.  After the
C  second n -= 4, n >= 0 (at least 10 limbs) enters the loop at L(el2);
C  otherwise n == 6 and the wind-down is entered at L(ta6).
177L(ge2):	ldq	ul2,	0(up)
178	mulq	vl0,	ul1,	m1a
179	umulh	vl0,	ul1,	m1b
180	ldq	ul3,	8(up)
181	lda	n,	-4(n)
182	mulq	vl0,	ul2,	m2a
183	umulh	vl0,	ul2,	m2b
184	srl	m0a,NAIL_BITS,	t0
185	ldq	ul0,	16(up)
186	mulq	vl0,	ul3,	m3a
187	addq	t0,	r31,	acc0
188	umulh	vl0,	ul3,	m3b
189	srl	m1a,NAIL_BITS,	t0
190	ldq	ul1,	24(up)
191	lda	up,	32(up)
192	lda	rp,	32(rp)
193	mulq	vl0,	ul0,	m0a
194	addq	t0,	m0b,	acc1
195	srl	acc0,NUMB_BITS,	t1
196	bge	n,	L(el2)
197
198	br	r31,	L(ta6)
199
C  n mod 4 == 1.  One limb is loaded and rp is biased by -24.  After
C  n -= 4, n >= 0 means at least 5 limbs.
200L(1m4):	lda	n,	-4(n)
201	ldq	ul1,	0(up)
202	lda	up,	8(up)
203	lda	rp,	-24(rp)
204	bge	n,	L(ge1)
205
C  n == 1: complete single-limb case, handled inline.  The store at 24(rp)
C  undoes the -24 bias; r0 = carry + high half is the return value.
206	mulq	vl0,	ul1,	m1a
207	umulh	vl0,	ul1,	m1b
208	srl	m1a,NAIL_BITS,	t0
209	addq	t0,	r31,	acc1
210	and	acc1,numb_mask,	r28
211	srl	acc1,NUMB_BITS,	t1
212	stq	r28,	24(rp)
213	addq	t1,	m1b,	r0
214	ret	r31,	(r26),	1
215
C  n >= 5: load four more limbs and advance up and rp by 32.  After the
C  second n -= 4, n < 0 means n == 5 and the wind-down is entered at
C  L(ta5).
216L(ge1):	ldq	ul2,	0(up)
217	mulq	vl0,	ul1,	m1a
218	umulh	vl0,	ul1,	m1b
219	ldq	ul3,	8(up)
220	lda	n,	-4(n)
221	mulq	vl0,	ul2,	m2a
222	umulh	vl0,	ul2,	m2b
223	ldq	ul0,	16(up)
224	mulq	vl0,	ul3,	m3a
225	umulh	vl0,	ul3,	m3b
226	srl	m1a,NAIL_BITS,	t0
227	ldq	ul1,	24(up)
228	lda	up,	32(up)
229	lda	rp,	32(rp)
230	mulq	vl0,	ul0,	m0a
231	addq	t0,	r31,	acc1
232	umulh	vl0,	ul0,	m0b
233	srl	m2a,NAIL_BITS,	t0
234	mulq	vl0,	ul1,	m1a
235	addq	t0,	m1b,	acc0
236	srl	acc1,NUMB_BITS,	t1
237	blt	n,	L(ta5)
238
C  n >= 9 (reached by fall-through only): load the next limb and enter the
C  loop at L(el1).
239L(ge5):	ldq	ul2,	0(up)
240	br	r31,	L(el1)
241
C  Main loop: four limbs per iteration, stored at -24, -16, -8 and 0(rp).
C  up and rp advance by 32 and n drops by 4 per pass; the loop repeats
C  while n >= 0.  L(el2), L(el1), L(el0) and L(el3) are the entry points
C  used by the feed-in code above.  Each stage adds the previous carry
C  (t1) into a sum, masks it into r28 for the store, and starts the sum
C  for a later limb from the shifted low half (t0) plus the previous
C  limb's high half.
C  The trailing U0/U1/L0/L1 tags are the original authors' annotations
C  (presumably EV6 issue-slot assignments -- confirm); the unop lines fill
C  otherwise unused positions in each group of four.
242	ALIGN(16)
243L(top):	mulq	vl0,	ul0,	m0a		C U1
244	addq	t0,	m0b,	acc1		C L0
245	srl	acc0,NUMB_BITS,	t1		C U0
246	stq	r28,	-24(rp)			C L1
247C
248L(el2):	umulh	vl0,	ul0,	m0b		C U1
249	and	acc0,numb_mask,	r28		C L0
250	unop					C U0
251	unop					C L1
252C
253	unop					C U1
254	addq	t1,	acc1,	acc1		C L0
255	srl	m2a,NAIL_BITS,	t0		C U0
256	ldq	ul2,	0(up)			C L1
257C
258	mulq	vl0,	ul1,	m1a		C U1
259	addq	t0,	m1b,	acc0		C L0
260	srl	acc1,NUMB_BITS,	t1		C U0
261	stq	r28,	-16(rp)			C L1
262C
263L(el1):	umulh	vl0,	ul1,	m1b		C U1
264	and	acc1,numb_mask,	r28		C L0
265	unop					C U0
266	lda	n,	-4(n)			C L1
267C
268	unop					C U1
269	addq	t1,	acc0,	acc0		C L0
270	srl	m3a,NAIL_BITS,	t0		C U0
271	ldq	ul3,	8(up)			C L1
272C
273	mulq	vl0,	ul2,	m2a		C U1
274	addq	t0,	m2b,	acc1		C L0
275	srl	acc0,NUMB_BITS,	t1		C U0
276	stq	r28,	-8(rp)			C L1
277C
278L(el0):	umulh	vl0,	ul2,	m2b		C U1
279	and	acc0,numb_mask,	r28		C L0
280	unop					C U0
281	unop					C L1
282C
283	unop					C U1
284	addq	t1,	acc1,	acc1		C L0
285	srl	m0a,NAIL_BITS,	t0		C U0
286	ldq	ul0,	16(up)			C L1
287C
288	mulq	vl0,	ul3,	m3a		C U1
289	addq	t0,	m3b,	acc0		C L0
290	srl	acc1,NUMB_BITS,	t1		C U0
291	stq	r28,	0(rp)			C L1
292C
293L(el3):	umulh	vl0,	ul3,	m3b		C U1
294	and	acc1,numb_mask,	r28		C L0
295	unop					C U0
296	unop					C L1
297C
298	unop					C U1
299	addq	t1,	acc0,	acc0		C L0
300	srl	m1a,NAIL_BITS,	t0		C U0
301	ldq	ul1,	24(up)			C L1
302C
303	lda	up,	32(up)			C L0
304	unop					C U1
305	lda	rp,	32(rp)			C L1
306	bge	n,	L(top)			C U0
307
C  Wind-down (L(end) is reached by fall-through from the loop).  Same
C  sum/mask/store sequence as the loop body, without loads or loop control,
C  and with the only remaining multiplies being those of the last two
C  loaded limbs (ul0, ul1).  L(ta6) down to L(ta2) are entry points for the
C  short-count feed-in paths; the number in each label matches the count of
C  result limbs still to be stored from that point.
308L(end):	mulq	vl0,	ul0,	m0a
309	addq	t0,	m0b,	acc1
310	srl	acc0,NUMB_BITS,	t1
311	stq	r28,	-24(rp)
312L(ta6):	umulh	vl0,	ul0,	m0b
313	and	acc0,numb_mask,	r28
314	addq	t1,	acc1,	acc1
315	srl	m2a,NAIL_BITS,	t0
316	mulq	vl0,	ul1,	m1a
317	addq	t0,	m1b,	acc0
318	srl	acc1,NUMB_BITS,	t1
319	stq	r28,	-16(rp)
320L(ta5):	umulh	vl0,	ul1,	m1b
321	and	acc1,numb_mask,	r28
322	addq	t1,	acc0,	acc0
323	srl	m3a,NAIL_BITS,	t0
324	addq	t0,	m2b,	acc1
325	srl	acc0,NUMB_BITS,	t1
326	stq	r28,	-8(rp)
327	ALIGN(16)
328L(ta4):	and	acc0,numb_mask,	r28
329	addq	t1,	acc1,	acc1
330	srl	m0a,NAIL_BITS,	t0
331	addq	t0,	m3b,	acc0
332	srl	acc1,NUMB_BITS,	t1
333	stq	r28,	0(rp)
334	unop
335	ALIGN(16)
336L(ta3):	and	acc1,numb_mask,	r28
337	addq	t1,	acc0,	acc0
338	srl	m1a,NAIL_BITS,	t0
339	addq	t0,	m0b,	acc1
340	srl	acc0,NUMB_BITS,	t1
341	stq	r28,	8(rp)
342	unop
343	ALIGN(16)
C  Last two limbs.  The final carry (t1) plus the high half of the last
C  product (m1b) is placed in r0 as the return value.
344L(ta2):	and	acc0,numb_mask,	r28
345	addq	t1,	acc1,	acc1
346	srl	acc1,NUMB_BITS,	t1
347	stq	r28,	16(rp)
348	and	acc1,numb_mask,	r28
349	addq	t1,	m1b,	r0
350	stq	r28,	24(rp)
351	ret	r31,	(r26),	1
352EPILOGUE()
353ASM_END()
354