xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/alpha/ev6/nails/submul_1.asm (revision d909946ca08dceb44d7d0f22ec9488679695d976)
1dnl  Alpha ev6 nails mpn_submul_1.
2
3dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C      cycles/limb
23C EV4:    42
24C EV5:    18
25C EV6:     4
26
27C TODO
28C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
29C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
30C    umulh.
31C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
32C    and would work since the loop structure is really regular.
33
34C  INPUT PARAMETERS
C  The four inputs, by the register each one is read from.  In the code
C  below rp and up are used as base addresses for 8-byte loads and stores,
C  n as a limb count and vl0 as the multiplier.
35define(`rp',`r16')
36define(`up',`r17')
37define(`n', `r18')
38define(`vl0',`r19')
39
C  Mask built at function entry as all ones shifted right by NAIL_BITS.
40define(`numb_mask',`r6')
41
C  Product registers for the four unrolled limb slots 0..3: mNa receives
C  the mulq result and mNb the umulh result for slot N.
42define(`m0a',`r0')
43define(`m0b',`r1')
44define(`m1a',`r2')
45define(`m1b',`r3')
46define(`m2a',`r20')
47define(`m2b',`r21')
48define(`m3a',`r22')
49define(`m3b',`r23')
50
C  Two accumulators used alternately from one limb to the next.  Note that
C  r25 (acc0) and r21 (m2b) are also used as scratch by the n mod 4
C  dispatch at function entry, before they take on these roles.
51define(`acc0',`r25')
52define(`acc1',`r27')
53
C  Limbs loaded from up.  Only two physical registers are used: ul0 and
C  ul2 are both r4, ul1 and ul3 are both r5.
54define(`ul0',`r4')
55define(`ul1',`r5')
56define(`ul2',`r4')
57define(`ul3',`r5')
58
C  Limbs loaded from rp.  All four names are the same register (r24), so
C  each loaded value has to be used before the next rlN load.
59define(`rl0',`r24')
60define(`rl1',`r24')
61define(`rl2',`r24')
62define(`rl3',`r24')
63
C  Temporaries: t0 holds a mulq result shifted right by NAIL_BITS, t1
C  holds an accumulator shifted right arithmetically by NUMB_BITS.
64define(`t0',`r7')
65define(`t1',`r8')
66
C  Short names for the GMP_NAIL_BITS and GMP_NUMB_BITS constants.
67define(`NAIL_BITS',`GMP_NAIL_BITS')
68define(`NUMB_BITS',`GMP_NUMB_BITS')
69
70dnl  This declaration is munged by configure
71NAILS_SUPPORT(2-63)
72
73ASM_START()
C  mpn_submul_1, nails variant.
C
C  What the code below does for each limb u loaded from up and each limb r
C  loaded from the matching position of rp:
C
C      lo  = mulq (vl0 << NAIL_BITS, u) >> NAIL_BITS
C      hi  = umulh (vl0 << NAIL_BITS, u)
C      acc = r - (lo + hi of the previous limb) + t1
C      store (acc AND numb_mask) back to rp
C      t1  = acc >> NUMB_BITS   (arithmetic shift, so t1 is signed)
C
C  For the first limb the previous hi is r31 (zero) and no t1 is added.
C  The value left in r0 at each ret is m1b - t1, i.e. hi of the last limb
C  minus the final t1.
C
C  Structure: a dispatch on n mod 4 selects one of four feed-in sequences
C  (3m4, 0m4, 2m4, 1m4).  Each feed-in starts the first loads and
C  multiplies, biases rp so that the shared store offsets line up, and
C  then jumps either into the 4-way unrolled loop (labels el0..el3) or
C  into the wind-down code (labels ta2..ta6).  The code appears to assume
C  n >= 1; n == 0 is not handled separately -- confirm against callers.
74PROLOGUE(mpn_submul_1)
C  Pre-shift the multiplier by NAIL_BITS and build numb_mask.
75	sll	vl0, NAIL_BITS, vl0
76	lda	numb_mask, -1(r31)
77	srl	numb_mask, NAIL_BITS, numb_mask
78
C  Dispatch on n mod 4.  r25 and r21 are only scratch here.
79	and	n,	3,	r25
80	cmpeq	r25,	1,	r21
81	bne	r21,	L(1m4)
82	cmpeq	r25,	2,	r21
83	bne	r21,	L(2m4)
84	beq	r25,	L(0m4)
85
C  n mod 4 == 3 (fall-through case).  Loads three limbs; rp is biased
C  by -8.
86L(3m4):	ldq	ul3,	0(up)
87	lda	n,	-4(n)
88	ldq	ul0,	8(up)
89	mulq	vl0,	ul3,	m3a
90	umulh	vl0,	ul3,	m3b
91	ldq	ul1,	16(up)
92	lda	up,	24(up)
93	lda	rp,	-8(rp)
94	mulq	vl0,	ul0,	m0a
95	umulh	vl0,	ul0,	m0b
96	bge	n,	L(ge3)
97
C  n - 4 < 0: no further limbs to load, finish in the wind-down code.
98	mulq	vl0,	ul1,	m1a
99	umulh	vl0,	ul1,	m1b
100	ldq	rl3,	8(rp)
101	srl	m3a,NAIL_BITS,	t0
102	addq	t0,	r31,	acc1
103	subq	rl3,	acc1,	acc1
104	ldq	rl0,	16(rp)
105	srl	m0a,NAIL_BITS,	t0
106	addq	t0,	m3b,	acc0
107	sra	acc1,NUMB_BITS,	t1
108	br	r31,	L(ta3)
109
C  At least four more limbs follow: start loading them and enter the loop
C  at el3.
110L(ge3):	ldq	ul2,	0(up)
111	mulq	vl0,	ul1,	m1a
112	umulh	vl0,	ul1,	m1b
113	ldq	rl3,	8(rp)
114	srl	m3a,NAIL_BITS,	t0
115	ldq	ul3,	8(up)
116	lda	n,	-4(n)
117	mulq	vl0,	ul2,	m2a
118	addq	t0,	r31,	acc1
119	umulh	vl0,	ul2,	m2b
120	subq	rl3,	acc1,	acc1
121	ldq	rl0,	16(rp)
122	srl	m0a,NAIL_BITS,	t0
123	ldq	ul0,	16(up)
124	mulq	vl0,	ul3,	m3a
125	addq	t0,	m3b,	acc0
126	sra	acc1,NUMB_BITS,	t1
127	br	r31,	L(el3)
128
C  n mod 4 == 0.  Loads four limbs; n is reduced by 8 here and rp is not
C  biased.
129L(0m4):	lda	n,	-8(n)
130	ldq	ul2,	0(up)
131	ldq	ul3,	8(up)
132	mulq	vl0,	ul2,	m2a
133	umulh	vl0,	ul2,	m2b
134	ldq	ul0,	16(up)
135	mulq	vl0,	ul3,	m3a
136	umulh	vl0,	ul3,	m3b
137	ldq	ul1,	24(up)
138	lda	up,	32(up)
139	mulq	vl0,	ul0,	m0a
140	umulh	vl0,	ul0,	m0b
141	bge	n,	L(ge4)
142
C  n - 8 < 0: no further limbs to load, finish in the wind-down code.
143	ldq	rl2,	0(rp)
144	srl	m2a,NAIL_BITS,	t0
145	mulq	vl0,	ul1,	m1a
146	addq	t0,	r31,	acc0
147	umulh	vl0,	ul1,	m1b
148	subq	rl2,	acc0,	acc0
149	ldq	rl3,	8(rp)
150	srl	m3a,NAIL_BITS,	t0
151	addq	t0,	m2b,	acc1
152	sra	acc0,NUMB_BITS,	t1
153	br	r31,	L(ta4)
154
C  At least four more limbs follow: enter the loop at el0.
155L(ge4):	ldq	rl2,	0(rp)
156	srl	m2a,NAIL_BITS,	t0
157	ldq	ul2,	0(up)
158	mulq	vl0,	ul1,	m1a
159	addq	t0,	r31,	acc0
160	umulh	vl0,	ul1,	m1b
161	subq	rl2,	acc0,	acc0
162	ldq	rl3,	8(rp)
163	srl	m3a,NAIL_BITS,	t0
164	ldq	ul3,	8(up)
165	lda	n,	-4(n)
166	mulq	vl0,	ul2,	m2a
167	addq	t0,	m2b,	acc1
168	sra	acc0,NUMB_BITS,	t1
169	br	r31,	L(el0)
170
C  n mod 4 == 2.  Loads two limbs; rp is biased by -16.
171L(2m4):	lda	n,	-4(n)
172	ldq	ul0,	0(up)
173	ldq	ul1,	8(up)
174	lda	up,	16(up)
175	lda	rp,	-16(rp)
176	mulq	vl0,	ul0,	m0a
177	umulh	vl0,	ul0,	m0b
178	bge	n,	L(ge2)
179
C  n - 4 < 0: no further limbs to load, finish in the wind-down code.
180	mulq	vl0,	ul1,	m1a
181	umulh	vl0,	ul1,	m1b
182	ldq	rl0,	16(rp)
183	srl	m0a,NAIL_BITS,	t0
184	addq	t0,	r31,	acc0
185	subq	rl0,	acc0,	acc0
186	ldq	rl1,	24(rp)
187	srl	m1a,NAIL_BITS,	t0
188	addq	t0,	m0b,	acc1
189	sra	acc0,NUMB_BITS,	t1
190	br	r31,	L(ta2)
191
C  At least four more limbs follow.  After loading them, either enter the
C  loop at el2 (n still >= 0 after the second decrement) or go to the
C  wind-down code at ta6.
192L(ge2):	ldq	ul2,	0(up)
193	mulq	vl0,	ul1,	m1a
194	umulh	vl0,	ul1,	m1b
195	ldq	ul3,	8(up)
196	lda	n,	-4(n)
197	mulq	vl0,	ul2,	m2a
198	umulh	vl0,	ul2,	m2b
199	ldq	rl0,	16(rp)
200	srl	m0a,NAIL_BITS,	t0
201	ldq	ul0,	16(up)
202	mulq	vl0,	ul3,	m3a
203	addq	t0,	r31,	acc0
204	umulh	vl0,	ul3,	m3b
205	subq	rl0,	acc0,	acc0
206	ldq	rl1,	24(rp)
207	srl	m1a,NAIL_BITS,	t0
208	ldq	ul1,	24(up)
209	lda	up,	32(up)
210	lda	rp,	32(rp)
211	mulq	vl0,	ul0,	m0a
212	addq	t0,	m0b,	acc1
213	sra	acc0,NUMB_BITS,	t1
214	bge	n,	L(el2)
215
216	br	r31,	L(ta6)
217
C  n mod 4 == 1.  Loads one limb; rp is biased by -24.
218L(1m4):	lda	n,	-4(n)
219	ldq	ul1,	0(up)
220	lda	up,	8(up)
221	lda	rp,	-24(rp)
222	bge	n,	L(ge1)
223
C  n - 4 < 0: single limb, handled completely here.  The masked difference
C  is stored and r0 is set to the high product minus t1.
224	mulq	vl0,	ul1,	m1a
225	umulh	vl0,	ul1,	m1b
226	ldq	rl1,	24(rp)
227	srl	m1a,NAIL_BITS,	t0
228	subq	rl1,	t0,	acc1
229	and	acc1,numb_mask,	r28
230	sra	acc1,NUMB_BITS,	t1
231	stq	r28,	24(rp)
232	subq	m1b,	t1,	r0
233	ret	r31,	(r26),	1
234
C  At least four more limbs follow.  After loading them, either go to the
C  wind-down code at ta5 (n < 0 after the second decrement) or continue
C  via ge5 into the loop at el1.
235L(ge1):	ldq	ul2,	0(up)
236	mulq	vl0,	ul1,	m1a
237	umulh	vl0,	ul1,	m1b
238	ldq	ul3,	8(up)
239	lda	n,	-4(n)
240	mulq	vl0,	ul2,	m2a
241	umulh	vl0,	ul2,	m2b
242	ldq	ul0,	16(up)
243	mulq	vl0,	ul3,	m3a
244	umulh	vl0,	ul3,	m3b
245	ldq	rl1,	24(rp)
246	srl	m1a,NAIL_BITS,	t0
247	ldq	ul1,	24(up)
248	lda	up,	32(up)
249	lda	rp,	32(rp)
250	mulq	vl0,	ul0,	m0a
251	addq	t0,	r31,	acc1
252	umulh	vl0,	ul0,	m0b
253	subq	rl1,	acc1,	acc1
254	ldq	rl2,	0(rp)
255	srl	m2a,NAIL_BITS,	t0
256	mulq	vl0,	ul1,	m1a
257	addq	t0,	m1b,	acc0
258	sra	acc1,NUMB_BITS,	t1
259	blt	n,	L(ta5)
260
261L(ge5):	ldq	ul2,	0(up)
262	br	r31,	L(el1)
263
C  Main loop, four limbs per iteration.  Each group of four instructions
C  interleaves work for several limbs: a multiply for a limb loaded
C  earlier, the subtract and carry step for a limb further along, the
C  store of a finished limb and the load of a new one.  n is decremented
C  by 4 once per iteration (in the el1 group) and the loop repeats while
C  n >= 0.  up and rp advance by 32 bytes per iteration, which is why the
C  stores use negative offsets after the pointer update.
C  The trailing U0/U1/L0/L1 tags are presumably the EV6 issue cluster
C  intended for each instruction -- confirm against the EV6 documentation.
264	ALIGN(16)
265L(top):	mulq	vl0,	ul0,	m0a		C U1
266	addq	t0,	m0b,	acc1		C L0
267	sra	acc0,NUMB_BITS,	t1		C U0
268	stq	r28,	-24(rp)			C L1
269C
270L(el2):	umulh	vl0,	ul0,	m0b		C U1
271	and	acc0,numb_mask,	r28		C L0
272	subq	rl1,	acc1,	acc1		C U0
273	ldq	rl2,	0(rp)			C L1
274C
275	unop					C U1
276	addq	t1,	acc1,	acc1		C L0
277	srl	m2a,NAIL_BITS,	t0		C U0
278	ldq	ul2,	0(up)			C L1
279C
280	mulq	vl0,	ul1,	m1a		C U1
281	addq	t0,	m1b,	acc0		C L0
282	sra	acc1,NUMB_BITS,	t1		C U0
283	stq	r28,	-16(rp)			C L1
284C
285L(el1):	umulh	vl0,	ul1,	m1b		C U1
286	and	acc1,numb_mask,	r28		C L0
287	subq	rl2,	acc0,	acc0		C U0
288	ldq	rl3,	8(rp)			C L1
289C
290	lda	n,	-4(n)			C L1
291	addq	t1,	acc0,	acc0		C L0
292	srl	m3a,NAIL_BITS,	t0		C U0
293	ldq	ul3,	8(up)			C L1
294C
295	mulq	vl0,	ul2,	m2a		C U1
296	addq	t0,	m2b,	acc1		C L0
297	sra	acc0,NUMB_BITS,	t1		C U0
298	stq	r28,	-8(rp)			C L1
299C
300L(el0):	umulh	vl0,	ul2,	m2b		C U1
301	and	acc0,numb_mask,	r28		C L0
302	subq	rl3,	acc1,	acc1		C U0
303	ldq	rl0,	16(rp)			C L1
304C
305	unop					C U1
306	addq	t1,	acc1,	acc1		C L0
307	srl	m0a,NAIL_BITS,	t0		C U0
308	ldq	ul0,	16(up)			C L1
309C
310	mulq	vl0,	ul3,	m3a		C U1
311	addq	t0,	m3b,	acc0		C L0
312	sra	acc1,NUMB_BITS,	t1		C U0
313	stq	r28,	0(rp)			C L1
314C
315L(el3):	umulh	vl0,	ul3,	m3b		C U1
316	and	acc1,numb_mask,	r28		C L0
317	subq	rl0,	acc0,	acc0		C U0
318	ldq	rl1,	24(rp)			C L1
319C
320	unop					C U1
321	addq	t1,	acc0,	acc0		C L0
322	srl	m1a,NAIL_BITS,	t0		C U0
323	ldq	ul1,	24(up)			C L1
324C
325	lda	up,	32(up)			C L0
326	unop					C U1
327	lda	rp,	32(rp)			C L1
328	bge	n,	L(top)			C U0
329
C  Wind-down.  No more limbs are loaded from up; the remaining multiplies
C  are issued and the limbs still in flight are finished one by one.  The
C  labels ta6, ta5, ta4, ta3 and ta2 are also entry points used by the
C  feed-in code when the loop is not needed.
330L(end):	mulq	vl0,	ul0,	m0a
331	addq	t0,	m0b,	acc1
332	sra	acc0,NUMB_BITS,	t1
333	stq	r28,	-24(rp)
334L(ta6):	umulh	vl0,	ul0,	m0b
335	and	acc0,numb_mask,	r28
336	subq	rl1,	acc1,	acc1
337	ldq	rl2,	0(rp)
338	addq	t1,	acc1,	acc1
339	srl	m2a,NAIL_BITS,	t0
340	mulq	vl0,	ul1,	m1a
341	addq	t0,	m1b,	acc0
342	sra	acc1,NUMB_BITS,	t1
343	stq	r28,	-16(rp)
344L(ta5):	umulh	vl0,	ul1,	m1b
345	and	acc1,numb_mask,	r28
346	subq	rl2,	acc0,	acc0
347	ldq	rl3,	8(rp)
348	addq	t1,	acc0,	acc0
349	srl	m3a,NAIL_BITS,	t0
350	addq	t0,	m2b,	acc1
351	sra	acc0,NUMB_BITS,	t1
352	stq	r28,	-8(rp)
353	unop
354	ALIGN(16)
355L(ta4):	and	acc0,numb_mask,	r28
356	subq	rl3,	acc1,	acc1
357	ldq	rl0,	16(rp)
358	addq	t1,	acc1,	acc1
359	srl	m0a,NAIL_BITS,	t0
360	addq	t0,	m3b,	acc0
361	sra	acc1,NUMB_BITS,	t1
362	stq	r28,	0(rp)
363	unop
364	ALIGN(16)
365L(ta3):	and	acc1,numb_mask,	r28
366	subq	rl0,	acc0,	acc0
367	ldq	rl1,	24(rp)
368	addq	t1,	acc0,	acc0
369	srl	m1a,NAIL_BITS,	t0
370	addq	t0,	m0b,	acc1
371	sra	acc0,NUMB_BITS,	t1
372	stq	r28,	8(rp)
373	unop
374	ALIGN(16)
C  Last two limbs: store both masked results and form the return value
C  in r0 as the final high product (m1b) minus the final t1.
375L(ta2):	and	acc0,numb_mask,	r28
376	subq	rl1,	acc1,	acc1
377	addq	t1,	acc1,	acc1
378	sra	acc1,NUMB_BITS,	t1
379	stq	r28,	16(rp)
380	and	acc1,numb_mask,	r28
381	subq	m1b,	t1,	r0
382	stq	r28,	24(rp)
383	ret	r31,	(r26),	1
384EPILOGUE()
385ASM_END()
386