xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/alpha/ev6/nails/submul_1.asm (revision aef5eb5f59cdfe8314f1b5f78ac04eb144e44010)
1dnl  Alpha ev6 nails mpn_submul_1.
2
3dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C      cycles/limb
34C EV4:    42
35C EV5:    18
36C EV6:     4
37
38C TODO
39C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
40C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
41C    umulh.
42C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
43C    and would work since the loop structure is really regular.
44
45C  INPUT PARAMETERS
C  The four names below alias r16-r19.  In the code that follows, rp is both
C  loaded from and stored to, up is only loaded from, n is the limb count
C  that drives the dispatch and the loop, and vl0 is the multiplier operand
C  of every mulq/umulh.
46define(`rp',`r16')
47define(`up',`r17')
48define(`n', `r18')
49define(`vl0',`r19')
50
C  Set at function entry to an all-ones word shifted right by NAIL_BITS,
C  then and-ed into every limb before it is stored.
51define(`numb_mask',`r6')
52
C  Product registers: mXa receives the mulq result and mXb the umulh result
C  for slot X (X = 0..3) of the 4-way unrolled code.  Note that m0a is r0,
C  the same register the function leaves its return value in.
53define(`m0a',`r0')
54define(`m0b',`r1')
55define(`m1a',`r2')
56define(`m1b',`r3')
57define(`m2a',`r20')
58define(`m2b',`r21')
59define(`m3a',`r22')
60define(`m3b',`r23')
61
C  Two accumulators used alternately for consecutive limbs.  r25 doubles as
C  scratch for the n mod 4 dispatch at function entry.
62define(`acc0',`r25')
63define(`acc1',`r27')
64
C  Limbs loaded from up.  Only two physical registers are used: ul0 and ul2
C  share r4, ul1 and ul3 share r5.
65define(`ul0',`r4')
66define(`ul1',`r5')
67define(`ul2',`r4')
68define(`ul3',`r5')
69
C  Limbs loaded from rp.  All four names share r24; the distinct names only
C  indicate which unrolled slot the load belongs to.
70define(`rl0',`r24')
71define(`rl1',`r24')
72define(`rl2',`r24')
73define(`rl3',`r24')
74
C  Temporaries: t0 holds a mulq result shifted right by NAIL_BITS, t1 holds
C  an accumulator shifted right arithmetically by NUMB_BITS.
75define(`t0',`r7')
76define(`t1',`r8')
77
C  Short names for GMP_NAIL_BITS and GMP_NUMB_BITS, which are not defined in
C  this file (presumably they come from the included config.m4 -- confirm).
78define(`NAIL_BITS',`GMP_NAIL_BITS')
79define(`NUMB_BITS',`GMP_NUMB_BITS')
80
81dnl  This declaration is munged by configure
C  NOTE(review): 2-63 presumably is the range of nail sizes this file
C  supports; the macro is defined outside this file, so confirm there.
82NAILS_SUPPORT(2-63)
83
84ASM_START()
C ----------------------------------------------------------------------------
C  mpn_submul_1 (rp, up, n, vl0) -- nails variant, 4-way unrolled.
C
C  In:   rp  (r16)  limb vector, loaded from and stored to
C        up  (r17)  limb vector, only loaded from
C        n   (r18)  limb count
C        vl0 (r19)  multiplier limb
C  Out:  r0  = last umulh result minus the final signed carry t1
C  Registers written: r0-r8, r16-r25, r27, r28.  No stack is used; memory
C  is written only through rp.  Returns through r26.
C
C  For every limb i the code computes, in 64-bit registers,
C      lo    = mulq (vl0, up[i]) >> NAIL_BITS        (logical shift, t0)
C      hi    = umulh (vl0, up[i])
C      acc   = rp[i] - (lo + hi of limb i-1) + t1    (first limb: hi = 0,
C                                                     no t1 term)
C      rp[i] = acc & numb_mask
C      t1    = acc >> NUMB_BITS                      (arithmetic shift)
C  with vl0 pre-shifted left by NAIL_BITS.  Assuming NAIL_BITS + NUMB_BITS is
C  the 64-bit register width (not established in this file), lo is then the
C  low NUMB_BITS bits of the true product and hi the bits above them.
C
C  n = 0 is not special-cased: n mod 4 = 0 selects L(0m4), which always
C  processes at least four limbs.  Callers are presumably required to pass
C  n >= 1 -- confirm against the mpn interface documentation.
C ----------------------------------------------------------------------------
85PROLOGUE(mpn_submul_1)
C  Pre-shift the multiplier and build numb_mask = (all ones) >> NAIL_BITS.
86	sll	vl0, NAIL_BITS, vl0
87	lda	numb_mask, -1(r31)
88	srl	numb_mask, NAIL_BITS, numb_mask
89
C  Dispatch on n mod 4.  r25 and r21 are plain scratch here; they are reused
C  below under the names acc0 and m2b.  Remainder 3 falls through.
90	and	n,	3,	r25
91	cmpeq	r25,	1,	r21
92	bne	r21,	L(1m4)
93	cmpeq	r25,	2,	r21
94	bne	r21,	L(2m4)
95	beq	r25,	L(0m4)
96
C  n mod 4 = 3.  rp is biased by -8 so that the rp offsets used from L(ta3)
C  and L(el3) onwards address the right limbs.  If n - 4 >= 0 more limbs
C  follow and the loop is entered via L(ge3); otherwise n = 3 and the code
C  goes straight to the wind-down at L(ta3).
97L(3m4):	ldq	ul3,	0(up)
98	lda	n,	-4(n)
99	ldq	ul0,	8(up)
100	mulq	vl0,	ul3,	m3a
101	umulh	vl0,	ul3,	m3b
102	ldq	ul1,	16(up)
103	lda	up,	24(up)
104	lda	rp,	-8(rp)
105	mulq	vl0,	ul0,	m0a
106	umulh	vl0,	ul0,	m0b
107	bge	n,	L(ge3)
108
C  Exactly three limbs.  First limb: nothing carried in, so r31 (zero) is
C  added in place of a previous high product and t1 is not added.
109	mulq	vl0,	ul1,	m1a
110	umulh	vl0,	ul1,	m1b
111	ldq	rl3,	8(rp)
112	srl	m3a,NAIL_BITS,	t0
113	addq	t0,	r31,	acc1
114	subq	rl3,	acc1,	acc1
115	ldq	rl0,	16(rp)
116	srl	m0a,NAIL_BITS,	t0
117	addq	t0,	m3b,	acc0
118	sra	acc1,NUMB_BITS,	t1
119	br	r31,	L(ta3)
120
C  More than three limbs: same start as above, interleaved with the loads
C  and multiplies for the next limbs, then join the loop at L(el3).
121L(ge3):	ldq	ul2,	0(up)
122	mulq	vl0,	ul1,	m1a
123	umulh	vl0,	ul1,	m1b
124	ldq	rl3,	8(rp)
125	srl	m3a,NAIL_BITS,	t0
126	ldq	ul3,	8(up)
127	lda	n,	-4(n)
128	mulq	vl0,	ul2,	m2a
129	addq	t0,	r31,	acc1
130	umulh	vl0,	ul2,	m2b
131	subq	rl3,	acc1,	acc1
132	ldq	rl0,	16(rp)
133	srl	m0a,NAIL_BITS,	t0
134	ldq	ul0,	16(up)
135	mulq	vl0,	ul3,	m3a
136	addq	t0,	m3b,	acc0
137	sra	acc1,NUMB_BITS,	t1
138	br	r31,	L(el3)
139
C  n mod 4 = 0.  rp is not biased on this path.  n is reduced by 8 up front;
C  a non-negative result means at least eight limbs and the loop is entered
C  via L(ge4), otherwise four limbs are handled by the wind-down at L(ta4).
140L(0m4):	lda	n,	-8(n)
141	ldq	ul2,	0(up)
142	ldq	ul3,	8(up)
143	mulq	vl0,	ul2,	m2a
144	umulh	vl0,	ul2,	m2b
145	ldq	ul0,	16(up)
146	mulq	vl0,	ul3,	m3a
147	umulh	vl0,	ul3,	m3b
148	ldq	ul1,	24(up)
149	lda	up,	32(up)
150	mulq	vl0,	ul0,	m0a
151	umulh	vl0,	ul0,	m0b
152	bge	n,	L(ge4)
153
C  Four limbs only: start the first limb (r31 as carry-in) and go to L(ta4).
154	ldq	rl2,	0(rp)
155	srl	m2a,NAIL_BITS,	t0
156	mulq	vl0,	ul1,	m1a
157	addq	t0,	r31,	acc0
158	umulh	vl0,	ul1,	m1b
159	subq	rl2,	acc0,	acc0
160	ldq	rl3,	8(rp)
161	srl	m3a,NAIL_BITS,	t0
162	addq	t0,	m2b,	acc1
163	sra	acc0,NUMB_BITS,	t1
164	br	r31,	L(ta4)
165
C  Eight or more limbs: as above plus the loads/multiply for the following
C  limbs, then join the loop at L(el0).
166L(ge4):	ldq	rl2,	0(rp)
167	srl	m2a,NAIL_BITS,	t0
168	ldq	ul2,	0(up)
169	mulq	vl0,	ul1,	m1a
170	addq	t0,	r31,	acc0
171	umulh	vl0,	ul1,	m1b
172	subq	rl2,	acc0,	acc0
173	ldq	rl3,	8(rp)
174	srl	m3a,NAIL_BITS,	t0
175	ldq	ul3,	8(up)
176	lda	n,	-4(n)
177	mulq	vl0,	ul2,	m2a
178	addq	t0,	m2b,	acc1
179	sra	acc0,NUMB_BITS,	t1
180	br	r31,	L(el0)
181
C  n mod 4 = 2.  rp is biased by -16.  If n - 4 >= 0 continue at L(ge2);
C  otherwise n = 2 and the two limbs are finished by the wind-down at L(ta2).
182L(2m4):	lda	n,	-4(n)
183	ldq	ul0,	0(up)
184	ldq	ul1,	8(up)
185	lda	up,	16(up)
186	lda	rp,	-16(rp)
187	mulq	vl0,	ul0,	m0a
188	umulh	vl0,	ul0,	m0b
189	bge	n,	L(ge2)
190
191	mulq	vl0,	ul1,	m1a
192	umulh	vl0,	ul1,	m1b
193	ldq	rl0,	16(rp)
194	srl	m0a,NAIL_BITS,	t0
195	addq	t0,	r31,	acc0
196	subq	rl0,	acc0,	acc0
197	ldq	rl1,	24(rp)
198	srl	m1a,NAIL_BITS,	t0
199	addq	t0,	m0b,	acc1
200	sra	acc0,NUMB_BITS,	t1
201	br	r31,	L(ta2)
202
C  More than two limbs.  n is reduced by a further 4 and up/rp are advanced
C  by 32 bytes here, exactly as at the bottom of the loop.  If n is still
C  non-negative the loop is entered at L(el2); otherwise the remaining work
C  is done by the wind-down starting at L(ta6).
203L(ge2):	ldq	ul2,	0(up)
204	mulq	vl0,	ul1,	m1a
205	umulh	vl0,	ul1,	m1b
206	ldq	ul3,	8(up)
207	lda	n,	-4(n)
208	mulq	vl0,	ul2,	m2a
209	umulh	vl0,	ul2,	m2b
210	ldq	rl0,	16(rp)
211	srl	m0a,NAIL_BITS,	t0
212	ldq	ul0,	16(up)
213	mulq	vl0,	ul3,	m3a
214	addq	t0,	r31,	acc0
215	umulh	vl0,	ul3,	m3b
216	subq	rl0,	acc0,	acc0
217	ldq	rl1,	24(rp)
218	srl	m1a,NAIL_BITS,	t0
219	ldq	ul1,	24(up)
220	lda	up,	32(up)
221	lda	rp,	32(rp)
222	mulq	vl0,	ul0,	m0a
223	addq	t0,	m0b,	acc1
224	sra	acc0,NUMB_BITS,	t1
225	bge	n,	L(el2)
226
227	br	r31,	L(ta6)
228
C  n mod 4 = 1.  rp is biased by -24.  If n - 4 >= 0 continue at L(ge1);
C  otherwise n = 1 and the single limb is handled right here, including the
C  return: r0 = umulh result minus the signed carry of this limb.
229L(1m4):	lda	n,	-4(n)
230	ldq	ul1,	0(up)
231	lda	up,	8(up)
232	lda	rp,	-24(rp)
233	bge	n,	L(ge1)
234
235	mulq	vl0,	ul1,	m1a
236	umulh	vl0,	ul1,	m1b
237	ldq	rl1,	24(rp)
238	srl	m1a,NAIL_BITS,	t0
239	subq	rl1,	t0,	acc1
240	and	acc1,numb_mask,	r28
241	sra	acc1,NUMB_BITS,	t1
242	stq	r28,	24(rp)
243	subq	m1b,	t1,	r0
244	ret	r31,	(r26),	1
245
C  More than one limb.  n is reduced by a further 4 and up/rp are advanced
C  by 32 bytes.  A negative n then selects the wind-down at L(ta5); otherwise
C  execution falls into L(ge5), which loads one more limb and joins the loop
C  at L(el1).
246L(ge1):	ldq	ul2,	0(up)
247	mulq	vl0,	ul1,	m1a
248	umulh	vl0,	ul1,	m1b
249	ldq	ul3,	8(up)
250	lda	n,	-4(n)
251	mulq	vl0,	ul2,	m2a
252	umulh	vl0,	ul2,	m2b
253	ldq	ul0,	16(up)
254	mulq	vl0,	ul3,	m3a
255	umulh	vl0,	ul3,	m3b
256	ldq	rl1,	24(rp)
257	srl	m1a,NAIL_BITS,	t0
258	ldq	ul1,	24(up)
259	lda	up,	32(up)
260	lda	rp,	32(rp)
261	mulq	vl0,	ul0,	m0a
262	addq	t0,	r31,	acc1
263	umulh	vl0,	ul0,	m0b
264	subq	rl1,	acc1,	acc1
265	ldq	rl2,	0(rp)
266	srl	m2a,NAIL_BITS,	t0
267	mulq	vl0,	ul1,	m1a
268	addq	t0,	m1b,	acc0
269	sra	acc1,NUMB_BITS,	t1
270	blt	n,	L(ta5)
271
C  L(ge5) is reached only by fall-through; no branch in this file targets it.
272L(ge5):	ldq	ul2,	0(up)
273	br	r31,	L(el1)
274
C  Main loop: four limbs per trip, written as groups of four instructions
C  separated by bare C lines.  L(el2), L(el1), L(el0) and L(el3) are the
C  entry points used by the feed-in code above.  Each masked limb is staged
C  in r28 by an and instruction and stored by the stq of the following
C  limb's group.  n drops by 4 per trip, up and rp advance by 32 bytes, and
C  the loop repeats while n >= 0.  Do not reorder: ul0/ul2, ul1/ul3 and
C  rl0..rl3 share registers, so correctness depends on this exact schedule.
C  NOTE(review): the trailing U0/U1/L0/L1 tags presumably name the EV6 issue
C  slot each instruction is scheduled for -- confirm.
275	ALIGN(16)
276L(top):	mulq	vl0,	ul0,	m0a		C U1
277	addq	t0,	m0b,	acc1		C L0
278	sra	acc0,NUMB_BITS,	t1		C U0
279	stq	r28,	-24(rp)			C L1
280C
281L(el2):	umulh	vl0,	ul0,	m0b		C U1
282	and	acc0,numb_mask,	r28		C L0
283	subq	rl1,	acc1,	acc1		C U0
284	ldq	rl2,	0(rp)			C L1
285C
286	unop					C U1
287	addq	t1,	acc1,	acc1		C L0
288	srl	m2a,NAIL_BITS,	t0		C U0
289	ldq	ul2,	0(up)			C L1
290C
291	mulq	vl0,	ul1,	m1a		C U1
292	addq	t0,	m1b,	acc0		C L0
293	sra	acc1,NUMB_BITS,	t1		C U0
294	stq	r28,	-16(rp)			C L1
295C
296L(el1):	umulh	vl0,	ul1,	m1b		C U1
297	and	acc1,numb_mask,	r28		C L0
298	subq	rl2,	acc0,	acc0		C U0
299	ldq	rl3,	8(rp)			C L1
300C
301	lda	n,	-4(n)			C L1
302	addq	t1,	acc0,	acc0		C L0
303	srl	m3a,NAIL_BITS,	t0		C U0
304	ldq	ul3,	8(up)			C L1
305C
306	mulq	vl0,	ul2,	m2a		C U1
307	addq	t0,	m2b,	acc1		C L0
308	sra	acc0,NUMB_BITS,	t1		C U0
309	stq	r28,	-8(rp)			C L1
310C
311L(el0):	umulh	vl0,	ul2,	m2b		C U1
312	and	acc0,numb_mask,	r28		C L0
313	subq	rl3,	acc1,	acc1		C U0
314	ldq	rl0,	16(rp)			C L1
315C
316	unop					C U1
317	addq	t1,	acc1,	acc1		C L0
318	srl	m0a,NAIL_BITS,	t0		C U0
319	ldq	ul0,	16(up)			C L1
320C
321	mulq	vl0,	ul3,	m3a		C U1
322	addq	t0,	m3b,	acc0		C L0
323	sra	acc1,NUMB_BITS,	t1		C U0
324	stq	r28,	0(rp)			C L1
325C
326L(el3):	umulh	vl0,	ul3,	m3b		C U1
327	and	acc1,numb_mask,	r28		C L0
328	subq	rl0,	acc0,	acc0		C U0
329	ldq	rl1,	24(rp)			C L1
330C
331	unop					C U1
332	addq	t1,	acc0,	acc0		C L0
333	srl	m1a,NAIL_BITS,	t0		C U0
334	ldq	ul1,	24(up)			C L1
335C
336	lda	up,	32(up)			C L0
337	unop					C U1
338	lda	rp,	32(rp)			C L1
339	bge	n,	L(top)			C U0
340
C  Wind-down.  This repeats the loop body without any further loads from up
C  and drains the limbs still in flight.  L(end) is reached by falling out
C  of the loop (no branch targets it); L(ta6), L(ta5), L(ta4), L(ta3) and
C  L(ta2) are entered from the 2m4, 1m4, 0m4, 3m4 and 2m4 feed-in paths
C  respectively, each later label leaving fewer limbs to finish.
341L(end):	mulq	vl0,	ul0,	m0a
342	addq	t0,	m0b,	acc1
343	sra	acc0,NUMB_BITS,	t1
344	stq	r28,	-24(rp)
345L(ta6):	umulh	vl0,	ul0,	m0b
346	and	acc0,numb_mask,	r28
347	subq	rl1,	acc1,	acc1
348	ldq	rl2,	0(rp)
349	addq	t1,	acc1,	acc1
350	srl	m2a,NAIL_BITS,	t0
351	mulq	vl0,	ul1,	m1a
352	addq	t0,	m1b,	acc0
353	sra	acc1,NUMB_BITS,	t1
354	stq	r28,	-16(rp)
355L(ta5):	umulh	vl0,	ul1,	m1b
356	and	acc1,numb_mask,	r28
357	subq	rl2,	acc0,	acc0
358	ldq	rl3,	8(rp)
359	addq	t1,	acc0,	acc0
360	srl	m3a,NAIL_BITS,	t0
361	addq	t0,	m2b,	acc1
362	sra	acc0,NUMB_BITS,	t1
363	stq	r28,	-8(rp)
364	unop
365	ALIGN(16)
366L(ta4):	and	acc0,numb_mask,	r28
367	subq	rl3,	acc1,	acc1
368	ldq	rl0,	16(rp)
369	addq	t1,	acc1,	acc1
370	srl	m0a,NAIL_BITS,	t0
371	addq	t0,	m3b,	acc0
372	sra	acc1,NUMB_BITS,	t1
373	stq	r28,	0(rp)
374	unop
375	ALIGN(16)
376L(ta3):	and	acc1,numb_mask,	r28
377	subq	rl0,	acc0,	acc0
378	ldq	rl1,	24(rp)
379	addq	t1,	acc0,	acc0
380	srl	m1a,NAIL_BITS,	t0
381	addq	t0,	m0b,	acc1
382	sra	acc0,NUMB_BITS,	t1
383	stq	r28,	8(rp)
384	unop
385	ALIGN(16)
C  Last two limbs.  The final limb is stored at 24(rp) and the return value
C  is formed in r0 as the last umulh result (m1b) minus the signed carry t1
C  of the last accumulator.
386L(ta2):	and	acc0,numb_mask,	r28
387	subq	rl1,	acc1,	acc1
388	addq	t1,	acc1,	acc1
389	sra	acc1,NUMB_BITS,	t1
390	stq	r28,	16(rp)
391	and	acc1,numb_mask,	r28
392	subq	m1b,	t1,	r0
393	stq	r28,	24(rp)
394	ret	r31,	(r26),	1
395EPILOGUE()
396ASM_END()
397