dnl  IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

C           cycles/limb
C Itanium:      ?
C Itanium 2:    1.5
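
C Overview (added note): these routines compute {rp,n} = {up,n} + ({vp,n} << LSH)
C (addlsh), {rp,n} = {up,n} - ({vp,n} << LSH) (sublsh), or
C {rp,n} = ({vp,n} << LSH) - {up,n} (rsblsh).  The return value, built in
C r8, combines the bits shifted out of the top vp limb with the final
C carry/borrow.  The including wrapper is expected to define LSH (the
C shift count C in the function name) and exactly one of DO_add, DO_sub,
C DO_rsb before including this file.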

C TODO
C  * Use shladd in feed-in code (for mpn_addlshC_n).
C  * Rewrite loop to schedule loads closer to use, since we do prefetch.

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`vp', `r34')
define(`n',  `r35')
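C The four arguments arrive in the first four stacked registers
C (r32..r35), per the IA-64 calling convention.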

ifdef(`DO_add', `
  define(`ADDSUB',     `add	$1 = $2, $3')
  define(`CMP',        `cmp.ltu	$1,p0 = $2, $3')
  define(`INCR',       1)
  define(`LIM',        -1)
  define(`func',        mpn_addlsh`'LSH`'_n)')
ifdef(`DO_sub', `
  define(`ADDSUB',     `sub	$1 = $2, $3')
  define(`CMP',        `cmp.gtu	$1,p0 = $2, $3')
  define(`INCR',       -1)
  define(`LIM',        0)
  define(`func',        mpn_sublsh`'LSH`'_n)')
ifdef(`DO_rsb', `
  define(`ADDSUB',     `sub	$1 = $3, $2')
  define(`CMP',        `cmp.gtu	$1,p0 = $2, $4')
  define(`INCR',       -1)
  define(`LIM',        0)
  define(`func',        mpn_rsblsh`'LSH`'_n)')
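C ADDSUB performs the main add/subtract and CMP sets a predicate when it
C wrapped.  In the carry chains below, a set predicate makes cmpeqor OR
C into the next predicate when the following result limb equals LIM
C (so the carry/borrow keeps propagating) and adjusts that limb by INCR.

C Prefetch distance, in bytes, applied to both source pointers for the
C lfetch instructions in the main loop.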
define(PFDIST, 500)

define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29')
define(`x0',`r30') define(`x1',`r31') define(`x2',`r3')  define(`x3',`r9')
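
C u and v hold limbs loaded from up and vp, x holds the left-shifted v
C limbs formed with shl/shrp, and w holds the ADDSUB results.  r8 is
C used for the return value; r10 and r11 first carry the low input
C limbs and later serve as lfetch prefetch pointers.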

C r3 r8 r9 r10 r11

ASM_START()
PROLOGUE(func)
	.prologue
	.save	ar.lc, r2
	.body
ifdef(`HAVE_ABI_32',`
	addp4	rp = 0, rp		C			M I
	addp4	up = 0, up		C			M I
	nop.i	0
	addp4	vp = 0, vp		C			M I
	nop.m	0
	zxt4	n = n			C			I
	;;
')
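
C Feed-in: branch on n mod 4 so the 4-way unrolled loop below is entered
C at the matching stage.  .Lb00/.Lb01/.Lb10/.Lb11 handle n mod 4 =
C 0/1/2/3; small counts finish inline, while the .grt paths prime the
C software pipeline and jump to the corresponding .LL entry point (or
C directly to L(top)).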
 {.mmi;	ld8	r11 = [vp], 8		C			M01
	ld8	r10 = [up], 8		C			M01
	mov.i	r2 = ar.lc		C			I0
}{.mmi;	and	r14 = 3, n		C			M I
	cmp.lt	p15, p0 = 4, n		C			M I
	add	n = -5, n		C			M I
	;;
}{.mmi;	cmp.eq	p6, p0 = 1, r14		C			M I
	cmp.eq	p7, p0 = 2, r14		C			M I
	cmp.eq	p8, p0 = 3, r14		C			M I
}{.bbb
  (p6)	br.dptk	.Lb01			C			B
  (p7)	br.dptk	.Lb10			C			B
  (p8)	br.dptk	.Lb11			C			B
}

.Lb00:
 {.mmi;	ld8	v0 = [vp], 8		C			M01
	ld8	u0 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
	;;
}{.mmi;	ld8	v1 = [vp], 8		C			M01
	ld8	u1 = [up], 8		C			M01
	shl	x3 = r11, LSH		C			I0
	;;
}{.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shrp	x0 = v0, r11, 64-LSH	C			I0
}{.mmb;	ADDSUB(	w3, r10, x3)		C			M I
	nop	0
  (p15)	br.dpnt	.grt4			C			B
	;;
}{.mii;	CMP(	p7, w3, r10, x3)	C			M II0
	shrp	x1 = v1, v0, 64-LSH	C			I0
	ADDSUB(	w0, u0, x0)		C			M I
	;;
}{.mii;	CMP(	p8, w0, u0, x0)		C			M I
	shrp	x2 = v2, v1, 64-LSH	C			I0
	ADDSUB(	w1, u1, x1)		C			M I
}{.mmb;	nop	0
	nop	0
	br	.Lcj4			C			B
}
ALIGN(32)
.grt4:
 {.mii;	ld8	v3 = [vp], 8		C			M01
	shrp	x0 = v0, r11, 64-LSH	C			I0
	CMP(	p8, w3, r10, x3)	C			M I
	;;
}{.mmi;	ld8	u3 = [up], 8		C			M01
	add	r11 = PFDIST, vp
	shrp	x1 = v1, v0, 64-LSH	C			I0
}{.mmi;	ld8	v0 = [vp], 8		C			M01
	ADDSUB(	w0, u0, x0)		C			M I
	nop	0
	;;
}{.mmi;	CMP(	p6, w0, u0, x0)		C			M I
	add	r10 = PFDIST, up
	mov.i	ar.lc = n		C			I0
}{.mmb;	ADDSUB(	w1, u1, x1)		C			M I
	ld8	u0 = [up], 8		C			M01
	br	.LL00			C			B
}

	ALIGN(32)
.Lb01:
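C For the add form, shladd folds the shift and the add of the low limb
C into one instruction; the sub/rsb forms shift with shl and then use
C ADDSUB.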
ifdef(`DO_add',
`	shladd	w2 = r11, LSH, r10	C			M I
	shr.u	r8 = r11, 64-LSH	C retval		I0
  (p15)	br.dpnt	.grt1			C			B
	;;
',`
	shl	x2 = r11, LSH		C			I0
  (p15)	br.dpnt	.grt1			C			B
	;;
	ADDSUB(	w2, r10, x2)		C			M I
	shr.u	r8 = r11, 64-LSH	C retval		I0
	;;
')
	CMP(	p6, w2, r10, x2)	C			M I
	br		.Lcj1

.grt1:	ld8	v3 = [vp], 8		C			M01
	ld8	u3 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
	;;
	ld8	v0 = [vp], 8		C			M01
	ld8	u0 = [up], 8		C			M01
	mov.i	ar.lc = n		C FIXME swap with next	I0
ifdef(`DO_add',
`',`
	ADDSUB(	w2, r10, x2)
')
	;;
 {.mmi;	ld8	v1 = [vp], 8		C			M01
	ld8	u1 = [up], 8		C			M01
	shrp	x3 = v3, r11, 64-LSH	C			I0
	;;
}{.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mmb;	CMP(	p6, w2, r10, x2)	C			M I
	ADDSUB(	w3, u3, x3)		C			M I
	br.cloop.dptk	.grt5		C			B
	;;
}{.mmi;	CMP(	p7, w3, u3, x3)		C			M I
	ADDSUB(	w0, u0, x0)		C			M I
	shrp	x1 = v1, v0, 64-LSH	C			I0
}{.mmb;	nop	0
	nop	0
	br	.Lcj5			C			B
}
.grt5:
 {.mmi;	add	r10 = PFDIST, up
	add	r11 = PFDIST, vp
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mmb;	ld8	v3 = [vp], 8		C			M01
	CMP(	p8, w3, u3, x3)		C			M I
	br	.LL01			C			B
}
	ALIGN(32)
.Lb10:
 {.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shl	x1 = r11, LSH		C			I0
}{.mmb;	nop	0
	nop	0
  (p15)	br.dpnt	.grt2			C			B
	;;
}{.mmi;	ADDSUB(	w1, r10, x1)		C			M I
	nop	0
	shrp	x2 = v2, r11, 64-LSH	C			I0
	;;
}{.mmi;	CMP(	p9, w1, r10, x1)	C			M I
	ADDSUB(	w2, u2, x2)		C			M I
	shr.u	r8 = v2, 64-LSH		C retval		I0
	;;
}{.mmb;	CMP(	p6, w2, u2, x2)		C			M I
	nop	0
	br	.Lcj2			C			B
}
.grt2:
 {.mmi;	ld8	v3 = [vp], 8		C			M01
	ld8	u3 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
	;;
}{.mmi;	ld8	v0 = [vp], 8		C			M01
	ld8	u0 = [up], 8		C			M01
	mov.i	ar.lc = n		C			I0
}{.mmi;	ADDSUB(	w1, r10, x1)		C			M I
	nop	0
	nop	0
	;;
}{.mii;	ld8	v1 = [vp], 8		C			M01
	shrp	x2 = v2, r11, 64-LSH	C			I0
	CMP(	p8, w1, r10, x1)	C			M I
	;;
}{.mmi;	add	r10 = PFDIST, up
	ld8	u1 = [up], 8		C			M01
	shrp	x3 = v3, v2, 64-LSH	C			I0
}{.mmi;	add	r11 = PFDIST, vp
	ld8	v2 = [vp], 8		C			M01
	ADDSUB(	w2, u2, x2)		C			M I
	;;
}{.mmi;	CMP(	p6, w2, u2, x2)		C			M I
	ld8	u2 = [up], 8		C			M01
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mib;	ADDSUB(	w3, u3, x3)		C			M I
	nop	0
	br.cloop.dpnt	L(top)		C			B
}
	br	L(end)			C			B
.Lb11:
 {.mmi;	ld8	v1 = [vp], 8		C			M01
	ld8	u1 = [up], 8		C			M01
	shl	x0 = r11, LSH		C			I0
	;;
}{.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
}{.mmb;	nop	0
	nop	0
  (p15)	br.dpnt	.grt3			C			B
	;;
}{.mii;	nop	0
	shrp	x1 = v1, r11, 64-LSH	C			I0
	ADDSUB(	w0, r10, x0)		C			M I
	;;
}{.mii;	CMP(	p8, w0, r10, x0)	C			M I
	shrp	x2 = v2, v1, 64-LSH	C			I0
	ADDSUB(	w1, u1, x1)		C			M I
	;;
}{.mmb;	CMP(	p9, w1, u1, x1)		C			M I
	ADDSUB(	w2, u2, x2)		C			M I
	br	.Lcj3			C			B
}
.grt3:
 {.mmi;	ld8	v3 = [vp], 8		C			M01
	ld8	u3 = [up], 8		C			M01
	shrp	x1 = v1, r11, 64-LSH	C			I0
}{.mmi;	ADDSUB(	w0, r10, x0)		C			M I
	nop	0
	nop	0
	;;
}{.mmi;	ld8	v0 = [vp], 8		C			M01
	CMP(	p6, w0, r10, x0)	C			M I
	mov.i	ar.lc = n		C			I0
}{.mmi;	ld8	u0 = [up], 8		C			M01
	ADDSUB(	w1, u1, x1)		C			M I
	nop	0
	;;
}{.mmi;	add	r10 = PFDIST, up
	add	r11 = PFDIST, vp
	shrp	x2 = v2, v1, 64-LSH	C			I0
}{.mmb;	ld8	v1 = [vp], 8		C			M01
	CMP(	p8, w1, u1, x1)		C			M I
	br	.LL11			C			B
}

C *** MAIN LOOP START ***
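C The loop handles four limbs per iteration (about 1.5 c/l on Itanium 2).
C shrp pairs adjacent v limbs to form the shifted operand, the lfetch
C instructions prefetch PFDIST bytes ahead in both sources, and each
C store drains a result computed in an earlier pipeline stage.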
	ALIGN(32)
L(top):	st8	[rp] = w1, 8		C			M23
	lfetch	[r10], 32
   (p8)	cmpeqor	p6, p0 = LIM, w2	C			M I
   (p8)	add	w2 = INCR, w2		C			M I
	ld8	v3 = [vp], 8		C			M01
	CMP(	p8, w3, u3, x3)		C			M I
	;;
.LL01:	ld8	u3 = [up], 8		C			M01
	shrp	x1 = v1, v0, 64-LSH	C			I0
   (p6)	cmpeqor	p8, p0 = LIM, w3	C			M I
   (p6)	add	w3 = INCR, w3		C			M I
	ld8	v0 = [vp], 8		C			M01
	ADDSUB(	w0, u0, x0)		C			M I
	;;
	st8	[rp] = w2, 8		C			M23
	CMP(	p6, w0, u0, x0)		C			M I
	nop.b	0
	ld8	u0 = [up], 8		C			M01
	lfetch	[r11], 32
	ADDSUB(	w1, u1, x1)		C			M I
	;;
.LL00:	st8	[rp] = w3, 8		C			M23
	shrp	x2 = v2, v1, 64-LSH	C			I0
   (p8)	cmpeqor	p6, p0 = LIM, w0	C			M I
   (p8)	add	w0 = INCR, w0		C			M I
	ld8	v1 = [vp], 8		C			M01
	CMP(	p8, w1, u1, x1)		C			M I
	;;
.LL11:	ld8	u1 = [up], 8		C			M01
	shrp	x3 = v3, v2, 64-LSH	C			I0
   (p6)	cmpeqor	p8, p0 = LIM, w1	C			M I
   (p6)	add	w1 = INCR, w1		C			M I
	ld8	v2 = [vp], 8		C			M01
	ADDSUB(	w2, u2, x2)		C			M I
	;;
 {.mmi;	st8	[rp] = w0, 8		C			M23
	CMP(	p6, w2, u2, x2)		C			M I
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mib;
	ld8	u2 = [up], 8		C			M01
	ADDSUB(	w3, u3, x3)		C			M I
	br.cloop.dptk	L(top)		C			B
	;;
}
C *** MAIN LOOP END ***
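
C Wind-down: the .Lcj labels complete the remaining pipeline stages,
C restore ar.lc, store the last result limbs, and fold the final
C carry/borrow into the return value in r8.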

L(end):
 {.mmi;	st8	[rp] = w1, 8		C			M23
   (p8)	cmpeqor	p6, p0 = LIM, w2	C			M I
	shrp	x1 = v1, v0, 64-LSH	C			I0
}{.mmi;
   (p8)	add	w2 = INCR, w2		C			M I
	CMP(	p7, w3, u3, x3)		C			M I
	ADDSUB(	w0, u0, x0)		C			M I
	;;
}
.Lcj5:
 {.mmi;	st8	[rp] = w2, 8		C			M23
   (p6)	cmpeqor	p7, p0 = LIM, w3	C			M I
	shrp	x2 = v2, v1, 64-LSH	C			I0
}{.mmi;
   (p6)	add	w3 = INCR, w3		C			M I
	CMP(	p8, w0, u0, x0)		C			M I
	ADDSUB(	w1, u1, x1)		C			M I
	;;
}
.Lcj4:
 {.mmi;	st8	[rp] = w3, 8		C			M23
   (p7)	cmpeqor	p8, p0 = LIM, w0	C			M I
	mov.i	ar.lc = r2		C			I0
}{.mmi;
   (p7)	add	w0 = INCR, w0		C			M I
	CMP(	p9, w1, u1, x1)		C			M I
	ADDSUB(	w2, u2, x2)		C			M I
	;;
}
.Lcj3:
 {.mmi;	st8	[rp] = w0, 8		C			M23
   (p8)	cmpeqor	p9, p0 = LIM, w1	C			M I
	shr.u	r8 = v2, 64-LSH		C			I0
}{.mmi;
   (p8)	add	w1 = INCR, w1		C			M I
	CMP(	p6, w2, u2, x2)		C			M I
	nop	0
	;;
}
.Lcj2:
 {.mmi;	st8	[rp] = w1, 8		C			M23
   (p9)	cmpeqor	p6, p0 = LIM, w2	C			M I
   (p9)	add	w2 = INCR, w2		C			M I
	;;
}
.Lcj1:
 {.mmb;	st8	[rp] = w2		C			M23
ifdef(`DO_rsb',`
   (p6)	add	r8 = -1, r8		C			M I
',`
   (p6)	add	r8 = 1, r8		C			M I
')	br.ret.sptk.many b0		C			B
}
EPILOGUE()
ASM_END()