xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/rsh1aors_n.asm (revision 901e7e84758515fbf39dfc064cb0b45ab146d8b0)
1dnl  IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2003-2005 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C         cycles/limb
36C Itanium:    2.5
37C Itanium 2:  1.5
38
39C TODO
40C  * Rewrite function entry code using aorslsh1_n.asm style.
41C  * Micro-optimize feed-in and wind-down code.
42
43C INPUT PARAMETERS
C The four arguments are named after the registers r32-r35, in the order
C rp, up, vp, n.  rp is the pointer the results are stored through, up and
C vp are the two pointers limbs are loaded from, and n is the count that the
C entry code splits into n mod 4 (dispatch) and a loop count.
44define(`rp',`r32')
45define(`up',`r33')
46define(`vp',`r34')
47define(`n',`r35')
48
C Operation selection.  One of the two groups below is defined depending on
C whether OPERATION_rsh1add_n or OPERATION_rsh1sub_n is defined when this
C file is processed.
C   ADDSUB  the per-limb operation (add or sub)
C   PRED    unsigned relation used as cmp.PRED w, u right after w = u ADDSUB v
C           to detect carry out (ltu after add) or borrow out (gtu after sub)
C   INCR    value added to a limb to apply an incoming carry (1) or borrow (-1)
C   LIM     limb value at which an incoming carry or borrow propagates to the
C           next limb (-1 for add, 0 for sub)
C   func    name given to PROLOGUE for the generated function
49ifdef(`OPERATION_rsh1add_n',`
50  define(ADDSUB,       add)
51  define(PRED,	       ltu)
52  define(INCR,	       1)
53  define(LIM,	       -1)
54  define(func, mpn_rsh1add_n)
55')
56ifdef(`OPERATION_rsh1sub_n',`
57  define(ADDSUB,       sub)
58  define(PRED,	       gtu)
59  define(INCR,	       -1)
60  define(LIM,	       0)
61  define(func, mpn_rsh1sub_n)
62')
63
64C Some useful aliases for registers we use
C   u0-u3  limbs loaded through up
C   v0-v3  limbs loaded through vp
C   w0-w3  results of u ADDSUB v, later adjusted by the incoming carry
C   x0-x3  shifted results waiting to be stored through rp
C Note that x1 is r9, not a neighbour of the other x registers, and that
C r14 (u0) is also used as a scratch register by the entry code for n mod 4
C before the first u0 load.
65define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
66define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
67define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
68define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
69
C Function entry.  Loads the first limb pair, saves ar.lc, and dispatches on
C n mod 4 to one of four feed-in sequences.  The case n mod 4 == 0 is the
C fall-through into .Lb00.
70MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
71
72ASM_START()
73PROLOGUE(func)
74	.prologue
75	.save	ar.lc, r2
76	.body
C Only when HAVE_ABI_32 is defined: widen the three pointer arguments with
C addp4 and zero-extend n with zxt4 before any of them is used.
77ifdef(`HAVE_ABI_32',`
78	addp4		rp = 0, rp		C			M I
79	addp4		up = 0, up		C			M I
80	addp4		vp = 0, vp		C			M I
81	nop.m		0
82	nop.m		0
83	zxt4		n = n			C			I
84	;;
85')
C r11 = first limb from vp, r10 = first limb from up (both pointers are
C post-incremented by 8).  r2 keeps the incoming ar.lc so that it can be
C restored just before returning.
85 {.mmi;	ld8		r11 = [vp], 8		C			M01
86	ld8		r10 = [up], 8		C			M01
87	mov.i		r2 = ar.lc		C			I0
C r14 = n mod 4, p15 = (n > 4), and n is replaced by n - 4.  Each feed-in
C path later shifts this n right by 2 to form the loop count.
88}{.mmi;	and		r14 = 3, n		C			M I
89	cmp.lt		p15, p0 = 4, n		C			M I
90	add		n = -4, n		C			M I
91	;;
C p6, p7, p8 select the n mod 4 == 1, 2, 3 paths.  If none is set the code
C falls through to .Lb00.
92}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
93	cmp.eq		p7, p0 = 2, r14		C			M I
94	cmp.eq		p8, p0 = 3, r14		C			M I
95}{.bbb
96  (p6)	br.dptk		.Lb01			C			B
97  (p7)	br.dptk		.Lb10			C			B
98  (p8)	br.dptk		.Lb11			C			B
99}
101
C Feed-in for n mod 4 == 0.  Loads the next three limb pairs while forming
C w3 from the first pair (r10, r11).  n becomes (n - 4) >> 2.  With p15 set
C (n > 4) control goes to .grt4.  Otherwise the code below finishes the
C remaining limbs without entering the main loop and joins the wind-down
C code at .Lcj4.
102.Lb00:	ld8		v0 = [vp], 8		C			M01
103	ld8		u0 = [up], 8		C			M01
104	shr.u		n = n, 2		C			I0
105	;;
106	ld8		v1 = [vp], 8		C			M01
107	ld8		u1 = [up], 8		C			M01
108	ADDSUB		w3 = r10, r11		C			M I
109	;;
110	ld8		v2 = [vp], 8		C			M01
111	ld8		u2 = [up], 8		C			M01
112  (p15)	br.dpnt		.grt4			C			B
113	;;
114
C Carry chain pattern used here and in every other path:
C   cmp.PRED pN = w, u        pN = carry or borrow out of this limb
C   (pM) cmp.eq.or pN = LIM, w   with carry-in pM, also set pN if w equals LIM
C   (pM) add w = INCR, w      apply the carry-in to w
C The cmp.eq.or and the add on the same w are placed in one instruction
C group, so the compare is expected to see w before the adjustment.
C r8 = low bit of the first result limb, i.e. the bit dropped by the one-bit
C right shift.  NOTE(review): presumably this is the function return value,
C r8 being the usual IA-64 integer return register -- confirm.
115	cmp.PRED	p7, p0 = w3, r10	C			M I
116	and		r8 = 1, w3		C			M I
117	ADDSUB		w0 = u0, v0		C			M I
118	;;
119	cmp.PRED	p8, p0 = w0, u0		C			M I
120	ADDSUB		w1 = u1, v1		C			M I
121	;;
122	cmp.PRED	p9, p0 = w1, u1		C			M I
123   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
124   (p7)	add		w0 = INCR, w0		C			M I
125	;;
C x3 = low 64 bits of the pair w0:w3 shifted right by one, i.e. the first
C result limb.
126	shrp		x3 = w0, w3, 1		C			I0
127	ADDSUB		w2 = u2, v2		C			M I
128   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
129   (p8)	add		w1 = INCR, w1		C			M I
130	br		.Lcj4			C			B
131
C n > 4: keep loading ahead of the arithmetic, set
C ar.lc = ((n - 4) >> 2) - 1 and enter the main loop at .LL00.
132.grt4:	ld8		v3 = [vp], 8		C			M01
133	cmp.PRED	p7, p0 = w3, r10	C			M I
134	ld8		u3 = [up], 8		C			M01
135	and		r8 = 1, w3		C			M I
136	;;
137	ADDSUB		w0 = u0, v0		C			M I
138	ld8		v0 = [vp], 8		C			M01
139	add		n = -1, n
140	;;
141	cmp.PRED	p8, p0 = w0, u0		C			M I
142	ld8		u0 = [up], 8		C			M01
143	ADDSUB		w1 = u1, v1		C			M I
144	;;
145	ld8		v1 = [vp], 8		C			M01
146	mov.i		ar.lc = n		C			I0
147	cmp.PRED	p9, p0 = w1, u1		C			M I
148	ld8		u1 = [up], 8		C			M01
149   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
150   (p7)	add		w0 = INCR, w0		C			M I
151	;;
152	ADDSUB		w2 = u2, v2		C			M I
153	ld8		v2 = [vp], 8		C			M01
154	shrp		x3 = w0, w3, 1		C			I0
155   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
156   (p8)	add		w1 = INCR, w1		C			M I
157	br		.LL00			C			B
158
159
C Feed-in for n mod 4 == 1.  w2 is formed from the first limb pair
C (r10, r11) and n becomes (n - 4) >> 2.  With p15 set (n > 4) control goes
C to .grt1.  Otherwise the single result limb is finished right here:
C x2 = w2 >> 1, bit 63 of x2 is set when p6 signals carry or borrow out,
C r8 = low bit of w2, and .Lcj1 stores x2 and returns.
160.Lb01:	ADDSUB		w2 = r10, r11		C			M I
161	shr.u		n = n, 2		C			I0
162  (p15)	br.dpnt		.grt1			C			B
163	;;
164
165	cmp.PRED	p6, p7 = w2, r10	C			M I
166	shr.u		x2 = w2, 1		C			I0
167	and		r8 = 1, w2		C			M I
168	;;
169   (p6)	dep		x2 = -1, x2, 63, 1	C			I0
170	br		.Lcj1			C			B
171
C n > 4: load four more limb pairs and set ar.lc = (n - 4) >> 2.  The
C br.cloop below goes to .grt5 (decrementing ar.lc) while ar.lc is nonzero.
C When it falls through there is nothing more to load, and the code joins the
C wind-down at .Lcj5.
172.grt1:	ld8		v3 = [vp], 8		C			M01
173	ld8		u3 = [up], 8		C			M01
174	;;
175	ld8		v0 = [vp], 8		C			M01
176	ld8		u0 = [up], 8		C			M01
177	mov.i		ar.lc = n		C FIXME swap with next	I0
178	;;
179	ld8		v1 = [vp], 8		C			M01
180	ld8		u1 = [up], 8		C			M01
181	;;
182	ld8		v2 = [vp], 8		C			M01
183	ld8		u2 = [up], 8		C			M01
184	cmp.PRED	p6, p0 = w2, r10	C			M I
185	and		r8 = 1, w2		C			M I
186	ADDSUB		w3 = u3, v3		C			M I
187	br.cloop.dptk	.grt5			C			B
188	;;
189
190	cmp.PRED	p7, p0 = w3, u3		C			M I
191	;;
192	ADDSUB		w0 = u0, v0		C			M I
193   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
194   (p6)	add		w3 = INCR, w3		C			M I
195	;;
196	cmp.PRED	p8, p0 = w0, u0		C			M I
197	shrp		x2 = w3, w2, 1		C			I0
198	ADDSUB		w1 = u1, v1		C			M I
199	;;
200	cmp.PRED	p9, p0 = w1, u1		C			M I
201   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
202   (p7)	add		w0 = INCR, w0		C			M I
203	br		.Lcj5			C			B
204
C Same arithmetic as the fall-through above, interleaved with the loads for
C the next group of limbs; enters the main loop at .LL01.
205.grt5:	ld8		v3 = [vp], 8		C			M01
206	cmp.PRED	p7, p0 = w3, u3		C			M I
207	ld8		u3 = [up], 8		C			M01
208	;;
209	ADDSUB		w0 = u0, v0		C			M I
210	ld8		v0 = [vp], 8		C			M01
211   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
212   (p6)	add		w3 = INCR, w3		C			M I
213	;;
214	cmp.PRED	p8, p0 = w0, u0		C			M I
215	shrp		x2 = w3, w2, 1		C			I0
216	ld8		u0 = [up], 8		C			M01
217	ADDSUB		w1 = u1, v1		C			M I
218	;;
219	ld8		v1 = [vp], 8		C			M01
220	cmp.PRED	p9, p0 = w1, u1		C			M I
221	ld8		u1 = [up], 8		C			M01
222   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
223   (p7)	add		w0 = INCR, w0		C			M I
224	br		.LL01			C			B
225
226
C Feed-in for n mod 4 == 2.  Loads the second limb pair, forms w1 from the
C first pair (r10, r11), and n becomes (n - 4) >> 2.  With p15 set (n > 4)
C control goes to .grt2.  Otherwise both result limbs are prepared here
C (x1 from the pair w2:w1, x2 = w2 >> 1) and .Lcj2 stores them; p6 carries
C the carry or borrow out of the top limb for the dep at .Lcj2.
227.Lb10:	ld8		v2 = [vp], 8		C			M01
228	ld8		u2 = [up], 8		C			M01
229	shr.u		n = n, 2		C			I0
230	ADDSUB		w1 = r10, r11		C			M I
231  (p15)	br.dpnt		.grt2			C			B
232	;;
233
234	cmp.PRED	p9, p0 = w1, r10	C			M I
235	and		r8 = 1, w1		C			M I
236	ADDSUB		w2 = u2, v2		C			M I
237	;;
238	cmp.PRED	p6, p0 = w2, u2		C			M I
239	;;
240   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
241   (p9)	add		w2 = INCR, w2		C			M I
242	;;
243	shrp		x1 = w2, w1, 1		C			I0
244	shr.u		x2 = w2, 1		C			I0
245	br		.Lcj2			C			B
246
C n > 4: load four more limb pairs and set ar.lc = (n - 4) >> 2.  The
C br.cloop below goes to .grt6 (decrementing ar.lc) while ar.lc is nonzero.
C When it falls through the code joins the wind-down at .Lcj6.
247.grt2:	ld8		v3 = [vp], 8		C			M01
248	ld8		u3 = [up], 8		C			M01
249	;;
250	ld8		v0 = [vp], 8		C			M01
251	ld8		u0 = [up], 8		C			M01
252	mov.i		ar.lc = n		C			I0
253	;;
254	ld8		v1 = [vp], 8		C			M01
255	cmp.PRED	p9, p0 = w1, r10	C			M I
256	ld8		u1 = [up], 8		C			M01
257	and		r8 = 1, w1		C			M I
258	;;
259	ADDSUB		w2 = u2, v2		C			M I
260	ld8		v2 = [vp], 8		C			M01
261	;;
262	cmp.PRED	p6, p0 = w2, u2		C			M I
263	ld8		u2 = [up], 8		C			M01
264	ADDSUB		w3 = u3, v3		C			M I
265	br.cloop.dptk	.grt6			C			B
266	;;
267
268	cmp.PRED	p7, p0 = w3, u3		C			M I
269   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
270   (p9)	add		w2 = INCR, w2		C			M I
271	;;
272	shrp		x1 = w2, w1, 1		C			I0
273	ADDSUB		w0 = u0, v0		C			M I
274   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
275   (p6)	add		w3 = INCR, w3		C			M I
276	br		.Lcj6			C			B
277
C Same arithmetic as the fall-through above, interleaved with the loads for
C the next group of limbs; enters the main loop at .LL10.
278.grt6:	ld8		v3 = [vp], 8		C			M01
279	cmp.PRED	p7, p0 = w3, u3		C			M I
280	ld8		u3 = [up], 8		C			M01
281   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
282   (p9)	add		w2 = INCR, w2		C			M I
283	;;
284	shrp		x1 = w2, w1, 1		C			I0
285	ADDSUB		w0 = u0, v0		C			M I
286	ld8		v0 = [vp], 8		C			M01
287   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
288   (p6)	add		w3 = INCR, w3		C			M I
289	br		.LL10			C			B
290
291
C Feed-in for n mod 4 == 3.  Loads the second and third limb pairs, forms w0
C from the first pair (r10, r11), and n becomes (n - 4) >> 2.  With p15 set
C (n > 4) control goes to .grt3.  Otherwise the three limbs are carried
C through here and the code joins the wind-down at .Lcj3 with x0 ready to be
C stored and p6 holding the carry or borrow out of the top limb.
292.Lb11:	ld8		v1 = [vp], 8		C			M01
293	ld8		u1 = [up], 8		C			M01
294	shr.u		n = n, 2		C			I0
295	;;
296	ld8		v2 = [vp], 8		C			M01
297	ld8		u2 = [up], 8		C			M01
298	ADDSUB		w0 = r10, r11		C			M I
299  (p15)	br.dpnt		.grt3			C			B
300	;;
301
302	cmp.PRED	p8, p0 = w0, r10	C			M I
303	ADDSUB		w1 = u1, v1		C			M I
304	and		r8 = 1, w0		C			M I
305	;;
306	cmp.PRED	p9, p0 = w1, u1		C			M I
307	;;
308	ADDSUB		w2 = u2, v2		C			M I
309   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
310   (p8)	add		w1 = INCR, w1		C			M I
311	;;
312	cmp.PRED	p6, p0 = w2, u2		C			M I
313	shrp		x0 = w1, w0, 1		C			I0
314	;;
315   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
316   (p9)	add		w2 = INCR, w2		C			M I
317	br		.Lcj3			C			B
318
C n > 4: load four more limb pairs and set ar.lc = (n - 4) >> 2.  The
C br.cloop below goes to .grt7 (decrementing ar.lc) while ar.lc is nonzero.
C When it falls through the code joins the wind-down at .Lcj7.
319.grt3:	ld8		v3 = [vp], 8		C			M01
320	ld8		u3 = [up], 8		C			M01
321	;;
322	ld8		v0 = [vp], 8		C			M01
323	mov.i		ar.lc = n		C			I0
324	cmp.PRED	p8, p0 = w0, r10	C			M I
325	ld8		u0 = [up], 8		C			M01
326	ADDSUB		w1 = u1, v1		C			M I
327	and		r8 = 1, w0		C			M I
328	;;
329	ld8		v1 = [vp], 8		C			M01
330	cmp.PRED	p9, p0 = w1, u1		C			M I
331	ld8		u1 = [up], 8		C			M01
332	;;
333	ADDSUB		w2 = u2, v2		C			M I
334	ld8		v2 = [vp], 8		C			M01
335   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
336   (p8)	add		w1 = INCR, w1		C			M I
337	;;
338	cmp.PRED	p6, p0 = w2, u2		C			M I
339	shrp		x0 = w1, w0, 1		C			I0
340	ld8		u2 = [up], 8		C			M01
341	ADDSUB		w3 = u3, v3		C			M I
342	br.cloop.dptk	.grt7			C			B
343	;;
344
345	cmp.PRED	p7, p0 = w3, u3		C			M I
346   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
347   (p9)	add		w2 = INCR, w2		C			M I
348	br		.Lcj7			C			B
349
C Same arithmetic as the fall-through above plus the loads for the next
C group of limbs; enters the main loop at .LL11.
350.grt7:	ld8		v3 = [vp], 8		C			M01
351	cmp.PRED	p7, p0 = w3, u3		C			M I
352	ld8		u3 = [up], 8		C			M01
353   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
354   (p9)	add		w2 = INCR, w2		C			M I
355	br		.LL11			C			B
356
357
358C *** MAIN LOOP START ***
C One pass handles four limbs: four stores (x3, x0, x1, x2 in that order),
C four load pairs from up and vp, four ADDSUB operations and four shrp
C operations.  The carry or borrow travels through p7, p8, p9, p6 using the
C same cmp.PRED / cmp.eq.or LIM / add INCR pattern as the feed-in code.
C .LL11, .LL10, .LL01 and .LL00 are the points where the four feed-in paths
C jump in.  br.cloop repeats the loop while ar.lc is nonzero, decrementing it.
359	ALIGN(32)
360.Loop:	st8		[rp] = x3, 8		C			M23
361	ld8		v3 = [vp], 8		C			M01
362	cmp.PRED	p7, p0 = w3, u3		C			M I
363	ld8		u3 = [up], 8		C			M01
364   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
365   (p9)	add		w2 = INCR, w2		C			M I
366	;;
367.LL11:	st8		[rp] = x0, 8		C			M23
368	shrp		x1 = w2, w1, 1		C			I0
369	ADDSUB		w0 = u0, v0		C			M I
370	ld8		v0 = [vp], 8		C			M01
371   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
372   (p6)	add		w3 = INCR, w3		C			M I
373	;;
374.LL10:	cmp.PRED	p8, p0 = w0, u0		C			M I
375	shrp		x2 = w3, w2, 1		C			I0
376	nop.b		0
377	ld8		u0 = [up], 8		C			M01
378	ADDSUB		w1 = u1, v1		C			M I
379	nop.b		0
380	;;
381	st8		[rp] = x1, 8		C			M23
382	ld8		v1 = [vp], 8		C			M01
383	cmp.PRED	p9, p0 = w1, u1		C			M I
384	ld8		u1 = [up], 8		C			M01
385   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
386   (p7)	add		w0 = INCR, w0		C			M I
387	;;
388.LL01:	st8		[rp] = x2, 8		C			M23
389	shrp		x3 = w0, w3, 1		C			I0
390	ADDSUB		w2 = u2, v2		C			M I
391	ld8		v2 = [vp], 8		C			M01
392   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
393   (p8)	add		w1 = INCR, w1		C			M I
394	;;
395.LL00:	cmp.PRED	p6, p0 = w2, u2		C			M I
396	shrp		x0 = w1, w0, 1		C			I0
397	nop.b		0
398	ld8		u2 = [up], 8		C			M01
399	ADDSUB		w3 = u3, v3		C			M I
400	br.cloop.dptk	.Loop			C			B
401	;;
402C *** MAIN LOOP END ***
403
C Wind-down.  Mirrors one pass of the main loop without any loads: the limbs
C already held in u, v and w registers are combined, shifted and stored.
C .Lcj7 down to .Lcj1 are the points where the feed-in paths for short
C operands (and for a loop count of zero) jump in.
404.Lskip:	st8		[rp] = x3, 8		C			M23
405	cmp.PRED	p7, p0 = w3, u3		C			M I
406   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
407   (p9)	add		w2 = INCR, w2		C			M I
408	;;
409.Lcj7:	st8		[rp] = x0, 8		C			M23
410	shrp		x1 = w2, w1, 1		C			I0
411	ADDSUB		w0 = u0, v0		C			M I
412   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
413   (p6)	add		w3 = INCR, w3		C			M I
414	;;
415.Lcj6:	cmp.PRED	p8, p0 = w0, u0		C			M I
416	shrp		x2 = w3, w2, 1		C			I0
417	ADDSUB		w1 = u1, v1		C			M I
418	;;
419	st8		[rp] = x1, 8		C			M23
420	cmp.PRED	p9, p0 = w1, u1		C			M I
421   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
422   (p7)	add		w0 = INCR, w0		C			M I
423	;;
424.Lcj5:	st8		[rp] = x2, 8		C			M23
425	shrp		x3 = w0, w3, 1		C			I0
426	ADDSUB		w2 = u2, v2		C			M I
427   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
428   (p8)	add		w1 = INCR, w1		C			M I
429	;;
430.Lcj4:	cmp.PRED	p6, p0 = w2, u2		C			M I
431	shrp		x0 = w1, w0, 1		C			I0
432	;;
433	st8		[rp] = x3, 8		C			M23
434   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
435   (p9)	add		w2 = INCR, w2		C			M I
436	;;
C w2 is the top result limb.  x1 takes its low bit as the top bit of the
C limb below, and x2 = w2 >> 1 is the top limb before the carry is merged.
437.Lcj3:	st8		[rp] = x0, 8		C			M23
438	shrp		x1 = w2, w1, 1		C			I0
439	shr.u		x2 = w2, 1		C			I0
440	;;
C With p6 set (carry or borrow out of the top limb) bit 63 of x2 is set.
441.Lcj2:	st8		[rp] = x1, 8		C			M23
442   (p6)	dep		x2 = -1, x2, 63, 1	C			I0
443	;;
C Store the top limb (no post-increment), restore the ar.lc value saved in
C r2 at entry, and return through b0.  r8 still holds the low bit computed
C by the feed-in code.
444.Lcj1:	st8		[rp] = x2		C			M23
445	mov.i		ar.lc = r2		C			I0
446	br.ret.sptk.many b0			C			B
447EPILOGUE()
448