xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/lorrshift.asm (revision c38e7cc395b1472a774ff828e46123de44c628e9)
1dnl  IA-64 mpn_lshift/mpn_rshift.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2000-2005 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C           cycles/limb
36C Itanium:      2
37C Itanium 2:    1
38
39C This code is scheduled deeply since the plain shift instructions shr and shl
40C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
41C these instructions causes a 10 cycle replay trap on Itanium.
42
43C The ld8 scheduling should probably be decreased to make the function smaller.
44C Good lfetch  will make sure we never stall anyway.
45
46C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
47C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
48C in the prologue.
49
50
51C INPUT PARAMETERS
52define(`rp', `r32')		C limbs are stored through this pointer
53define(`up', `r33')		C limbs are loaded through this pointer
54define(`n',  `r34')		C number of limbs
55define(`cnt',`r35')		C shift count in bits
56
57define(`tnc',`r9')		C set to 64 - cnt at function entry
58
C Per-operation settings.  FSH is the shift in the requested direction and is
C applied with count cnt; BSH is the opposite shift and is applied with count
C tnc.  UPD is the post-increment in bytes for the up and rp pointers, POFF
C the initial offset of the lfetch pointer from up, and PUPD the step of the
C lfetch pointer.  For lshift the pointer steps are negative.
59ifdef(`OPERATION_lshift',`
60	define(`FSH',`shl')
61	define(`BSH',`shr.u')
62	define(`UPD',`-8')
63	define(`POFF',`-512')
64	define(`PUPD',`-32')
65	define(`func',`mpn_lshift')
66')
67ifdef(`OPERATION_rshift',`
68	define(`FSH',`shr.u')
69	define(`BSH',`shl')
70	define(`UPD',`8')
71	define(`POFF',`512')
72	define(`PUPD',`32')
73	define(`func',`mpn_rshift')
74')
75
76MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
77
78ASM_START()
C mpn_lshift / mpn_rshift entry point; the OPERATION_* symbol selects which.
C
C Inputs:  rp (r32), up (r33), n (r34), cnt (r35).
C Output:  r8, the first limb loaded shifted with BSH by tnc, that is the
C          bits that do not fit in the n result limbs.
C
C Register use in this function:
C   r2            copy of ar.lc taken at entry, written back before return
C   r9            tnc = 64 - cnt
C   r10, r16-r19  limbs loaded from up
C   r20-r27       FSH and BSH partial results, combined pairwise with or
C   r14, r15      combined result limbs waiting to be stored
C   r22           also holds the final result limb, stored at .Lr1
C   r11           lfetch pointer
C   p14, p15      n > 4, n <= 4
C   p6, p7, p8    n mod 4 is 1, 2, 3
79PROLOGUE(func)
80	.prologue
81	.save	ar.lc, r2
82	.body
C With the 32-bit ABI the pointer and integer arguments are widened first.
83ifdef(`HAVE_ABI_32',
84`	addp4	rp = 0, rp		C			M I
85	addp4	up = 0, up		C		M I
86	sxt4	n = n			C		M I
87	nop.m		0
88	nop.m		0
89	zxt4	cnt = cnt		C		I
90	;;
91')
92
C Setup.  r14 = n mod 4 drives the dispatch below, ar.lc is set to
C (n - 5) >> 2 using an unsigned shift, and tnc = 64 - cnt.  For lshift, up
C and rp are advanced by (n - 1) * 8 bytes so that limbs are handled from the
C far end downwards, matching the negative UPD.  r11 = up + POFF becomes the
C lfetch pointer and r10 receives the first limb.
C Dispatch: n mod 4 = 1 -> .Lb01, 2 -> .Lb10, 3 -> .Lb11, 0 falls into .Lb00.
93 {.mmi;	cmp.lt	p14, p15 = 4, n		C		M I
94	and	r14 = 3, n		C		M I
95	mov.i	r2 = ar.lc		C		I0
96}{.mmi;	add	r15 = -1, n		C		M I
97	sub	tnc = 64, cnt		C		M I
98	add	r16 = -5, n
99	;;
100}{.mmi;	cmp.eq	p6, p0 = 1, r14		C		M I
101	cmp.eq	p7, p0 = 2, r14		C		M I
102	shr.u	n = r16, 2		C		I0
103}{.mmi;	cmp.eq	p8, p0 = 3, r14		C		M I
104ifdef(`OPERATION_lshift',
105`	shladd	up = r15, 3, up		C		M I
106	shladd	rp = r15, 3, rp')	C		M I
107	;;
108}{.mmi;	add	r11 = POFF, up		C		M I
109	ld8	r10 = [up], UPD		C		M01
110	mov.i	ar.lc = n		C		I0
111}{.bbb;
112   (p6)	br.dptk	.Lb01
113   (p7)	br.dptk	.Lb10
114   (p8)	br.dptk	.Lb11
115	;; }
116
C n mod 4 = 0.  Three more limbs are loaded and r8 is formed.  With n = 4
C (p14 clear) the remaining shifts are done straight-line and the tail is
C joined at .Lr4; larger n continues at .grt4.
117.Lb00:	ld8	r19 = [up], UPD
118	;;
119	ld8	r16 = [up], UPD
120	;;
121	ld8	r17 = [up], UPD
122	BSH	r8 = r10, tnc		C function return value
123	;;
124	FSH	r24 = r10, cnt
125	BSH	r25 = r19, tnc
126  (p14)	br.cond.dptk	.grt4
127	;;
128	FSH	r26 = r19, cnt
129	BSH	r27 = r16, tnc
130	;;
131	FSH	r20 = r16, cnt
132	BSH	r21 = r17, tnc
133	;;
134	or	r14 = r25, r24
135	FSH	r22 = r17, cnt
C NOTE(review): r23 is not read anywhere from .Lr4 onwards, so the next
C instruction looks redundant on this path -- confirm before touching it.
136	BSH	r23 = r10, tnc
137	br	.Lr4
138
C n mod 4 = 0 and n >= 8.  Four more limbs are loaded; the loop is entered at
C .Ltop when ar.lc is nonzero, otherwise the wind-down code at .Lbot is used.
139.grt4:	ld8	r18 = [up], UPD
140	FSH	r26 = r19, cnt
141	BSH	r27 = r16, tnc
142	;;
143	ld8	r19 = [up], UPD
144	FSH	r20 = r16, cnt
145	BSH	r21 = r17, tnc
146	;;
147	ld8	r16 = [up], UPD
148	FSH	r22 = r17, cnt
149	BSH	r23 = r18, tnc
150	;;
151	or	r14 = r25, r24
152	ld8	r17 = [up], UPD
153	br.cloop.dpnt	.Ltop
154	br	.Lbot
155
C n mod 4 = 1.  With n = 1 (p15 set) only the return value and the single
C result limb r22 are computed before the final store at .Lr1.
156.Lb01:
157  (p15)	BSH	r8 = r10, tnc		C function return value	I
158  (p15)	FSH	r22 = r10, cnt		C		I
159  (p15)	br.cond.dptk	.Lr1		C return	B
160
C n mod 4 = 1 and n >= 5.  With n = 5 (ar.lc zero) the tail is joined at
C .Lr5; larger n continues at .grt5.
161.grt1:	ld8	r18 = [up], UPD
162	;;
163	ld8	r19 = [up], UPD
164	BSH	r8 = r10, tnc		C function return value
165	;;
166	ld8	r16 = [up], UPD
167	FSH	r22 = r10, cnt
168	BSH	r23 = r18, tnc
169	;;
170	ld8	r17 = [up], UPD
171	FSH	r24 = r18, cnt
172	BSH	r25 = r19, tnc
173	br.cloop.dpnt	.grt5
174	;;
175	or	r15 = r23, r22
176	FSH	r26 = r19, cnt
177	BSH	r27 = r16, tnc
178	;;
179	FSH	r20 = r16, cnt
180	BSH	r21 = r17, tnc
181	br	.Lr5
182
C n mod 4 = 1 and n >= 9: enter the loop at .LL01.
183.grt5:	ld8	r18 = [up], UPD
184	FSH	r26 = r19, cnt
185	BSH	r27 = r16, tnc
186	;;
187	ld8	r19 = [up], UPD
188	FSH	r20 = r16, cnt
189	BSH	r21 = r17, tnc
190	;;
191	or	r15 = r23, r22
192	ld8	r16 = [up], UPD
193	br	.LL01
194
195
C n mod 4 = 2.  With n = 2 (p14 clear) the tail is joined at .Lr2; larger n
C continues at .grt2.
196.Lb10:	ld8	r17 = [up], UPD
197  (p14)	br.cond.dptk	.grt2
198
199	BSH	r8 = r10, tnc		C function return value
200	;;
201	FSH	r20 = r10, cnt
202	BSH	r21 = r17, tnc
203	;;
204	or	r14 = r21, r20
205	FSH	r22 = r17, cnt
206	br	.Lr2			C return
207
C n mod 4 = 2 and n >= 6.  With n = 6 (ar.lc zero) the tail is joined at
C .Lr6; larger n continues at .grt6.
208.grt2:	ld8	r18 = [up], UPD
209	BSH	r8 = r10, tnc		C function return value
210	;;
211	ld8	r19 = [up], UPD
212	FSH	r20 = r10, cnt
213	BSH	r21 = r17, tnc
214	;;
215	ld8	r16 = [up], UPD
216	FSH	r22 = r17, cnt
217	BSH	r23 = r18, tnc
218	;;
219 {.mmi;	ld8	r17 = [up], UPD
220	or	r14 = r21, r20
221	FSH	r24 = r18, cnt
222}{.mib;	nop	0
223	BSH	r25 = r19, tnc
224	br.cloop.dpnt	.grt6
225	;; }
226
227	FSH	r26 = r19, cnt
228	BSH	r27 = r16, tnc
229	br	.Lr6
230
C n mod 4 = 2 and n >= 10: enter the loop at .LL10.
231.grt6:	ld8	r18 = [up], UPD
232	FSH	r26 = r19, cnt
233	BSH	r27 = r16, tnc
234	;;
235	ld8	r19 = [up], UPD
236	br	.LL10
237
238
C n mod 4 = 3.  With n = 3 (p14 clear) the tail is joined at .Lr3; larger n
C continues at .grt3.
239.Lb11:	ld8	r16 = [up], UPD
240	;;
241	ld8	r17 = [up], UPD
242	BSH	r8 = r10, tnc		C function return value
243  (p14)	br.cond.dptk	.grt3
244	;;
245
246	FSH	r26 = r10, cnt
247	BSH	r27 = r16, tnc
248	;;
249	FSH	r20 = r16, cnt
250	BSH	r21 = r17, tnc
251	;;
252	or	r15 = r27, r26
253	FSH	r22 = r17, cnt
254	br	.Lr3			C return
255
C n mod 4 = 3 and n >= 7.  With n = 7 (ar.lc zero) the tail is joined at
C .Lr7; larger n continues at .grt7.
256.grt3:	ld8	r18 = [up], UPD
257	FSH	r26 = r10, cnt
258	BSH	r27 = r16, tnc
259	;;
260	ld8	r19 = [up], UPD
261	FSH	r20 = r16, cnt
262	BSH	r21 = r17, tnc
263	;;
264	ld8	r16 = [up], UPD
265	FSH	r22 = r17, cnt
266	BSH	r23 = r18, tnc
267	;;
268	ld8	r17 = [up], UPD
269	br.cloop.dpnt	.grt7
270
271	or	r15 = r27, r26
272	FSH	r24 = r18, cnt
273	BSH	r25 = r19, tnc
274	br	.Lr7
275
C n mod 4 = 3 and n >= 11: enter the loop at .LL11.  The old r18 is consumed
C by the FSH below before the ld8 replaces it.
276.grt7:	or	r15 = r27, r26
277	FSH	r24 = r18, cnt
278	BSH	r25 = r19, tnc
279	ld8	r18 = [up], UPD
280	br	.LL11
281
C Each pass through the loop stores four result limbs, loads four source
C limbs and issues one lfetch through r11.  .LL11, .LL10 and .LL01 are the
C mid-loop entry points used by the feed-in code above.
282C *** MAIN LOOP START ***
283	ALIGN(32)
284.Ltop:
285 {.mmi;	st8	[rp] = r14, UPD		C M2
286	or	r15 = r27, r26		C M3
287	FSH	r24 = r18, cnt		C I0
288}{.mmi;	ld8	r18 = [up], UPD		C M1
289	lfetch	[r11], PUPD
290	BSH	r25 = r19, tnc		C I1
291	;; }
292.LL11:
293 {.mmi;	st8	[rp] = r15, UPD
294	or	r14 = r21, r20
295	FSH	r26 = r19, cnt
296}{.mmi;	ld8	r19 = [up], UPD
297	nop.m	0
298	BSH	r27 = r16, tnc
299	;; }
300.LL10:
301 {.mmi;	st8	[rp] = r14, UPD
302	or	r15 = r23, r22
303	FSH	r20 = r16, cnt
304}{.mmi;	ld8	r16 = [up], UPD
305	nop.m	0
306	BSH	r21 = r17, tnc
307	;; }
308.LL01:
309 {.mmi;	st8	[rp] = r15, UPD
310	or	r14 = r25, r24
311	FSH	r22 = r17, cnt
312}{.mib;	ld8	r17 = [up], UPD
313	BSH	r23 = r18, tnc
314	br.cloop.dptk	.Ltop
315	;; }
316C *** MAIN LOOP END ***
317
C Wind-down: no further loads.  The limbs already held in registers are
C shifted, combined and stored.  .Lbot stores eight limbs in total, and each
C .LrK label is entered with K limbs still to be stored.  After the last
C store ar.lc is written back from r2 and the function returns.
318.Lbot:
319 {.mmi;	st8	[rp] = r14, UPD
320	or	r15 = r27, r26
321	FSH	r24 = r18, cnt
322}{.mib;	nop	0
323	BSH	r25 = r19, tnc
324	nop	0
325	;; }
326.Lr7:
327 {.mmi;	st8	[rp] = r15, UPD
328	or	r14 = r21, r20
329	FSH	r26 = r19, cnt
330}{.mib;	nop	0
331	BSH	r27 = r16, tnc
332	nop	0
333	;; }
334.Lr6:
335 {.mmi;	st8	[rp] = r14, UPD
336	or	r15 = r23, r22
337	FSH	r20 = r16, cnt
338}{.mib;	nop	0
339	BSH	r21 = r17, tnc
340	nop	0
341	;; }
342.Lr5:	st8	[rp] = r15, UPD
343	or	r14 = r25, r24
344	FSH	r22 = r17, cnt
345	;;
346.Lr4:	st8	[rp] = r14, UPD
347	or	r15 = r27, r26
348	;;
349.Lr3:	st8	[rp] = r15, UPD
350	or	r14 = r21, r20
351	;;
352.Lr2:	st8	[rp] = r14, UPD
353	;;
354.Lr1:	st8	[rp] = r22, UPD		C		M23
355	mov	ar.lc = r2		C		I0
356	br.ret.sptk.many b0		C		B
357EPILOGUE(func)
358ASM_END()
359