xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/lshiftc.asm (revision f89f6560d453f5e37386cc7938c072d2f528b9fa)
1dnl  IA-64 mpn_lshiftc.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2010 Free Software
6dnl  Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of the GNU Lesser General Public License as published
12dnl  by the Free Software Foundation; either version 3 of the License, or (at
13dnl  your option) any later version.
14
15dnl  The GNU MP Library is distributed in the hope that it will be useful, but
16dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
18dnl  License for more details.
19
20dnl  You should have received a copy of the GNU Lesser General Public License
21dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
22
23include(`../config.m4')
24
25C           cycles/limb
26C Itanium:      ?
27C Itanium 2:    1.25
28
29C This code is scheduled deeply since the plain shift instructions shr and shl
30C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
31C these instructions causes a 10 cycle replay trap on Itanium.
32
33C The ld8 scheduling should probably be decreased to make the function smaller.
34C Good lfetch  will make sure we never stall anyway.
35
36C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
37C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
38C in the prologue.
39
40
41C INPUT PARAMETERS
C Names for the four argument registers r32 to r35.
42define(`rp', `r32')
43define(`up', `r33')
44define(`n',  `r34')
45define(`cnt',`r35')
46
C Register that receives 64 - cnt at function entry.
47define(`tnc',`r9')
48
C Symbolic names used by the code below:
C   FSH   shift instruction applied with cnt (shl)
C   BSH   shift instruction applied with tnc (shr.u)
C   UPD   post-increment in bytes on every ld8 and st8 (-8, so the limbs
C         are accessed at decreasing addresses)
C   POFF  offset added to up to form the initial lfetch pointer r11
C   PUPD  post-increment in bytes applied by each lfetch
C   func  name handed to EPILOGUE
49define(`FSH',`shl')
50define(`BSH',`shr.u')
51define(`UPD',`-8')
52define(`POFF',`-512')
53define(`PUPD',`-32')
54define(`func',`mpn_lshiftc')
55
C mpn_lshiftc: entry and dispatch.
C
C Arguments arrive in rp (r32), up (r33), n (r34) and cnt (r35).  This section
C prepares the state shared by all the paths below:
C   r2      saved copy of ar.lc, restored just before the return
C   tnc     64 - cnt, the shift count used with BSH
C   r14     n & 3, used only to pick the entry path
C   r15     n - 1
C   n       (n - 1) >> 2, copied into ar.lc as the br.cloop counter
C   up, rp  each advanced by 8*(n-1) bytes; limbs are then accessed with a
C           post-increment of UPD, i.e. at decreasing addresses
C   r10     first limb loaded through up
C   r11     up + POFF, the pointer used by lfetch at the bottom of the loop
56ASM_START()
57PROLOGUE(mpn_lshiftc)
58	.prologue
59	.save	ar.lc, r2
60	.body
C With HAVE_ABI_32 defined, the two pointer arguments are converted with
C addp4, n is sign-extended and cnt is zero-extended from 32 bits.
61ifdef(`HAVE_ABI_32',
62`	addp4	rp = 0, rp		C				M I
63	addp4	up = 0, up		C				M I
64	sxt4	n = n			C				M I
65	zxt4	cnt = cnt		C				I
66	;;
67')
68
69 {.mmi;	nop	0			C				M I
70	and	r14 = 3, n		C				M I
71	mov.i	r2 = ar.lc		C				I0
72}{.mmi;	add	r15 = -1, n		C				M I
73	sub	tnc = 64, cnt		C				M I
74	nop	0
75	;;
76}{.mmi;	cmp.eq	p6, p0 = 1, r14		C				M I
77	cmp.eq	p7, p0 = 2, r14		C				M I
78	shr.u	n = r15, 2		C				I0
79}{.mmi;	cmp.eq	p8, p0 = 3, r14		C				M I
80	shladd	up = r15, 3, up		C				M I
81	shladd	rp = r15, 3, rp		C				M I
82	;;
83}{.mmi;	add	r11 = POFF, up		C				M I
84	ld8	r10 = [up], UPD		C				M01
85	mov.i	ar.lc = n		C				I0
C Dispatch on n mod 4: p6, p7, p8 are set for remainders 1, 2, 3.  With
C none of them set (remainder 0) execution falls through to .Lb00.
86}{.bbb;
87   (p6)	br.dptk	.Lb01
88   (p7)	br.dptk	.Lb10
89   (p8)	br.dptk	.Lb11
90	;; }
91
C .Lb00: fall-through target of the dispatch, i.e. n mod 4 is 0.
C
C Loads three more limbs (r19, r16, r17) after r10 and sets r8 = r10 >> tnc.
C r8 is never stored; presumably it is the function return value (confirm
C against the calling convention).  br.cloop then branches on ar.lc, which
C holds (n - 1) >> 2:
C   not taken (n = 4)                 finish here, join the wind-down at .Lr4
C   L(gt4), next br.cloop not taken   (n = 8) join the wind-down at .Lr8
C   L(gt8) (n >= 12)                  enter the main loop at .LL00
C In every path an FSH result (limb << cnt) is ORed with the BSH result of
C the next loaded limb (limb >> tnc) into r14 or r15, and "sub r31 = -1, x"
C forms the ones complement of x, which is the value that gets stored.
92.Lb00:
93	ld8	r19 = [up], UPD
94	;;
95	ld8	r16 = [up], UPD
96	;;
97	ld8	r17 = [up], UPD
98	BSH	r8 = r10, tnc
99	br.cloop.dptk	L(gt4)
100	;;
C n = 4: the four limbs are in r10, r19, r16, r17.  Form all shifted halves,
C combine the first two result limbs and continue at .Lr4.
101	FSH	r24 = r10, cnt
102	BSH	r25 = r19, tnc
103	;;
104	FSH	r26 = r19, cnt
105	BSH	r27 = r16, tnc
106	;;
107	FSH	r20 = r16, cnt
108	BSH	r21 = r17, tnc
109	;;
110	or	r14 = r25, r24
111	FSH	r22 = r17, cnt
112	;;
113	or	r15 = r27, r26
114	sub	r31 = -1, r14
115	br	.Lr4
116
C L(gt4): the first br.cloop was taken.  Same shift sequence as above, with
C each step also loading a further limb into the register just consumed
C (r18, r19, r16, r17).
117L(gt4):
118 {.mmi;	nop	0
119	nop	0
120	FSH	r24 = r10, cnt
121}{.mmi;	ld8	r18 = [up], UPD
122	nop	0
123	BSH	r25 = r19, tnc
124	;; }
125 {.mmi;	nop	0
126	nop	0
127	FSH	r26 = r19, cnt
128}{.mmi;	ld8	r19 = [up], UPD
129	nop	0
130	BSH	r27 = r16, tnc
131	;; }
132 {.mmi;	nop	0
133	nop	0
134	FSH	r20 = r16, cnt
135}{.mmi;	ld8	r16 = [up], UPD
136	nop	0
137	BSH	r21 = r17, tnc
138	;; }
139 {.mmi;	nop	0
140	or	r14 = r25, r24
141	FSH	r22 = r17, cnt
142}{.mib;	ld8	r17 = [up], UPD
143	BSH	r23 = r18, tnc
144	br.cloop.dptk	L(gt8)
145	;; }
C n = 8: no further loads are needed; continue in the wind-down at .Lr8.
146 {.mmi;	nop	0
147	or	r15 = r27, r26
148	FSH	r24 = r18, cnt
149}{.mib;	sub	r31 = -1, r14
150	BSH	r25 = r19, tnc
151	br	.Lr8 }
152
C L(gt8): the second br.cloop was taken.  Perform one more step including
C a load, then enter the main loop at .LL00.
153L(gt8):
154	or	r15 = r27, r26
155	FSH	r24 = r18, cnt
156	ld8	r18 = [up], UPD
157	sub	r31 = -1, r14
158	BSH	r25 = r19, tnc
159	br	.LL00
160
C .Lb01: dispatch target for n mod 4 = 1 (p6 set).
C
C br.cloop branches on ar.lc, which holds (n - 1) >> 2:
C   not taken (n = 1)                 shift the single limb, store at .Lr1
C   L(gt1), next br.cloop not taken   (n = 5) join the wind-down at .Lr5
C   L(gt5) (n >= 9)                   join the loop at L(end)
C r8 = r10 >> tnc is computed on both sides of the first branch and is
C never stored; presumably it is the return value (confirm against the
C calling convention).
161.Lb01:
162	br.cloop.dptk	L(gt1)
163	;;
C n = 1: the result limb is the ones complement of r10 << cnt.
164	BSH	r8 = r10, tnc
165	FSH	r22 = r10, cnt
166	;;
167	sub	r31 = -1, r22
168	br	.Lr1
169	;;
C L(gt1): at least five limbs.  Load four more (r18, r19, r16, r17) while
C starting the shifts on r10 and r18.
170L(gt1):
171	ld8	r18 = [up], UPD
172	BSH	r8 = r10, tnc
173	FSH	r22 = r10, cnt
174	;;
175	ld8	r19 = [up], UPD
176	;;
177	ld8	r16 = [up], UPD
178	;;
179	ld8	r17 = [up], UPD
180	BSH	r23 = r18, tnc
181	br.cloop.dptk	L(gt5)
182	;;
C n = 5: all limbs are loaded.  Finish the shifts, combine the first two
C result limbs (r15, r14), complement r15 and continue at .Lr5.
183	nop	0
184	FSH	r24 = r18, cnt
185	BSH	r25 = r19, tnc
186	;;
187	nop	0
188	FSH	r26 = r19, cnt
189	BSH	r27 = r16, tnc
190	;;
191	or	r15 = r23, r22
192	FSH	r20 = r16, cnt
193	BSH	r21 = r17, tnc
194	;;
195	or	r14 = r25, r24
196	FSH	r22 = r17, cnt
197	sub	r31 = -1, r15
198	br	.Lr5
199
C L(gt5): the second br.cloop was taken.  Same sequence as the n = 5 case,
C with each step also loading a further limb.  Ends by branching to L(end),
C the lfetch and br.cloop at the bottom of the main loop.
200L(gt5):
201 {.mmi;	nop	0
202	nop	0
203	FSH	r24 = r18, cnt
204}{.mmi;	ld8	r18 = [up], UPD
205	nop	0
206	BSH	r25 = r19, tnc
207	;; }
208 {.mmi;	nop	0
209	nop	0
210	FSH	r26 = r19, cnt
211}{.mmi;	ld8	r19 = [up], UPD
212	nop	0
213	BSH	r27 = r16, tnc
214	;; }
215 {.mmi;	nop	0
216	or	r15 = r23, r22
217	FSH	r20 = r16, cnt
218}{.mmi;	ld8	r16 = [up], UPD
219	nop	0
220	BSH	r21 = r17, tnc
221	;; }
222 {.mmi;	or	r14 = r25, r24
223	sub	r31 = -1, r15
224	FSH	r22 = r17, cnt
225}{.mib;	ld8	r17 = [up], UPD
226	BSH	r23 = r18, tnc
227	br	L(end)
228	;; }
229
C .Lb10: dispatch target for n mod 4 = 2 (p7 set).
C
C Loads a second limb into r17, then br.cloop branches on ar.lc, which
C holds (n - 1) >> 2:
C   not taken (n = 2)                 finish here, join the wind-down at .Lr2
C   L(gt2), next br.cloop not taken   (n = 6) join the wind-down at .Lr6
C   L(gt6) (n >= 10)                  enter the main loop at .LL10
C r8 = r10 >> tnc is computed on both sides of the first branch and is
C never stored; presumably it is the return value (confirm against the
C calling convention).
230.Lb10:
231	ld8	r17 = [up], UPD
232	br.cloop.dptk	L(gt2)
233	;;
C n = 2: r14 combines r10 << cnt with r17 >> tnc; r22 = r17 << cnt is left
C for the final store, which .Lr2 complements.
234	BSH	r8 = r10, tnc
235	FSH	r20 = r10, cnt
236	;;
237	BSH	r21 = r17, tnc
238	FSH	r22 = r17, cnt
239	;;
240	or	r14 = r21, r20
241	;;
242	sub	r31 = -1, r14
243	br	.Lr2
244	;;
C L(gt2): at least six limbs.  Load four more (r18, r19, r16, r17) while
C starting the shifts on the limbs already loaded.
245L(gt2):
246	ld8	r18 = [up], UPD
247	BSH	r8 = r10, tnc
248	FSH	r20 = r10, cnt
249	;;
250	ld8	r19 = [up], UPD
251	;;
252	ld8	r16 = [up], UPD
253	BSH	r21 = r17, tnc
254	FSH	r22 = r17, cnt
255	;;
256	ld8	r17 = [up], UPD
257	BSH	r23 = r18, tnc
258	br.cloop.dptk	L(gt6)
259	;;
C n = 6: all limbs are loaded.  Combine the first two result limbs and
C continue at .Lr6.
260	nop	0
261	FSH	r24 = r18, cnt
262	BSH	r25 = r19, tnc
263	;;
264	or	r14 = r21, r20
265	FSH	r26 = r19, cnt
266	BSH	r27 = r16, tnc
267	;;
268 {.mmi;	nop	0
269	or	r15 = r23, r22
270	FSH	r20 = r16, cnt
271}{.mib;	sub	r31 = -1, r14
272	BSH	r21 = r17, tnc
273	br	.Lr6
274	;; }
C L(gt6): the second br.cloop was taken.  Same sequence as the n = 6 case,
C with each step also loading a further limb; then enter the main loop at
C .LL10.
275L(gt6):
276 {.mmi;	nop	0
277	nop	0
278	FSH	r24 = r18, cnt
279}{.mmi;	ld8	r18 = [up], UPD
280	nop	0
281	BSH	r25 = r19, tnc
282	;; }
283 {.mmi; nop   0
284	or	r14 = r21, r20
285	FSH	r26 = r19, cnt
286}{.mmi;	ld8	r19 = [up], UPD
287	nop	0
288	BSH	r27 = r16, tnc
289	;; }
290 {.mmi;	or	r15 = r23, r22
291	sub	r31 = -1, r14
292	FSH	r20 = r16, cnt
293}{.mib;	ld8	r16 = [up], UPD
294	BSH	r21 = r17, tnc
295	br	.LL10
296}
297
C .Lb11: dispatch target for n mod 4 = 3 (p8 set).
C
C Loads two more limbs (r16, r17) after r10, sets r8 = r10 >> tnc and
C r26 = r10 << cnt, then br.cloop branches on ar.lc, which holds
C (n - 1) >> 2:
C   not taken (n = 3)                 finish here, join the wind-down at .Lr3
C   L(gt3), next br.cloop not taken   (n = 7) join the wind-down at .Lr7
C   L(gt7) (n >= 11)                  enter the main loop at .LL11
C r8 is never stored; presumably it is the return value (confirm against
C the calling convention).
298.Lb11:
299	ld8	r16 = [up], UPD
300	;;
301	ld8	r17 = [up], UPD
302	BSH	r8 = r10, tnc
303	FSH	r26 = r10, cnt
304	br.cloop.dptk	L(gt3)
305	;;
C n = 3: r15 and r14 receive the two upper result limbs; r22 = r17 << cnt
C is left for the final store, which .Lr2 complements.
306	BSH	r27 = r16, tnc
307	;;
308	FSH	r20 = r16, cnt
309	BSH	r21 = r17, tnc
310	;;
311	FSH	r22 = r17, cnt
312	;;
313	or	r15 = r27, r26
314	;;
315	or	r14 = r21, r20
316	sub	r31 = -1, r15
317	br	.Lr3
318	;;
C L(gt3): at least seven limbs.  Load four more (r18, r19, r16, r17) while
C shifting the limbs already loaded.
319L(gt3):
320	ld8	r18 = [up], UPD
321	;;
322	ld8	r19 = [up], UPD
323	BSH	r27 = r16, tnc
324	;;
325 {.mmi;	nop	0
326	nop	0
327	FSH	r20 = r16, cnt
328}{.mmi;	ld8	r16 = [up], UPD
329	nop	0
330	BSH	r21 = r17, tnc
331	;; }
332 {.mmi	nop	0
333	nop	0
334	FSH	r22 = r17, cnt
335}{.mib;	ld8	r17 = [up], UPD
336	BSH	r23 = r18, tnc
337	br.cloop.dptk	L(gt7)
338	;; }
C n = 7: all limbs are loaded.  Combine the first two result limbs and
C continue at .Lr7.
339	or	r15 = r27, r26
340	FSH	r24 = r18, cnt
341	BSH	r25 = r19, tnc
342	;;
343 {.mmi;	nop	0
344	or	r14 = r21, r20
345	FSH	r26 = r19, cnt
346}{.mib;	sub	r31 = -1, r15
347	BSH	r27 = r16, tnc
348	br	.Lr7
349}
C L(gt7): the second br.cloop was taken.  Same sequence as the n = 7 case,
C with each step also loading a further limb; then enter the main loop at
C .LL11.
350L(gt7):
351 {.mmi;	nop	0
352	or	r15 = r27, r26
353	FSH	r24 = r18, cnt
354}{.mmi;	ld8	r18 = [up], UPD
355	nop	0
356	BSH	r25 = r19, tnc
357	;; }
358 {.mmi;	or	r14 = r21, r20
359	sub	r31 = -1, r15
360	FSH	r26 = r19, cnt
361}{.mib;	ld8	r19 = [up], UPD
362	BSH	r27 = r16, tnc
363	br	.LL11
364}
365
366C *** MAIN LOOP START ***
C Four limbs per iteration, one per bundle pair.  Each pair
C   - stores the complemented result prepared earlier (st8 [rp] = r31, UPD),
C   - ORs one FSH/BSH pair into r14 or r15,
C   - complements the other of r14/r15 into r31 for the next store,
C   - issues the next FSH (with cnt) and BSH (with tnc), and
C   - loads a further source limb (ld8 ... = [up], UPD).
C The registers r16-r19 hold loaded limbs and r20-r27 hold shifted halves;
C their roles rotate from one bundle pair to the next.
C .LL00, .LL11, .LL10 and L(end) are the points at which the setup code
C joins the loop.  L(end) prefetches through r11, moving it by PUPD, and
C br.cloop repeats the loop while ar.lc is nonzero.
367	ALIGN(32)
368L(top):
369.LL01:
370 {.mmi;	st8	[rp] = r31, UPD		C M2
371	or	r15 = r27, r26		C M3
372	FSH	r24 = r18, cnt		C I0
373}{.mmi;	ld8	r18 = [up], UPD		C M0
374	sub	r31 = -1, r14		C M1
375	BSH	r25 = r19, tnc		C I1
376	;; }
377.LL00:
378 {.mmi;	st8	[rp] = r31, UPD
379	or	r14 = r21, r20
380	FSH	r26 = r19, cnt
381}{.mmi;	ld8	r19 = [up], UPD
382	sub	r31 = -1, r15
383	BSH	r27 = r16, tnc
384	;; }
385.LL11:
386 {.mmi;	st8	[rp] = r31, UPD
387	or	r15 = r23, r22
388	FSH	r20 = r16, cnt
389}{.mmi;	ld8	r16 = [up], UPD
390	sub	r31 = -1, r14
391	BSH	r21 = r17, tnc
392	;; }
393.LL10:
394 {.mmi;	st8	[rp] = r31, UPD
395	or	r14 = r25, r24
396	FSH	r22 = r17, cnt
397}{.mmi;	ld8	r17 = [up], UPD
398	sub	r31 = -1, r15
399	BSH	r23 = r18, tnc
400	;; }
401L(end):	lfetch		[r11], PUPD
402	br.cloop.dptk	L(top)
403C *** MAIN LOOP END ***
404
C Wind-down: drains the values still in flight once no more limbs need to
C be loaded.  The steps mirror the main loop without the ld8 instructions.
C Execution arrives here by falling out of the loop, or by a branch from
C the setup code to one of .Lr8 ... .Lr1 for the short operand sizes.
C Each step stores the complemented limb in r31 and prepares the next one.
405 {.mmi;	st8	[rp] = r31, UPD
406	or	r15 = r27, r26
407	FSH	r24 = r18, cnt
408}{.mib;	sub	r31 = -1, r14
409	BSH	r25 = r19, tnc
410	nop	0
411	;; }
412.Lr8:
413 {.mmi;	st8	[rp] = r31, UPD
414	or	r14 = r21, r20
415	FSH	r26 = r19, cnt
416}{.mib;	sub	r31 = -1, r15
417	BSH	r27 = r16, tnc
418	nop	0
419	;; }
420.Lr7:
421 {.mmi;	st8	[rp] = r31, UPD
422	or	r15 = r23, r22
423	FSH	r20 = r16, cnt
424}{.mib;	sub	r31 = -1, r14
425	BSH	r21 = r17, tnc
426	nop	0
427	;; }
428.Lr6:	st8	[rp] = r31, UPD
429	or	r14 = r25, r24
430	FSH	r22 = r17, cnt
431	sub	r31 = -1, r15
432	;;
433.Lr5:	st8	[rp] = r31, UPD
434	or	r15 = r27, r26
435	sub	r31 = -1, r14
436	;;
437.Lr4:	st8	[rp] = r31, UPD
438	or	r14 = r21, r20
439	sub	r31 = -1, r15
440	;;
441.Lr3:	st8	[rp] = r31, UPD
442	sub	r31 = -1, r14
443	;;
C The last limb stored is the complement of r22 alone: an FSH result with
C no BSH partner ORed in.
444.Lr2:	st8	[rp] = r31, UPD
445	sub	r31 = -1, r22
446	;;
C Final store, then restore ar.lc from r2 and return through b0.
447.Lr1:	st8	[rp] = r31, UPD		C				M23
448	mov	ar.lc = r2		C				I0
449	br.ret.sptk.many b0		C				B
450EPILOGUE(func)
451ASM_END()
452