xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/addmul_2.asm (revision 7c192b2a5e1093666e67801684f930ef49b3b363)
1dnl  IA-64 mpn_addmul_2 -- Multiply an n-limb number by a 2-limb number and
2dnl  add the result to a (n+1)-limb number.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund.
5
6dnl  Copyright 2004, 2005, 2011 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of the GNU Lesser General Public License as published
12dnl  by the Free Software Foundation; either version 3 of the License, or (at
13dnl  your option) any later version.
14
15dnl  The GNU MP Library is distributed in the hope that it will be useful, but
16dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
18dnl  License for more details.
19
20dnl  You should have received a copy of the GNU Lesser General Public License
21dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
22
23include(`../config.m4')
24
25C         cycles/limb
26C Itanium:    3.65
27C Itanium 2:  1.625
28
29C TODO
30C  * Clean up variable names, and try to decrease the number of distinct
31C    registers used.
32C  * Clean up feed-in code to not require zeroing several registers.
33C  * Make sure we don't depend on uninitialised predicate registers.
34C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
35C    wind-down code.
36C  * Ultimately rewrite.  The problem with this code is that it first uses a
37C    loaded u value in one xma pair, then leaves it live over several unrelated
38C    xma pairs, before it uses it again.  It should actually be quite possible
39C    to just swap some aligned xma pairs around.  But we should then schedule
40C    u loads further from the first use.
41
42C INPUT PARAMETERS
43define(`rp',`r32')
44define(`up',`r33')
45define(`n',`r34')
46define(`vp',`r35')
47
48define(`srp',`r3')
49
50define(`v0',`f6')
51define(`v1',`f7')
52
53define(`s0',`r14')
54define(`acc0',`r15')
55
56define(`pr0_0',`r16') define(`pr0_1',`r17')
57define(`pr0_2',`r18') define(`pr0_3',`r19')
58
59define(`pr1_0',`r20') define(`pr1_1',`r21')
60define(`pr1_2',`r22') define(`pr1_3',`r23')
61
62define(`acc1_0',`r24') define(`acc1_1',`r25')
63define(`acc1_2',`r26') define(`acc1_3',`r27')
64
65dnl define(`',`r28')
66dnl define(`',`r29')
67dnl define(`',`r30')
68dnl define(`',`r31')
69
70define(`fp0b_0',`f8') define(`fp0b_1',`f9')
71define(`fp0b_2',`f10') define(`fp0b_3',`f11')
72
73define(`fp1a_0',`f12') define(`fp1a_1',`f13')
74define(`fp1a_2',`f14') define(`fp1a_3',`f15')
75
76define(`fp1b_0',`f32') define(`fp1b_1',`f33')
77define(`fp1b_2',`f34') define(`fp1b_3',`f35')
78
79define(`fp2a_0',`f36') define(`fp2a_1',`f37')
80define(`fp2a_2',`f38') define(`fp2a_3',`f39')
81
82define(`r_0',`f40') define(`r_1',`f41')
83define(`r_2',`f42') define(`r_3',`f43')
84
85define(`u_0',`f44') define(`u_1',`f45')
86define(`u_2',`f46') define(`u_3',`f47')
87
88define(`rx',`f48')
89define(`ux',`f49')
90define(`ry',`f50')
91define(`uy',`f51')
92
93ASM_START()
94PROLOGUE(mpn_addmul_2s)
95	.prologue
96	.save	ar.lc, r2
97	.body
98
99ifdef(`HAVE_ABI_32',`
100.mmi;		addp4	rp = 0, rp		C			M I
101		addp4	up = 0, up		C			M I
102		addp4	vp = 0, vp		C			M I
103.mmi;		nop	1
104		nop	1
105		zxt4	n = n			C			I
106	;;')
107
108.mmi;		ldf8	ux = [up], 8		C			M
109		ldf8	v0 = [vp], 8		C			M
110		mov	r2 = ar.lc		C			I0
111.mmi;		ldf8	rx = [rp], 8		C			M
112		and	r14 = 3, n		C			M I
113		add	n = -2, n		C			M I
114	;;
115.mmi;		ldf8	uy = [up], 8		C			M
116		ldf8	v1 = [vp]		C			M
117		shr.u	n = n, 2		C			I0
118.mmi;		ldf8	ry = [rp], -8		C			M
119		cmp.eq	p14, p0 = 1, r14	C			M I
120		cmp.eq	p11, p0 = 2, r14	C			M I
121	;;
122.mmi;		add	srp = 16, rp		C			M I
123		cmp.eq	p15, p0 = 3, r14	C			M I
124		mov	ar.lc = n		C			I0
125.bbb;	(p14)	br.dptk	L(x01)			C			B
126	(p11)	br.dptk	L(x10)			C			B
127	(p15)	br.dptk	L(x11)			C			B
128	;;
129
130L(x00):		cmp.ne	p6, p0 = r0, r0		C suppress initial xma pair
131		mov	fp2a_3 = f0
132		br	L(b00)
133L(x01):		cmp.ne	p14, p0 = r0, r0	C suppress initial xma pair
134		mov	fp2a_2 = f0
135		br	L(b01)
136L(x10):		cmp.ne	p11, p0 = r0, r0	C suppress initial xma pair
137		mov	fp2a_1 = f0
138		br	L(b10)
139L(x11):		cmp.ne	p15, p0 = r0, r0	C suppress initial xma pair
140		mov	fp2a_0 = f0
141		br	L(b11)
142
143EPILOGUE()
144
145PROLOGUE(mpn_addmul_2)
146	.prologue
147	.save	ar.lc, r2
148	.body
149
150ifdef(`HAVE_ABI_32',`
151.mmi;		addp4	rp = 0, rp		C			M I
152		addp4	up = 0, up		C			M I
153		addp4	vp = 0, vp		C			M I
154.mmi;		nop	1
155		nop	1
156		zxt4	n = n			C			I
157	;;')
158
159.mmi;		ldf8	ux = [up], 8		C			M
160		ldf8	v0 = [vp], 8		C			M
161		mov	r2 = ar.lc		C			I0
162.mmi;		ldf8	rx = [rp], 8		C			M
163		and	r14 = 3, n		C			M I
164		add	n = -2, n		C			M I
165	;;
166.mmi;		ldf8	uy = [up], 8		C			M
167		ldf8	v1 = [vp]		C			M
168		shr.u	n = n, 2		C			I0
169.mmi;		ldf8	ry = [rp], -8		C			M
170		cmp.eq	p14, p0 = 1, r14	C			M I
171		cmp.eq	p11, p0 = 2, r14	C			M I
172	;;
173.mmi;		add	srp = 16, rp		C			M I
174		cmp.eq	p15, p6 = 3, r14	C			M I
175		mov	ar.lc = n		C			I0
176.bbb;	(p14)	br.dptk	L(b01)			C			B
177	(p11)	br.dptk	L(b10)			C			B
178	(p15)	br.dptk	L(b11)			C			B
179	;;
180
181	ALIGN(32)
182L(b00):
183.mmi;		ldf8	r_1 = [srp], 8
184		ldf8	u_1 = [up], 8
185		mov	acc1_2 = 0
186.mmi;		mov	pr1_2 = 0
187		mov	pr0_3 = 0
188		cmp.ne	p8, p9 = r0, r0
189	;;
190.mfi;		ldf8	r_2 = [srp], 8
191		xma.l	fp0b_3 = ux, v0, rx
192		cmp.ne	p12, p13 = r0, r0
193.mfb;		ldf8	u_2 = [up], 8
194		xma.hu	fp1b_3 = ux, v0, rx
195		br.cloop.dptk	L(gt4)
196
197		xma.l	fp0b_0 = uy, v0, ry
198		xma.hu	fp1a_0 = uy, v0, ry
199	;;
200		getfsig	acc0 = fp0b_3
201	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
202	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
203	;;
204		xma.l	fp0b_1 = u_1, v0, r_1
205		xma.hu	fp1a_1 = u_1, v0, r_1
206	;;
207		getfsig	pr0_0 = fp0b_0
208		xma.l	fp1b_0 = uy, v1, fp1a_0
209		xma.hu	fp2a_0 = uy, v1, fp1a_0
210	;;
211		getfsig	pr1_3 = fp1b_3
212		getfsig	acc1_3 = fp2a_3
213		xma.l	fp0b_2 = u_2, v0, r_2
214		xma.hu	fp1a_2 = u_2, v0, r_2
215		br	L(cj4)
216
217L(gt4):		xma.l	fp0b_0 = uy, v0, ry
218		xma.hu	fp1a_0 = uy, v0, ry
219	;;
220		ldf8	r_3 = [srp], 8
221		getfsig	acc0 = fp0b_3
222	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
223		ldf8	u_3 = [up], 8
224	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
225	;;
226		xma.l	fp0b_1 = u_1, v0, r_1
227		xma.hu	fp1a_1 = u_1, v0, r_1
228	;;
229		ldf8	r_0 = [srp], 8
230		getfsig	pr0_0 = fp0b_0
231		xma.l	fp1b_0 = uy, v1, fp1a_0
232		xma.hu	fp2a_0 = uy, v1, fp1a_0
233	;;
234		ldf8	u_0 = [up], 8
235		getfsig	pr1_3 = fp1b_3
236		xma.l	fp0b_2 = u_2, v0, r_2
237	;;
238		getfsig	acc1_3 = fp2a_3
239		xma.hu	fp1a_2 = u_2, v0, r_2
240		br	L(00)
241
242
243	ALIGN(32)
244L(b01):
245.mmi;		ldf8	r_0 = [srp], 8		C M
246		ldf8	u_0 = [up], 8		C M
247		mov	acc1_1 = 0		C M I
248.mmi;		mov	pr1_1 = 0		C M I
249		mov	pr0_2 = 0		C M I
250		cmp.ne	p6, p7 = r0, r0		C M I
251	;;
252.mfi;		ldf8	r_1 = [srp], 8		C M
253		xma.l	fp0b_2 = ux, v0, rx	C F
254		cmp.ne	p10, p11 = r0, r0	C M I
255.mfi;		ldf8	u_1 = [up], 8		C M
256		xma.hu	fp1b_2 = ux, v0, rx	C F
257		nop	1
258	;;
259		xma.l	fp0b_3 = uy, v0, ry	C F
260		xma.hu	fp1a_3 = uy, v0, ry	C F
261	;;
262.mmf;		getfsig	acc0 = fp0b_2		C M
263		ldf8	r_2 = [srp], 8		C M
264	(p14)	xma.hu	fp2a_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
265.mfb;		ldf8	u_2 = [up], 8		C M
266	(p14)	xma.l	fp1b_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
267		br.cloop.dptk	L(gt5)
268
269		xma.l	fp0b_0 = u_0, v0, r_0	C F
270		xma.hu	fp1a_0 = u_0, v0, r_0	C F
271	;;
272		getfsig	pr0_3 = fp0b_3		C M
273		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
274		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
275	;;
276		getfsig	pr1_2 = fp1b_2		C M
277		getfsig	acc1_2 = fp2a_2		C M
278		xma.l	fp0b_1 = u_1, v0, r_1	C F
279		xma.hu	fp1a_1 = u_1, v0, r_1	C F
280		br	L(cj5)
281
282L(gt5):		xma.l	fp0b_0 = u_0, v0, r_0
283		xma.hu	fp1a_0 = u_0, v0, r_0
284	;;
285		getfsig	pr0_3 = fp0b_3
286		ldf8	r_3 = [srp], 8
287		xma.l	fp1b_3 = uy, v1, fp1a_3
288		xma.hu	fp2a_3 = uy, v1, fp1a_3
289	;;
290		ldf8	u_3 = [up], 8
291		getfsig	pr1_2 = fp1b_2
292		xma.l	fp0b_1 = u_1, v0, r_1
293	;;
294		getfsig	acc1_2 = fp2a_2
295		xma.hu	fp1a_1 = u_1, v0, r_1
296		br	L(01)
297
298
299	ALIGN(32)
300L(b10):		br.cloop.dptk	L(gt2)
301		xma.l	fp0b_1 = ux, v0, rx
302		xma.hu	fp1b_1 = ux, v0, rx
303	;;
304		xma.l	fp0b_2 = uy, v0, ry
305		xma.hu	fp1a_2 = uy, v0, ry
306	;;
307		stf8	[rp] = fp0b_1, 8
308	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
309	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
310	;;
311		getfsig	acc0 = fp0b_2
312		xma.l	fp1b_2 = uy, v1, fp1a_2
313		xma.hu	fp2a_2 = uy, v1, fp1a_2
314	;;
315		getfsig	pr1_1 = fp1b_1
316		getfsig	acc1_1 = fp2a_1
317		mov	ar.lc = r2
318		getfsig	pr1_2 = fp1b_2
319		getfsig	r8 = fp2a_2
320	;;
321		add	s0 = pr1_1, acc0
322	;;
323		st8	[rp] = s0, 8
324		cmp.ltu	p8, p9 = s0, pr1_1
325		sub	r31 = -1, acc1_1
326	;;
327		.pred.rel "mutex", p8, p9
328	(p8)	add	acc0 = pr1_2, acc1_1, 1
329	(p9)	add	acc0 = pr1_2, acc1_1
330	(p8)	cmp.leu	p10, p0 = r31, pr1_2
331	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
332	;;
333		st8	[rp] = acc0, 8
334	(p10)	add	r8 = 1, r8
335		br.ret.sptk.many b0
336
337
338L(gt2):
339.mmi;		ldf8	r_3 = [srp], 8
340		ldf8	u_3 = [up], 8
341		mov	acc1_0 = 0
342	;;
343.mfi;		ldf8	r_0 = [srp], 8
344		xma.l	fp0b_1 = ux, v0, rx
345		mov	pr1_0 = 0
346.mfi;		ldf8	u_0 = [up], 8
347		xma.hu	fp1b_1 = ux, v0, rx
348		mov	pr0_1 = 0
349	;;
350		xma.l	fp0b_2 = uy, v0, ry
351		xma.hu	fp1a_2 = uy, v0, ry
352	;;
353		getfsig	acc0 = fp0b_1
354		ldf8	r_1 = [srp], 8
355	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
356	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
357	;;
358		ldf8	u_1 = [up], 8
359		xma.l	fp0b_3 = u_3, v0, r_3
360		xma.hu	fp1a_3 = u_3, v0, r_3
361	;;
362		getfsig	pr0_2 = fp0b_2
363		ldf8	r_2 = [srp], 8
364		xma.l	fp1b_2 = uy, v1, fp1a_2
365		xma.hu	fp2a_2 = uy, v1, fp1a_2
366	;;
367		ldf8	u_2 = [up], 8
368		getfsig	pr1_1 = fp1b_1
369	;;
370.mfi;		getfsig	acc1_1 = fp2a_1
371		xma.l	fp0b_0 = u_0, v0, r_0
372		cmp.ne	p8, p9 = r0, r0
373.mfb;		cmp.ne	p12, p13 = r0, r0
374		xma.hu	fp1a_0 = u_0, v0, r_0
375		br.cloop.sptk.clr	L(top)
376		br.many	L(end)
377
378
379	ALIGN(32)
380L(b11):		ldf8	r_2 = [srp], 8
381		mov	pr1_3 = 0
382		mov	pr0_0 = 0
383	;;
384		ldf8	u_2 = [up], 8
385		mov	acc1_3 = 0
386		br.cloop.dptk	L(gt3)
387	;;
388		cmp.ne	p6, p7 = r0, r0
389		xma.l	fp0b_0 = ux, v0, rx
390		xma.hu	fp1b_0 = ux, v0, rx
391	;;
392		cmp.ne	p10, p11 = r0, r0
393		xma.l	fp0b_1 = uy, v0, ry
394		xma.hu	fp1a_1 = uy, v0, ry
395	;;
396		getfsig	acc0 = fp0b_0
397	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
398	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
399	;;
400		xma.l	fp0b_2 = uy, v1, r_2
401		xma.hu	fp1a_2 = uy, v1, r_2
402	;;
403		getfsig	pr0_1 = fp0b_1
404		xma.l	fp1b_1 = u_2, v0, fp1a_1
405		xma.hu	fp2a_1 = u_2, v0, fp1a_1
406	;;
407		getfsig	pr1_0 = fp1b_0
408		getfsig	acc1_0 = fp2a_0
409		br	L(cj3)
410
411L(gt3):		ldf8	r_3 = [srp], 8
412		xma.l	fp0b_0 = ux, v0, rx
413		cmp.ne	p10, p11 = r0, r0
414		ldf8	u_3 = [up], 8
415		xma.hu	fp1b_0 = ux, v0, rx
416		cmp.ne	p6, p7 = r0, r0
417	;;
418		xma.l	fp0b_1 = uy, v0, ry
419		xma.hu	fp1a_1 = uy, v0, ry
420	;;
421		getfsig	acc0 = fp0b_0
422		ldf8	r_0 = [srp], 8
423	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
424		ldf8	u_0 = [up], 8
425	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
426	;;
427		xma.l	fp0b_2 = u_2, v0, r_2
428		xma.hu	fp1a_2 = u_2, v0, r_2
429	;;
430		getfsig	pr0_1 = fp0b_1
431		ldf8	r_1 = [srp], 8
432		xma.l	fp1b_1 = uy, v1, fp1a_1
433		xma.hu	fp2a_1 = uy, v1, fp1a_1
434	;;
435		ldf8	u_1 = [up], 8
436		getfsig	pr1_0 = fp1b_0
437	;;
438		getfsig	acc1_0 = fp2a_0
439		xma.l	fp0b_3 = u_3, v0, r_3
440		xma.hu	fp1a_3 = u_3, v0, r_3
441		br	L(11)
442
443
444C *** MAIN LOOP START ***
445	ALIGN(32)
446L(top):						C 00
447		.pred.rel "mutex", p12, p13
448		getfsig	pr0_3 = fp0b_3
449		ldf8	r_3 = [srp], 8
450		xma.l	fp1b_3 = u_3, v1, fp1a_3
451	(p12)	add	s0 = pr1_0, acc0, 1
452	(p13)	add	s0 = pr1_0, acc0
453		xma.hu	fp2a_3 = u_3, v1, fp1a_3
454	;;					C 01
455		.pred.rel "mutex", p8, p9
456		.pred.rel "mutex", p12, p13
457		ldf8	u_3 = [up], 8
458		getfsig	pr1_2 = fp1b_2
459	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
460	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
461	(p12)	cmp.leu	p10, p11 = s0, pr1_0
462	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
463	;;					C 02
464		.pred.rel "mutex", p6, p7
465		getfsig	acc1_2 = fp2a_2
466		st8	[rp] = s0, 8
467		xma.l	fp0b_1 = u_1, v0, r_1
468	(p6)	add	acc0 = pr0_2, acc1_0, 1
469	(p7)	add	acc0 = pr0_2, acc1_0
470		xma.hu	fp1a_1 = u_1, v0, r_1
471	;;					C 03
472L(01):
473		.pred.rel "mutex", p10, p11
474		getfsig	pr0_0 = fp0b_0
475		ldf8	r_0 = [srp], 8
476		xma.l	fp1b_0 = u_0, v1, fp1a_0
477	(p10)	add	s0 = pr1_1, acc0, 1
478	(p11)	add	s0 = pr1_1, acc0
479		xma.hu	fp2a_0 = u_0, v1, fp1a_0
480	;;					C 04
481		.pred.rel "mutex", p6, p7
482		.pred.rel "mutex", p10, p11
483		ldf8	u_0 = [up], 8
484		getfsig	pr1_3 = fp1b_3
485	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
486	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
487	(p10)	cmp.leu	p12, p13 = s0, pr1_1
488	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
489	;;					C 05
490		.pred.rel "mutex", p8, p9
491		getfsig	acc1_3 = fp2a_3
492		st8	[rp] = s0, 8
493		xma.l	fp0b_2 = u_2, v0, r_2
494	(p8)	add	acc0 = pr0_3, acc1_1, 1
495	(p9)	add	acc0 = pr0_3, acc1_1
496		xma.hu	fp1a_2 = u_2, v0, r_2
497	;;					C 06
498L(00):
499		.pred.rel "mutex", p12, p13
500		getfsig	pr0_1 = fp0b_1
501		ldf8	r_1 = [srp], 8
502		xma.l	fp1b_1 = u_1, v1, fp1a_1
503	(p12)	add	s0 = pr1_2, acc0, 1
504	(p13)	add	s0 = pr1_2, acc0
505		xma.hu	fp2a_1 = u_1, v1, fp1a_1
506	;;					C 07
507		.pred.rel "mutex", p8, p9
508		.pred.rel "mutex", p12, p13
509		ldf8	u_1 = [up], 8
510		getfsig	pr1_0 = fp1b_0
511	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
512	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
513	(p12)	cmp.leu	p10, p11 = s0, pr1_2
514	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
515	;;					C 08
516		.pred.rel "mutex", p6, p7
517		getfsig	acc1_0 = fp2a_0
518		st8	[rp] = s0, 8
519		xma.l	fp0b_3 = u_3, v0, r_3
520	(p6)	add	acc0 = pr0_0, acc1_2, 1
521	(p7)	add	acc0 = pr0_0, acc1_2
522		xma.hu	fp1a_3 = u_3, v0, r_3
523	;;					C 09
524L(11):
525		.pred.rel "mutex", p10, p11
526		getfsig	pr0_2 = fp0b_2
527		ldf8	r_2 = [srp], 8
528		xma.l	fp1b_2 = u_2, v1, fp1a_2
529	(p10)	add	s0 = pr1_3, acc0, 1
530	(p11)	add	s0 = pr1_3, acc0
531		xma.hu	fp2a_2 = u_2, v1, fp1a_2
532	;;					C 10
533		.pred.rel "mutex", p6, p7
534		.pred.rel "mutex", p10, p11
535		ldf8	u_2 = [up], 8
536		getfsig	pr1_1 = fp1b_1
537	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
538	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
539	(p10)	cmp.leu	p12, p13 = s0, pr1_3
540	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
541	;;					C 11
542		.pred.rel "mutex", p8, p9
543		getfsig	acc1_1 = fp2a_1
544		st8	[rp] = s0, 8
545		xma.l	fp0b_0 = u_0, v0, r_0
546	(p8)	add	acc0 = pr0_1, acc1_3, 1
547	(p9)	add	acc0 = pr0_1, acc1_3
548		xma.hu	fp1a_0 = u_0, v0, r_0
549L(10):		br.cloop.sptk.clr	L(top)			C 12
550	;;
551C *** MAIN LOOP END ***
552L(end):
553		.pred.rel "mutex", p12, p13
554.mfi;		getfsig	pr0_3 = fp0b_3
555		xma.l	fp1b_3 = u_3, v1, fp1a_3
556	(p12)	add	s0 = pr1_0, acc0, 1
557.mfi;	(p13)	add	s0 = pr1_0, acc0
558		xma.hu	fp2a_3 = u_3, v1, fp1a_3
559		nop	1
560	;;
561		.pred.rel "mutex", p8, p9
562		.pred.rel "mutex", p12, p13
563.mmi;		getfsig	pr1_2 = fp1b_2
564		st8	[rp] = s0, 8
565	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
566.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
567	(p12)	cmp.leu	p10, p11 = s0, pr1_0
568	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
569	;;
570		.pred.rel "mutex", p6, p7
571.mfi;		getfsig	acc1_2 = fp2a_2
572		xma.l	fp0b_1 = u_1, v0, r_1
573		nop	1
574.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
575	(p7)	add	acc0 = pr0_2, acc1_0
576		xma.hu	fp1a_1 = u_1, v0, r_1
577	;;
578L(cj5):
579		.pred.rel "mutex", p10, p11
580.mfi;		getfsig	pr0_0 = fp0b_0
581		xma.l	fp1b_0 = u_0, v1, fp1a_0
582	(p10)	add	s0 = pr1_1, acc0, 1
583.mfi;	(p11)	add	s0 = pr1_1, acc0
584		xma.hu	fp2a_0 = u_0, v1, fp1a_0
585		nop	1
586	;;
587		.pred.rel "mutex", p6, p7
588		.pred.rel "mutex", p10, p11
589.mmi;		getfsig	pr1_3 = fp1b_3
590		st8	[rp] = s0, 8
591	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
592.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
593	(p10)	cmp.leu	p12, p13 = s0, pr1_1
594	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
595	;;
596		.pred.rel "mutex", p8, p9
597.mfi;		getfsig	acc1_3 = fp2a_3
598		xma.l	fp0b_2 = u_2, v0, r_2
599		nop	1
600.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
601	(p9)	add	acc0 = pr0_3, acc1_1
602		xma.hu	fp1a_2 = u_2, v0, r_2
603	;;
604L(cj4):
605		.pred.rel "mutex", p12, p13
606.mfi;		getfsig	pr0_1 = fp0b_1
607		xma.l	fp1b_1 = u_1, v1, fp1a_1
608	(p12)	add	s0 = pr1_2, acc0, 1
609.mfi;	(p13)	add	s0 = pr1_2, acc0
610		xma.hu	fp2a_1 = u_1, v1, fp1a_1
611		nop	1
612	;;
613		.pred.rel "mutex", p8, p9
614		.pred.rel "mutex", p12, p13
615.mmi;		getfsig	pr1_0 = fp1b_0
616		st8	[rp] = s0, 8
617	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
618.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
619	(p12)	cmp.leu	p10, p11 = s0, pr1_2
620	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
621	;;
622		.pred.rel "mutex", p6, p7
623.mmi;		getfsig	acc1_0 = fp2a_0
624	(p6)	add	acc0 = pr0_0, acc1_2, 1
625	(p7)	add	acc0 = pr0_0, acc1_2
626	;;
627L(cj3):
628		.pred.rel "mutex", p10, p11
629.mfi;		getfsig	pr0_2 = fp0b_2
630		xma.l	fp1b_2 = u_2, v1, fp1a_2
631	(p10)	add	s0 = pr1_3, acc0, 1
632.mfi;	(p11)	add	s0 = pr1_3, acc0
633		xma.hu	fp2a_2 = u_2, v1, fp1a_2
634		nop	1
635	;;
636		.pred.rel "mutex", p6, p7
637		.pred.rel "mutex", p10, p11
638.mmi;		getfsig	pr1_1 = fp1b_1
639		st8	[rp] = s0, 8
640	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
641.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
642	(p10)	cmp.leu	p12, p13 = s0, pr1_3
643	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
644	;;
645		.pred.rel "mutex", p8, p9
646.mmi;		getfsig	acc1_1 = fp2a_1
647	(p8)	add	acc0 = pr0_1, acc1_3, 1
648	(p9)	add	acc0 = pr0_1, acc1_3
649	;;
650		.pred.rel "mutex", p12, p13
651.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
652	(p13)	add	s0 = pr1_0, acc0
653		nop	1
654	;;
655		.pred.rel "mutex", p8, p9
656		.pred.rel "mutex", p12, p13
657.mmi;		getfsig	pr1_2 = fp1b_2
658		st8	[rp] = s0, 8
659	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
660.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
661	(p12)	cmp.leu	p10, p11 = s0, pr1_0
662	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
663	;;
664		.pred.rel "mutex", p6, p7
665.mmi;		getfsig	r8 = fp2a_2
666	(p6)	add	acc0 = pr0_2, acc1_0, 1
667	(p7)	add	acc0 = pr0_2, acc1_0
668	;;
669		.pred.rel "mutex", p10, p11
670.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
671	(p11)	add	s0 = pr1_1, acc0
672	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
673	;;
674		.pred.rel "mutex", p10, p11
675.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
676	(p10)	cmp.leu	p12, p13 = s0, pr1_1
677	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
678	;;
679		.pred.rel "mutex", p8, p9
680.mmi;		st8	[rp] = s0, 8
681	(p8)	add	acc0 = pr1_2, acc1_1, 1
682	(p9)	add	acc0 = pr1_2, acc1_1
683	;;
684		.pred.rel "mutex", p8, p9
685.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
686	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
687	(p12)	add	acc0 = 1, acc0
688	;;
689.mmi;		st8	[rp] = acc0, 8
690	(p12)	cmpeqor	p10, p0 = 0, acc0
691		nop	1
692	;;
693.mib;	(p10)	add	r8 = 1, r8
694		mov	ar.lc = r2
695		br.ret.sptk.many b0
696EPILOGUE()
697ASM_END()
698