xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/addmul_2.asm (revision ae87de8892f277bece3527c15b186ebcfa188227)
1dnl  IA-64 mpn_addmul_2 -- Multiply an n-limb number with a 2-limb number and
2dnl  add the result to an (n+1)-limb number.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund.
5
6dnl  Copyright 2004, 2005, 2011 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9dnl
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of either:
12dnl
13dnl    * the GNU Lesser General Public License as published by the Free
14dnl      Software Foundation; either version 3 of the License, or (at your
15dnl      option) any later version.
16dnl
17dnl  or
18dnl
19dnl    * the GNU General Public License as published by the Free Software
20dnl      Foundation; either version 2 of the License, or (at your option) any
21dnl      later version.
22dnl
23dnl  or both in parallel, as here.
24dnl
25dnl  The GNU MP Library is distributed in the hope that it will be useful, but
26dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28dnl  for more details.
29dnl
30dnl  You should have received copies of the GNU General Public License and the
31dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
32dnl  see https://www.gnu.org/licenses/.
33
34include(`../config.m4')
35
36C         cycles/limb
37C Itanium:    3.65
38C Itanium 2:  1.625
39
40C TODO
41C  * Clean up variable names, and try to decrease the number of distinct
42C    registers used.
43C  * Clean up feed-in code to not require zeroing several registers.
44C  * Make sure we don't depend on uninitialised predicate registers.
45C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
46C    wind-down code.
47C  * Ultimately rewrite.  The problem with this code is that it first uses a
48C    loaded u value in one xma pair, then leaves it live over several unrelated
49C    xma pairs, before it uses it again.  It should actually be quite possible
50C    to just swap some aligned xma pairs around.  But we should then schedule
51C    u loads further from the first use.
52
53C INPUT PARAMETERS
54define(`rp',`r32')
55define(`up',`r33')
56define(`n',`r34')
57define(`vp',`r35')
58
C Second pointer into the rp area.  It is set to rp + 16 at entry and is the
C base for all r_N loads; after its first two loads rp is used only for stores.
59define(`srp',`r3')
60
C The two multiplier limbs, loaded from vp at entry.
61define(`v0',`f6')
62define(`v1',`f7')
63
C s0 is the limb about to be stored through rp; acc0 is the partial sum it is
C formed from.  r14 is also used under its raw name at entry to hold n & 3,
C before s0 is first written.
64define(`s0',`r14')
65define(`acc0',`r15')
66
C Integer-side copies (made with getfsig) of the product registers defined
C further down: pr0_N <- fp0b_N, pr1_N <- fp1b_N, acc1_N <- fp2a_N.
C The _0 .. _3 suffix selects one of the four slots of the 4-way unrolled loop.
67define(`pr0_0',`r16') define(`pr0_1',`r17')
68define(`pr0_2',`r18') define(`pr0_3',`r19')
69
70define(`pr1_0',`r20') define(`pr1_1',`r21')
71define(`pr1_2',`r22') define(`pr1_3',`r23')
72
73define(`acc1_0',`r24') define(`acc1_1',`r25')
74define(`acc1_2',`r26') define(`acc1_3',`r27')
75
C Unassigned placeholders (commented out).
76dnl define(`',`r28')
77dnl define(`',`r29')
78dnl define(`',`r30')
79dnl define(`',`r31')
80
C Product registers as used in the main loop:
C   fp0b_N / fp1a_N = low / high half of u_N * v0 + r_N     (xma.l / xma.hu)
C   fp1b_N / fp2a_N = low / high half of u_N * v1 + fp1a_N  (xma.l / xma.hu)
C The feed-in code uses some of them slightly differently for the first limb.
81define(`fp0b_0',`f8') define(`fp0b_1',`f9')
82define(`fp0b_2',`f10') define(`fp0b_3',`f11')
83
84define(`fp1a_0',`f12') define(`fp1a_1',`f13')
85define(`fp1a_2',`f14') define(`fp1a_3',`f15')
86
87define(`fp1b_0',`f32') define(`fp1b_1',`f33')
88define(`fp1b_2',`f34') define(`fp1b_3',`f35')
89
90define(`fp2a_0',`f36') define(`fp2a_1',`f37')
91define(`fp2a_2',`f38') define(`fp2a_3',`f39')
92
C Limbs loaded through srp (r_N) and through up (u_N).
93define(`r_0',`f40') define(`r_1',`f41')
94define(`r_2',`f42') define(`r_3',`f43')
95
96define(`u_0',`f44') define(`u_1',`f45')
97define(`u_2',`f46') define(`u_3',`f47')
98
C The first two limbs of rp (rx, ry) and of up (ux, uy), loaded at entry.
99define(`rx',`f48')
100define(`ux',`f49')
101define(`ry',`f50')
102define(`uy',`f51')
103
104ASM_START()
C mpn_addmul_2s -- secondary entry point.  The setup is the same as for
C mpn_addmul_2 below, but the dispatch goes through the x00/x01/x10/x11 stubs,
C which turn off the initial ux * v1 xma pair of the selected feed-in block
C before joining the shared code at b00/b01/b10/b11.
105PROLOGUE(mpn_addmul_2s)
106	.prologue
107	.save	ar.lc, r2
108	.body
109
110ifdef(`HAVE_ABI_32',`
111 {.mmi;		addp4	rp = 0, rp		C			M I
112		addp4	up = 0, up		C			M I
113		addp4	vp = 0, vp		C			M I
114}{.mmi;		nop	1
115		nop	1
116		zxt4	n = n			C			I
117	;;
118}')
119
C Load the first two limbs from up and from rp plus both limbs from vp, save
C ar.lc in r2, compute r14 = n & 3 and the loop count (n - 2) >> 2 (moved into
C ar.lc), leave srp = rp + 16, then dispatch on r14.
120 {.mmi;		ldf8	ux = [up], 8		C			M
121		ldf8	v0 = [vp], 8		C			M
122		mov	r2 = ar.lc		C			I0
123}{.mmi;		ldf8	rx = [rp], 8		C			M
124		and	r14 = 3, n		C			M I
125		add	n = -2, n		C			M I
126	;;
127}{.mmi;		ldf8	uy = [up], 8		C			M
128		ldf8	v1 = [vp]		C			M
129		shr.u	n = n, 2		C			I0
130}{.mmi;		ldf8	ry = [rp], -8		C			M
131		cmp.eq	p14, p0 = 1, r14	C			M I
132		cmp.eq	p11, p0 = 2, r14	C			M I
133	;;
134}{.mmi;		add	srp = 16, rp		C			M I
135		cmp.eq	p15, p0 = 3, r14	C			M I
136		mov	ar.lc = n		C			I0
137}{.bbb;	(p14)	br.dptk	L(x01)			C			B
138	(p11)	br.dptk	L(x10)			C			B
139	(p15)	br.dptk	L(x11)			C			B
140	;;
141}
C Each stub forces one predicate false (cmp.ne of r0 with itself) and zeroes
C the fp2a register that the suppressed xma.hu would have written.  The
C predicate and register differ per stub and match the guarded pair in the
C corresponding feed-in block.  r14 == 0 falls through to x00.
142L(x00):		cmp.ne	p6, p0 = r0, r0		C suppress initial xma pair
143		mov	fp2a_3 = f0
144		br	L(b00)
145L(x01):		cmp.ne	p14, p0 = r0, r0	C suppress initial xma pair
146		mov	fp2a_2 = f0
147		br	L(b01)
148L(x10):		cmp.ne	p11, p0 = r0, r0	C suppress initial xma pair
149		mov	fp2a_1 = f0
150		br	L(b10)
151L(x11):		cmp.ne	p15, p0 = r0, r0	C suppress initial xma pair
152		mov	fp2a_0 = f0
153		br	L(b11)
154
155EPILOGUE()
156
C mpn_addmul_2 -- main entry point; arguments are rp, up, n, vp (r32 .. r35).
C Per the file header it multiplies the n-limb number at up by the 2-limb
C number at vp and adds the product into the area at rp.  The code below
C stores limbs through rp and leaves the most significant limb in r8.
C
C Setup: load the first two limbs from up and from rp plus both limbs from vp,
C save ar.lc in r2, compute r14 = n & 3 and the loop count (n - 2) >> 2 (moved
C into ar.lc), leave srp = rp + 16, then branch to the feed-in block for r14
C (b01, b10, b11; r14 == 0 falls through to b00).  The last compare also
C writes p6 as the complement of p15; p6 guards the ux * v1 pair in b00.
157PROLOGUE(mpn_addmul_2)
158	.prologue
159	.save	ar.lc, r2
160	.body
161
162ifdef(`HAVE_ABI_32',`
163 {.mmi;		addp4	rp = 0, rp		C			M I
164		addp4	up = 0, up		C			M I
165		addp4	vp = 0, vp		C			M I
166}{.mmi;		nop	1
167		nop	1
168		zxt4	n = n			C			I
169	;;
170}')
171
172 {.mmi;		ldf8	ux = [up], 8		C			M
173		ldf8	v0 = [vp], 8		C			M
174		mov	r2 = ar.lc		C			I0
175}{.mmi;		ldf8	rx = [rp], 8		C			M
176		and	r14 = 3, n		C			M I
177		add	n = -2, n		C			M I
178	;;
179}{.mmi;		ldf8	uy = [up], 8		C			M
180		ldf8	v1 = [vp]		C			M
181		shr.u	n = n, 2		C			I0
182}{.mmi;		ldf8	ry = [rp], -8		C			M
183		cmp.eq	p14, p0 = 1, r14	C			M I
184		cmp.eq	p11, p0 = 2, r14	C			M I
185	;;
186}{.mmi;		add	srp = 16, rp		C			M I
187		cmp.eq	p15, p6 = 3, r14	C			M I
188		mov	ar.lc = n		C			I0
189}{.bbb;	(p14)	br.dptk	L(b01)			C			B
190	(p11)	br.dptk	L(b10)			C			B
191	(p15)	br.dptk	L(b11)			C			B
192	;;
193}
C Feed-in for n & 3 == 0 (no dispatch branch taken).  Zeroes the acc1_2,
C pr1_2 and pr0_3 slots and presets p8 = p12 = false (p9 = p13 = true), i.e.
C no incoming carries, then starts the first products.
C br.cloop falls through when the loop count in ar.lc is zero; that short
C path joins the wind-down code at cj4.  Otherwise gt4 continues the feed-in
C and enters the main loop at L(00).
194	ALIGN(32)
195L(b00):
196 {.mmi;		ldf8	r_1 = [srp], 8
197		ldf8	u_1 = [up], 8
198		mov	acc1_2 = 0
199}{.mmi;		mov	pr1_2 = 0
200		mov	pr0_3 = 0
201		cmp.ne	p8, p9 = r0, r0
202	;;
203}{.mfi;		ldf8	r_2 = [srp], 8
204		xma.l	fp0b_3 = ux, v0, rx
205		cmp.ne	p12, p13 = r0, r0
206}{.mfb;		ldf8	u_2 = [up], 8
207		xma.hu	fp1b_3 = ux, v0, rx
208		br.cloop.dptk	L(gt4)
209}
C Short path (loop count zero): no further loads are needed.
210		xma.l	fp0b_0 = uy, v0, ry
211		xma.hu	fp1a_0 = uy, v0, ry
212	;;
213		getfsig	acc0 = fp0b_3
214	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
215	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
216	;;
217		xma.l	fp0b_1 = u_1, v0, r_1
218		xma.hu	fp1a_1 = u_1, v0, r_1
219	;;
220		getfsig	pr0_0 = fp0b_0
221		xma.l	fp1b_0 = uy, v1, fp1a_0
222		xma.hu	fp2a_0 = uy, v1, fp1a_0
223	;;
224		getfsig	pr1_3 = fp1b_3
225		getfsig	acc1_3 = fp2a_3
226		xma.l	fp0b_2 = u_2, v0, r_2
227		xma.hu	fp1a_2 = u_2, v0, r_2
228		br	L(cj4)
229
C Long path: the same products as above, interleaved with the loads of the
C next r and u limbs (slots 3 and 0) that the main loop will consume.
230L(gt4):		xma.l	fp0b_0 = uy, v0, ry
231		xma.hu	fp1a_0 = uy, v0, ry
232	;;
233		ldf8	r_3 = [srp], 8
234		getfsig	acc0 = fp0b_3
235	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
236		ldf8	u_3 = [up], 8
237	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
238	;;
239		xma.l	fp0b_1 = u_1, v0, r_1
240		xma.hu	fp1a_1 = u_1, v0, r_1
241	;;
242		ldf8	r_0 = [srp], 8
243		getfsig	pr0_0 = fp0b_0
244		xma.l	fp1b_0 = uy, v1, fp1a_0
245		xma.hu	fp2a_0 = uy, v1, fp1a_0
246	;;
247		ldf8	u_0 = [up], 8
248		getfsig	pr1_3 = fp1b_3
249		xma.l	fp0b_2 = u_2, v0, r_2
250	;;
251		getfsig	acc1_3 = fp2a_3
252		xma.hu	fp1a_2 = u_2, v0, r_2
253		br	L(00)
254
255
C Feed-in for n & 3 == 1.  Zeroes the acc1_1, pr1_1 and pr0_2 slots and
C presets p6 = p10 = false (p7 = p11 = true), i.e. no incoming carries.
C The ux * v1 pair is guarded by p14, which the mpn_addmul_2s stub clears.
C When the loop count in ar.lc is zero the br.cloop falls through and the
C short path joins the wind-down code at cj5; otherwise gt5 enters the main
C loop at L(01).
256	ALIGN(32)
257L(b01):
258 {.mmi;		ldf8	r_0 = [srp], 8		C M
259		ldf8	u_0 = [up], 8		C M
260		mov	acc1_1 = 0		C M I
261}{.mmi;		mov	pr1_1 = 0		C M I
262		mov	pr0_2 = 0		C M I
263		cmp.ne	p6, p7 = r0, r0		C M I
264	;;
265}{.mfi;		ldf8	r_1 = [srp], 8		C M
266		xma.l	fp0b_2 = ux, v0, rx	C F
267		cmp.ne	p10, p11 = r0, r0	C M I
268}{.mfi;		ldf8	u_1 = [up], 8		C M
269		xma.hu	fp1b_2 = ux, v0, rx	C F
270		nop	1
271	;;
272}		xma.l	fp0b_3 = uy, v0, ry	C F
273		xma.hu	fp1a_3 = uy, v0, ry	C F
274	;;
275 {.mmf;		getfsig	acc0 = fp0b_2		C M
276		ldf8	r_2 = [srp], 8		C M
277	(p14)	xma.hu	fp2a_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
278}{.mfb;		ldf8	u_2 = [up], 8		C M
279	(p14)	xma.l	fp1b_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
280		br.cloop.dptk	L(gt5)
281}
C Short path (loop count zero): no further loads are needed.
282		xma.l	fp0b_0 = u_0, v0, r_0	C F
283		xma.hu	fp1a_0 = u_0, v0, r_0	C F
284	;;
285		getfsig	pr0_3 = fp0b_3		C M
286		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
287		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
288	;;
289		getfsig	pr1_2 = fp1b_2		C M
290		getfsig	acc1_2 = fp2a_2		C M
291		xma.l	fp0b_1 = u_1, v0, r_1	C F
292		xma.hu	fp1a_1 = u_1, v0, r_1	C F
293		br	L(cj5)
294
C Long path: the same products, plus the loads of r_3 and u_3 for the loop.
295L(gt5):		xma.l	fp0b_0 = u_0, v0, r_0
296		xma.hu	fp1a_0 = u_0, v0, r_0
297	;;
298		getfsig	pr0_3 = fp0b_3
299		ldf8	r_3 = [srp], 8
300		xma.l	fp1b_3 = uy, v1, fp1a_3
301		xma.hu	fp2a_3 = uy, v1, fp1a_3
302	;;
303		ldf8	u_3 = [up], 8
304		getfsig	pr1_2 = fp1b_2
305		xma.l	fp0b_1 = u_1, v0, r_1
306	;;
307		getfsig	acc1_2 = fp2a_2
308		xma.hu	fp1a_1 = u_1, v0, r_1
309		br	L(01)
310
311
C Feed-in for n & 3 == 2.  The br.cloop comes first.  When the loop count in
C ar.lc is zero it falls through and the whole operation is completed inline
C without entering the loop or the wind-down code: three limbs are stored
C through rp, the top limb is left in r8, and ar.lc is restored from r2 before
C the return.  The ux * v1 pair is guarded by p11, which the mpn_addmul_2s
C stub clears.
312	ALIGN(32)
313L(b10):		br.cloop.dptk	L(gt2)
314		xma.l	fp0b_1 = ux, v0, rx
315		xma.hu	fp1b_1 = ux, v0, rx
316	;;
317		xma.l	fp0b_2 = uy, v0, ry
318		xma.hu	fp1a_2 = uy, v0, ry
319	;;
320		stf8	[rp] = fp0b_1, 8
321	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
322	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
323	;;
324		getfsig	acc0 = fp0b_2
325		xma.l	fp1b_2 = uy, v1, fp1a_2
326		xma.hu	fp2a_2 = uy, v1, fp1a_2
327	;;
328		getfsig	pr1_1 = fp1b_1
329		getfsig	acc1_1 = fp2a_1
330		mov	ar.lc = r2
331		getfsig	pr1_2 = fp1b_2
332		getfsig	r8 = fp2a_2
333	;;
334		add	s0 = pr1_1, acc0
335	;;
336		st8	[rp] = s0, 8
337		cmp.ltu	p8, p9 = s0, pr1_1
338		sub	r31 = -1, acc1_1
339	;;
C p8 is the carry out of the previous add.  r31 = -1 - acc1_1, so comparing
C r31 with pr1_2 (leu with carry in, ltu without) yields in p10 the carry out
C of pr1_2 + acc1_1 (+ 1), which is then added to r8.
340	.pred.rel "mutex", p8, p9
341	(p8)	add	acc0 = pr1_2, acc1_1, 1
342	(p9)	add	acc0 = pr1_2, acc1_1
343	(p8)	cmp.leu	p10, p0 = r31, pr1_2
344	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
345	;;
346		st8	[rp] = acc0, 8
347	(p10)	add	r8 = 1, r8
348		br.ret.sptk.many b0
349
350
C Long path: zeroes the acc1_0, pr1_0 and pr0_1 slots, starts the products,
C loads the next r and u limbs and presets p8 = p12 = false (p9 = p13 = true).
C The second br.cloop enters the main loop at L(top) if iterations remain,
C otherwise control goes straight to the wind-down code at L(end).
351L(gt2):
352 {.mmi;		ldf8	r_3 = [srp], 8
353		ldf8	u_3 = [up], 8
354		mov	acc1_0 = 0
355	;;
356}{.mfi;		ldf8	r_0 = [srp], 8
357		xma.l	fp0b_1 = ux, v0, rx
358		mov	pr1_0 = 0
359}{.mfi;		ldf8	u_0 = [up], 8
360		xma.hu	fp1b_1 = ux, v0, rx
361		mov	pr0_1 = 0
362	;;
363}		xma.l	fp0b_2 = uy, v0, ry
364		xma.hu	fp1a_2 = uy, v0, ry
365	;;
366		getfsig	acc0 = fp0b_1
367		ldf8	r_1 = [srp], 8
368	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
369	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
370	;;
371		ldf8	u_1 = [up], 8
372		xma.l	fp0b_3 = u_3, v0, r_3
373		xma.hu	fp1a_3 = u_3, v0, r_3
374	;;
375		getfsig	pr0_2 = fp0b_2
376		ldf8	r_2 = [srp], 8
377		xma.l	fp1b_2 = uy, v1, fp1a_2
378		xma.hu	fp2a_2 = uy, v1, fp1a_2
379	;;
380		ldf8	u_2 = [up], 8
381		getfsig	pr1_1 = fp1b_1
382	;;
383 {.mfi;		getfsig	acc1_1 = fp2a_1
384		xma.l	fp0b_0 = u_0, v0, r_0
385		cmp.ne	p8, p9 = r0, r0
386}{.mfb;		cmp.ne	p12, p13 = r0, r0
387		xma.hu	fp1a_0 = u_0, v0, r_0
388		br.cloop.sptk.clr	L(top)
389}
390		br.many	L(end)
391
392
C Feed-in for n & 3 == 3.  Zeroes the pr1_3, pr0_0 and acc1_3 slots.  The
C ux * v1 pair is guarded by p15, which the mpn_addmul_2s stub clears.
C When the loop count in ar.lc is zero the br.cloop falls through and the
C short path joins the wind-down code at cj3; otherwise gt3 enters the main
C loop at L(11).  Both paths preset p6 = p10 = false (p7 = p11 = true).
393	ALIGN(32)
394L(b11):		ldf8	r_2 = [srp], 8
395		mov	pr1_3 = 0
396		mov	pr0_0 = 0
397	;;
398		ldf8	u_2 = [up], 8
399		mov	acc1_3 = 0
400		br.cloop.dptk	L(gt3)
401	;;
402		cmp.ne	p6, p7 = r0, r0
403		xma.l	fp0b_0 = ux, v0, rx
404		xma.hu	fp1b_0 = ux, v0, rx
405	;;
406		cmp.ne	p10, p11 = r0, r0
407		xma.l	fp0b_1 = uy, v0, ry
408		xma.hu	fp1a_1 = uy, v0, ry
409	;;
410		getfsig	acc0 = fp0b_0
411	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
412	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
413	;;
C Note: unlike gt3 below, this short path pairs uy * v1 with r_2 and
C u_2 * v0 with fp1a_1.  The same four terms are summed either way; only the
C grouping of addends differs.
414		xma.l	fp0b_2 = uy, v1, r_2
415		xma.hu	fp1a_2 = uy, v1, r_2
416	;;
417		getfsig	pr0_1 = fp0b_1
418		xma.l	fp1b_1 = u_2, v0, fp1a_1
419		xma.hu	fp2a_1 = u_2, v0, fp1a_1
420	;;
421		getfsig	pr1_0 = fp1b_0
422		getfsig	acc1_0 = fp2a_0
423		br	L(cj3)
424
C Long path: the products are interleaved with the loads of the next r and u
C limbs (slots 3, 0 and 1) that the main loop will consume.
425L(gt3):		ldf8	r_3 = [srp], 8
426		xma.l	fp0b_0 = ux, v0, rx
427		cmp.ne	p10, p11 = r0, r0
428		ldf8	u_3 = [up], 8
429		xma.hu	fp1b_0 = ux, v0, rx
430		cmp.ne	p6, p7 = r0, r0
431	;;
432		xma.l	fp0b_1 = uy, v0, ry
433		xma.hu	fp1a_1 = uy, v0, ry
434	;;
435		getfsig	acc0 = fp0b_0
436		ldf8	r_0 = [srp], 8
437	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
438		ldf8	u_0 = [up], 8
439	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
440	;;
441		xma.l	fp0b_2 = u_2, v0, r_2
442		xma.hu	fp1a_2 = u_2, v0, r_2
443	;;
444		getfsig	pr0_1 = fp0b_1
445		ldf8	r_1 = [srp], 8
446		xma.l	fp1b_1 = uy, v1, fp1a_1
447		xma.hu	fp2a_1 = uy, v1, fp1a_1
448	;;
449		ldf8	u_1 = [up], 8
450		getfsig	pr1_0 = fp1b_0
451	;;
452		getfsig	acc1_0 = fp2a_0
453		xma.l	fp0b_3 = u_3, v0, r_3
454		xma.hu	fp1a_3 = u_3, v0, r_3
455		br	L(11)
456
457
458C *** MAIN LOOP START ***
459	ALIGN(32)
C Four-way unrolled loop.  Each pass issues four u * v0 and four u * v1 xma
C pairs, loads four r and four u limbs, and stores four limbs through rp.
C The feed-in code enters at L(top), L(01), L(00) or L(11) depending on n & 3.
C
C Carries live in predicates.  A predicated add with the extra + 1 consumes
C the carry in, and the cmp.leu (carry in set) / cmp.ltu (carry in clear)
C pair that follows produces the carry out.  p6 .. p9 serve the acc0 chain
C (pr0 + acc1) and p10 .. p13 the s0 chain (pr1 + acc0).
460L(top):						C 00
461	.pred.rel "mutex", p12, p13
462		getfsig	pr0_3 = fp0b_3
463		ldf8	r_3 = [srp], 8
464		xma.l	fp1b_3 = u_3, v1, fp1a_3
465	(p12)	add	s0 = pr1_0, acc0, 1
466	(p13)	add	s0 = pr1_0, acc0
467		xma.hu	fp2a_3 = u_3, v1, fp1a_3
468	;;					C 01
469	.pred.rel "mutex", p8, p9
470	.pred.rel "mutex", p12, p13
471		ldf8	u_3 = [up], 8
472		getfsig	pr1_2 = fp1b_2
473	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
474	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
475	(p12)	cmp.leu	p10, p11 = s0, pr1_0
476	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
477	;;					C 02
478	.pred.rel "mutex", p6, p7
479		getfsig	acc1_2 = fp2a_2
480		st8	[rp] = s0, 8
481		xma.l	fp0b_1 = u_1, v0, r_1
482	(p6)	add	acc0 = pr0_2, acc1_0, 1
483	(p7)	add	acc0 = pr0_2, acc1_0
484		xma.hu	fp1a_1 = u_1, v0, r_1
485	;;					C 03
C Entry point from the n & 3 == 1 feed-in (gt5).
486L(01):
487	.pred.rel "mutex", p10, p11
488		getfsig	pr0_0 = fp0b_0
489		ldf8	r_0 = [srp], 8
490		xma.l	fp1b_0 = u_0, v1, fp1a_0
491	(p10)	add	s0 = pr1_1, acc0, 1
492	(p11)	add	s0 = pr1_1, acc0
493		xma.hu	fp2a_0 = u_0, v1, fp1a_0
494	;;					C 04
495	.pred.rel "mutex", p6, p7
496	.pred.rel "mutex", p10, p11
497		ldf8	u_0 = [up], 8
498		getfsig	pr1_3 = fp1b_3
499	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
500	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
501	(p10)	cmp.leu	p12, p13 = s0, pr1_1
502	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
503	;;					C 05
504	.pred.rel "mutex", p8, p9
505		getfsig	acc1_3 = fp2a_3
506		st8	[rp] = s0, 8
507		xma.l	fp0b_2 = u_2, v0, r_2
508	(p8)	add	acc0 = pr0_3, acc1_1, 1
509	(p9)	add	acc0 = pr0_3, acc1_1
510		xma.hu	fp1a_2 = u_2, v0, r_2
511	;;					C 06
C Entry point from the n & 3 == 0 feed-in (gt4).
512L(00):
513	.pred.rel "mutex", p12, p13
514		getfsig	pr0_1 = fp0b_1
515		ldf8	r_1 = [srp], 8
516		xma.l	fp1b_1 = u_1, v1, fp1a_1
517	(p12)	add	s0 = pr1_2, acc0, 1
518	(p13)	add	s0 = pr1_2, acc0
519		xma.hu	fp2a_1 = u_1, v1, fp1a_1
520	;;					C 07
521	.pred.rel "mutex", p8, p9
522	.pred.rel "mutex", p12, p13
523		ldf8	u_1 = [up], 8
524		getfsig	pr1_0 = fp1b_0
525	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
526	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
527	(p12)	cmp.leu	p10, p11 = s0, pr1_2
528	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
529	;;					C 08
530	.pred.rel "mutex", p6, p7
531		getfsig	acc1_0 = fp2a_0
532		st8	[rp] = s0, 8
533		xma.l	fp0b_3 = u_3, v0, r_3
534	(p6)	add	acc0 = pr0_0, acc1_2, 1
535	(p7)	add	acc0 = pr0_0, acc1_2
536		xma.hu	fp1a_3 = u_3, v0, r_3
537	;;					C 09
C Entry point from the n & 3 == 3 feed-in (gt3).
538L(11):
539	.pred.rel "mutex", p10, p11
540		getfsig	pr0_2 = fp0b_2
541		ldf8	r_2 = [srp], 8
542		xma.l	fp1b_2 = u_2, v1, fp1a_2
543	(p10)	add	s0 = pr1_3, acc0, 1
544	(p11)	add	s0 = pr1_3, acc0
545		xma.hu	fp2a_2 = u_2, v1, fp1a_2
546	;;					C 10
547	.pred.rel "mutex", p6, p7
548	.pred.rel "mutex", p10, p11
549		ldf8	u_2 = [up], 8
550		getfsig	pr1_1 = fp1b_1
551	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
552	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
553	(p10)	cmp.leu	p12, p13 = s0, pr1_3
554	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
555	;;					C 11
556	.pred.rel "mutex", p8, p9
557		getfsig	acc1_1 = fp2a_1
558		st8	[rp] = s0, 8
559		xma.l	fp0b_0 = u_0, v0, r_0
560	(p8)	add	acc0 = pr0_1, acc1_3, 1
561	(p9)	add	acc0 = pr0_1, acc1_3
562		xma.hu	fp1a_0 = u_0, v0, r_0
563L(10):		br.cloop.sptk.clr	L(top)	C 12
564	;;
565C *** MAIN LOOP END ***
C Wind-down.  It continues the add/compare carry chains of the main loop but
C performs no further loads: the xma pairs still outstanding are issued, the
C remaining limbs are stored through rp, the most significant limb is left
C in r8, and ar.lc is restored from r2 before the return.
C The short feed-in paths join at cj5, cj4 and cj3.
566L(end):
567	.pred.rel "mutex", p12, p13
568 {.mfi;		getfsig	pr0_3 = fp0b_3
569		xma.l	fp1b_3 = u_3, v1, fp1a_3
570	(p12)	add	s0 = pr1_0, acc0, 1
571}{.mfi;	(p13)	add	s0 = pr1_0, acc0
572		xma.hu	fp2a_3 = u_3, v1, fp1a_3
573		nop	1
574	;;
575}	.pred.rel "mutex", p8, p9
576	.pred.rel "mutex", p12, p13
577 {.mmi;		getfsig	pr1_2 = fp1b_2
578		st8	[rp] = s0, 8
579	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
580}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
581	(p12)	cmp.leu	p10, p11 = s0, pr1_0
582	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
583	;;
584}	.pred.rel "mutex", p6, p7
585 {.mfi;		getfsig	acc1_2 = fp2a_2
586		xma.l	fp0b_1 = u_1, v0, r_1
587		nop	1
588}{.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
589	(p7)	add	acc0 = pr0_2, acc1_0
590		xma.hu	fp1a_1 = u_1, v0, r_1
591	;;
592}
C Join point for the short n & 3 == 1 path (from b01).
593L(cj5):
594	.pred.rel "mutex", p10, p11
595 {.mfi;		getfsig	pr0_0 = fp0b_0
596		xma.l	fp1b_0 = u_0, v1, fp1a_0
597	(p10)	add	s0 = pr1_1, acc0, 1
598}{.mfi;	(p11)	add	s0 = pr1_1, acc0
599		xma.hu	fp2a_0 = u_0, v1, fp1a_0
600		nop	1
601	;;
602}	.pred.rel "mutex", p6, p7
603	.pred.rel "mutex", p10, p11
604 {.mmi;		getfsig	pr1_3 = fp1b_3
605	st8	[rp] = s0, 8
606	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
607}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
608	(p10)	cmp.leu	p12, p13 = s0, pr1_1
609	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
610	;;
611}	.pred.rel "mutex", p8, p9
612 {.mfi;		getfsig	acc1_3 = fp2a_3
613		xma.l	fp0b_2 = u_2, v0, r_2
614		nop	1
615}{.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
616	(p9)	add	acc0 = pr0_3, acc1_1
617		xma.hu	fp1a_2 = u_2, v0, r_2
618	;;
619}
C Join point for the short n & 3 == 0 path (from b00).
620L(cj4):
621	.pred.rel "mutex", p12, p13
622 {.mfi;		getfsig	pr0_1 = fp0b_1
623		xma.l	fp1b_1 = u_1, v1, fp1a_1
624	(p12)	add	s0 = pr1_2, acc0, 1
625}{.mfi;	(p13)	add	s0 = pr1_2, acc0
626		xma.hu	fp2a_1 = u_1, v1, fp1a_1
627		nop	1
628	;;
629}	.pred.rel "mutex", p8, p9
630	.pred.rel "mutex", p12, p13
631 {.mmi;		getfsig	pr1_0 = fp1b_0
632		st8	[rp] = s0, 8
633	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
634}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
635	(p12)	cmp.leu	p10, p11 = s0, pr1_2
636	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
637	;;
638}	.pred.rel "mutex", p6, p7
639 {.mmi;		getfsig	acc1_0 = fp2a_0
640	(p6)	add	acc0 = pr0_0, acc1_2, 1
641	(p7)	add	acc0 = pr0_0, acc1_2
642	;;
643}
C Join point for the short n & 3 == 3 path (from b11).  The last xma pair of
C the whole operation is issued here; everything after it is carry
C propagation and stores.
644L(cj3):
645	.pred.rel "mutex", p10, p11
646 {.mfi;		getfsig	pr0_2 = fp0b_2
647		xma.l	fp1b_2 = u_2, v1, fp1a_2
648	(p10)	add	s0 = pr1_3, acc0, 1
649}{.mfi;	(p11)	add	s0 = pr1_3, acc0
650		xma.hu	fp2a_2 = u_2, v1, fp1a_2
651		nop	1
652	;;
653}	.pred.rel "mutex", p6, p7
654	.pred.rel "mutex", p10, p11
655 {.mmi;		getfsig	pr1_1 = fp1b_1
656		st8	[rp] = s0, 8
657	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
658}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
659	(p10)	cmp.leu	p12, p13 = s0, pr1_3
660	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
661	;;
662}	.pred.rel "mutex", p8, p9
663 {.mmi;		getfsig	acc1_1 = fp2a_1
664	(p8)	add	acc0 = pr0_1, acc1_3, 1
665	(p9)	add	acc0 = pr0_1, acc1_3
666	;;
667}	.pred.rel "mutex", p12, p13
668 {.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
669	(p13)	add	s0 = pr1_0, acc0
670		nop	1
671	;;
672}	.pred.rel "mutex", p8, p9
673	.pred.rel "mutex", p12, p13
674 {.mmi;		getfsig	pr1_2 = fp1b_2
675		st8	[rp] = s0, 8
676	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
677}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
678	(p12)	cmp.leu	p10, p11 = s0, pr1_0
679	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
680	;;
681}	.pred.rel "mutex", p6, p7
682 {.mmi;		getfsig	r8 = fp2a_2
683	(p6)	add	acc0 = pr0_2, acc1_0, 1
684	(p7)	add	acc0 = pr0_2, acc1_0
685	;;
686}	.pred.rel "mutex", p10, p11
687 {.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
688	(p11)	add	s0 = pr1_1, acc0
689	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
690	;;
691}	.pred.rel "mutex", p10, p11
692 {.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
693	(p10)	cmp.leu	p12, p13 = s0, pr1_1
694	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
695	;;
696}	.pred.rel "mutex", p8, p9
697 {.mmi;		st8	[rp] = s0, 8
698	(p8)	add	acc0 = pr1_2, acc1_1, 1
699	(p9)	add	acc0 = pr1_2, acc1_1
700	;;
C Final limb: p10 first receives the carry out of pr1_2 + acc1_1 (+ p8).  The
C last s0-chain carry (p12) is then added to acc0, and if that increment
C leaves acc0 equal to zero the cmpeqor also sets p10, so the carry reaches r8.
C cmpeqor is a macro not defined in this file (presumably cmp.eq.or).
701}	.pred.rel "mutex", p8, p9
702 {.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
703	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
704	(p12)	add	acc0 = 1, acc0
705	;;
706}{.mmi;		st8	[rp] = acc0, 8
707	(p12)	cmpeqor	p10, p0 = 0, acc0
708		nop	1
709	;;
710}{.mib;	(p10)	add	r8 = 1, r8
711		mov	ar.lc = r2
712		br.ret.sptk.many b0
713}
714EPILOGUE()
715ASM_END()
716