xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/mul_2.asm (revision 62f324d0121177eaf2e0384f92fd9ca2a751c795)
1dnl  IA-64 mpn_mul_2 -- Multiply an n-limb number by a 2-limb number and store
2dnl  the low n+1 limbs of the result, returning the most significant limb.
3
4dnl  Copyright 2004 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C         cycles/limb
24C Itanium:    3.15
25C Itanium 2:  1.625
26
27C Note that this is very similar to addmul_2.asm.  If you change this file,
28C please change that file too.
29
30C TODO
31C  * Clean up variable names, and try to decrease the number of distinct
32C    registers used.
33C  * Cleanup feed-in code to not require zeroing several registers.
34C  * Make sure we don't depend on uninitialized predicate registers.
35C  * We currently cross-jump very aggressively, at the expense of a few cycles
36C    per operation.  Consider changing that.
37C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
38C    wind-down code.
39C  * Ultimately rewrite.  The problem with this code is that it first uses a
40C    loaded u value in one xma pair, then leaves it live over several unrelated
41C    xma pairs, before it uses it again.  It should actually be quite possible
42C    to just swap some aligned xma pairs around.  But we should then schedule
43C    u loads further from the first use.
44
45C INPUT PARAMETERS
46define(`rp',`r32')
47define(`up',`r33')
48define(`n',`r34')
49define(`vp',`r35')
50
C srp is defined here but no instruction in this file refers to it.
51define(`srp',`r3')
52
C The two multiplier limbs, loaded from vp in the function prologue.
53define(`v0',`f6')
54define(`v1',`f7')
55
C s0 holds each finished result limb just before it is stored and acc0 the
C partial sum that feeds it.  r14 is also used under its raw name in the
C prologue to hold n mod 4.
56define(`s0',`r14')
57define(`acc0',`r15')
58
C Integer copies (via getf.sig) of the floating-point product pieces below.
C The suffix _0.._3 is the stage of the four-way unrolled loop:
C   pr0_x <- fp0b_x,  pr1_x <- fp1b_x,  acc1_x <- fp2a_x.
59define(`pr0_0',`r16') define(`pr0_1',`r17')
60define(`pr0_2',`r18') define(`pr0_3',`r19')
61
62define(`pr1_0',`r20') define(`pr1_1',`r21')
63define(`pr1_2',`r22') define(`pr1_3',`r23')
64
65define(`acc1_0',`r24') define(`acc1_1',`r25')
66define(`acc1_2',`r26') define(`acc1_3',`r27')
67
68dnl define(`',`r28')
69dnl define(`',`r29')
70dnl define(`',`r30')
71dnl define(`',`r31')
72
C Product pieces as written by the xma instructions in the live code paths:
C   fp0b_x = low  64 bits of u * v0            (xma.l,  addend f0)
C   fp1a_x = high 64 bits of u * v0            (xma.hu, addend f0)
C   fp1b_x = low  64 bits of u * v1 + fp1a_x   (xma.l)
C   fp2a_x = high 64 bits of u * v1 + fp1a_x   (xma.hu)
73define(`fp0b_0',`f8') define(`fp0b_1',`f9')
74define(`fp0b_2',`f10') define(`fp0b_3',`f11')
75
76define(`fp1a_0',`f12') define(`fp1a_1',`f13')
77define(`fp1a_2',`f14') define(`fp1a_3',`f15')
78
79define(`fp1b_0',`f32') define(`fp1b_1',`f33')
80define(`fp1b_2',`f34') define(`fp1b_3',`f35')
81
82define(`fp2a_0',`f36') define(`fp2a_1',`f37')
83define(`fp2a_2',`f38') define(`fp2a_3',`f39')
84
C Limbs of the n-limb operand loaded from up, one register per loop stage.
85define(`u_0',`f44') define(`u_1',`f45')
86define(`u_2',`f46') define(`u_3',`f47')
87
C The first two limbs of the n-limb operand, loaded in the prologue before
C the dispatch on n mod 4.
88define(`ux',`f49')
89define(`uy',`f51')
90
91ASM_START()
C mpn_mul_2 multiplies the n-limb operand at up by the 2-limb operand at vp,
C stores the low result limbs through rp and leaves the top limb in r8.
C ar.lc is saved in r2 on entry and restored before every br.ret.
C NOTE(review): two u limbs are loaded unconditionally and the loop count is
C computed from n-2, so n >= 2 appears to be required -- confirm with callers.
92PROLOGUE(mpn_mul_2)
93	.prologue
94	.save	ar.lc, r2
95	.body
96
C 32-bit ABI only: widen the three pointer arguments with addp4 and
C zero-extend n.
97ifdef(`HAVE_ABI_32',
98`	addp4		rp = 0, rp		C			M I
99	addp4		up = 0, up		C			M I
100	addp4		vp = 0, vp		C			M I
101	zxt4		n = n			C			I
102	;;')
103
C Load the first u limb and v0, save ar.lc in r2, and start splitting n:
C r14 = n mod 4 selects the feed-in path, n - 2 is the basis of the loop count.
104{.mmi		C 00
105	ldf8		ux = [up], 8		C			M
106	ldf8		v0 = [vp], 8		C			M
107	mov.i		r2 = ar.lc		C			I0
108}{.mmi
109	nop		0			C			M
110	and		r14 = 3, n		C			M I
111	add		n = -2, n		C			M I
112	;;
C Load the second u limb and v1; n becomes (n - 2) / 4 (unsigned shift).
C p10 and p11 flag n mod 4 == 1 and n mod 4 == 2.
113}{.mmi		C 01
114	ldf8		uy = [up], 8		C			M
115	ldf8		v1 = [vp]		C			M
116	shr.u		n = n, 2		C			I
117}{.mmi
118	nop		0			C			M
119	cmp.eq		p10, p0 = 1, r14	C			M I
120	cmp.eq		p11, p0 = 2, r14	C			M I
121	;;
C p12 flags n mod 4 == 3; the loop counter is set, then control goes to the
C matching feed-in block.  With none of p10-p12 set (n mod 4 == 0) execution
C falls through to .Lb00.
122}{.mmi		C 02
123	nop		0			C			M
124	cmp.eq		p12, p0 = 3, r14	C			M I
125	mov.i		ar.lc = n		C			I0
126}{.bbb
127  (p10) br.dptk		.Lb01			C			B
128  (p11) br.dptk		.Lb10			C			B
129  (p12) br.dptk		.Lb11			C			B
130	;;
131}
132
133	ALIGN(32)
C Feed-in for n mod 4 == 0, reached by falling through the dispatch bundle.
C acc1_2, pr1_2 and pr0_3 are zeroed because the entry points used below
C (.LL00 and .Lcj4) read them before any product has been moved into them.
C cmp.ne r0, r0 is always false, so it clears p8 and p12 (no carry pending)
C and sets their complements p9 and p13.
134.Lb00:	ldf8		u_1 = [up], 8
135	mov		acc1_2 = 0
136	mov		pr1_2 = 0
137	mov		pr0_3 = 0
138	cmp.ne		p8, p9 = r0, r0
139	;;
140	xma.l		fp0b_3 = ux, v0, f0
141	cmp.ne		p12, p13 = r0, r0
142	ldf8		u_2 = [up], 8
143	xma.hu		fp1a_3 = ux, v0, f0
C br.cloop branches (and decrements ar.lc) only while ar.lc is nonzero, i.e.
C when more than four limbs are to be processed.
144	br.cloop.dptk	.grt4
145
C ar.lc was zero: ux, uy, u_1 and u_2 are all the u limbs that get used.
C Issue the first products and join the wind-down code at .Lcj4.
146	xma.l		fp0b_0 = uy, v0, f0
147	xma.hu		fp1a_0 = uy, v0, f0
148	;;
149	getf.sig	acc0 = fp0b_3
150	xma.l		fp1b_3 = ux, v1, fp1a_3
151	xma.hu		fp2a_3 = ux, v1, fp1a_3
152	;;
153	xma.l		fp0b_1 = u_1, v0, f0
154	xma.hu		fp1a_1 = u_1, v0, f0
155	;;
156	getf.sig	pr0_0 = fp0b_0
157	xma.l		fp1b_0 = uy, v1, fp1a_0
158	xma.hu		fp2a_0 = uy, v1, fp1a_0
159	;;
160	getf.sig	pr1_3 = fp1b_3
161	getf.sig	acc1_3 = fp2a_3
162	xma.l		fp0b_2 = u_2, v0, f0
163	xma.hu		fp1a_2 = u_2, v0, f0
164	br		.Lcj4
165
C More limbs follow: same product sequence, but u_3 and u_0 are loaded along
C the way and control enters the main loop at .LL00.
166.grt4:	xma.l		fp0b_0 = uy, v0, f0
167	xma.hu		fp1a_0 = uy, v0, f0
168	;;
169	getf.sig	acc0 = fp0b_3
170	xma.l		fp1b_3 = ux, v1, fp1a_3
171	ldf8		u_3 = [up], 8
172	xma.hu		fp2a_3 = ux, v1, fp1a_3
173	;;
174	xma.l		fp0b_1 = u_1, v0, f0
175	xma.hu		fp1a_1 = u_1, v0, f0
176	;;
177	getf.sig	pr0_0 = fp0b_0
178	xma.l		fp1b_0 = uy, v1, fp1a_0
179	xma.hu		fp2a_0 = uy, v1, fp1a_0
180	;;
181	ldf8		u_0 = [up], 8
182	getf.sig	pr1_3 = fp1b_3
183	;;
184	getf.sig	acc1_3 = fp2a_3
185	xma.l		fp0b_2 = u_2, v0, f0
186	xma.hu		fp1a_2 = u_2, v0, f0
187	br		.LL00
188
189
190	ALIGN(32)
C Feed-in for n mod 4 == 1 (dispatch predicate p10).
C acc1_1, pr1_1 and pr0_2 are zeroed because the entry points used below
C (.LL01 and .Lcj5) read them before any product has been moved into them.
C The two cmp.ne r0, r0 clear the carry predicates p6 and p10 and set their
C complements p7 and p11.
191.Lb01:	ldf8		u_0 = [up], 8		C M
192	mov		acc1_1 = 0		C M I
193	mov		pr1_1 = 0		C M I
194	mov		pr0_2 = 0		C M I
195	cmp.ne		p6, p7 = r0, r0		C M I
196	;;
197	xma.l		fp0b_2 = ux, v0, f0	C F
198	cmp.ne		p10, p11 = r0, r0	C M I
199	ldf8		u_1 = [up], 8		C M
200	xma.hu		fp1a_2 = ux, v0, f0	C F
201	;;
202	xma.l		fp0b_3 = uy, v0, f0	C F
203	xma.hu		fp1a_3 = uy, v0, f0	C F
204	;;
205	getf.sig	acc0 = fp0b_2		C M
206	xma.l		fp1b_2 = ux, v1,fp1a_2	C F
207	xma.hu		fp2a_2 = ux, v1,fp1a_2	C F
208	ldf8		u_2 = [up], 8		C M
C br.cloop branches (and decrements ar.lc) only while ar.lc is nonzero.
209	br.cloop.dptk	.grt5
210
C ar.lc was zero: no further u limbs are loaded; finish the feed-in products
C and join the wind-down code at .Lcj5.
211	xma.l		fp0b_0 = u_0, v0, f0	C F
212	xma.hu		fp1a_0 = u_0, v0, f0	C F
213	;;
214	getf.sig	pr0_3 = fp0b_3		C M
215	xma.l		fp1b_3 = uy, v1,fp1a_3	C F
216	xma.hu		fp2a_3 = uy, v1,fp1a_3	C F
217	;;
218	getf.sig	pr1_2 = fp1b_2		C M
219	getf.sig	acc1_2 = fp2a_2		C M
220	xma.l		fp0b_1 = u_1, v0, f0	C F
221	xma.hu		fp1a_1 = u_1, v0, f0	C F
222	br		.Lcj5
223
C More limbs follow: same product sequence, with u_3 loaded along the way,
C then enter the main loop at .LL01.
224.grt5:	xma.l		fp0b_0 = u_0, v0, f0
225	xma.hu		fp1a_0 = u_0, v0, f0
226	;;
227	getf.sig	pr0_3 = fp0b_3
228	xma.l		fp1b_3 = uy, v1, fp1a_3
229	xma.hu		fp2a_3 = uy, v1, fp1a_3
230	;;
231	ldf8		u_3 = [up], 8
232	getf.sig	pr1_2 = fp1b_2
233	;;
234	getf.sig	acc1_2 = fp2a_2
235	xma.l		fp0b_1 = u_1, v0, f0
236	xma.hu		fp1a_1 = u_1, v0, f0
237	br		.LL01
238
239
240C We have two variants for n = 2.  They turn out to run at exactly the same
241C speed.  But the first, odd variant might allow one cycle to be trimmed.
242	ALIGN(32)
C The first variant below is wrapped in ifdef with an empty macro name.  That
C name is presumably never defined, so m4 is expected to drop this whole block
C and assemble only the second .Lb10 that follows -- confirm.  In that case
C the ALIGN above applies to the second variant.
C This variant issues all eight xma with f0 as addend and sums the pieces in
C integer registers r16-r27, returning the top limb in r8.
243ifdef(`',`
244.Lb10:		C 03
245	br.cloop.dptk	.grt2
246		C 04
247		C 05
248		C 06
249	xma.l		fp0b_1 = ux, v0, f0	C 0
250	xma.hu		fp1a_1 = ux, v0, f0	C 1
251	;;	C 07
252	xma.l		fp0b_2 = uy, v0, f0	C 1
253	xma.l		fp1b_1 = ux, v1, f0	C 1
254	;;	C 08
255	xma.hu		fp1a_2 = uy, v0, f0	C 2
256	xma.hu		fp2a_1 = ux, v1, f0	C 2
257	;;	C 09
258	xma.l		fp1b_2 = uy, v1, f0	C 2
259	xma.hu		fp2a_2 = uy, v1, f0	C 3
260	;;	C 10
261	getf.sig	r16 = fp1a_1
262	stf8		[rp] = fp0b_1, 8
263	;;	C 11
264	getf.sig	r17 = fp0b_2
265		C 12
266	getf.sig	r18 = fp1b_1
267		C 13
268	getf.sig	r19 = fp1a_2
269		C 14
270	getf.sig	r20 = fp2a_1
271		C 15
272	getf.sig	r21 = fp1b_2
273	;;	C 16
274	getf.sig	r8 = fp2a_2
275	add		r24 = r16, r17
276	;;	C 17
277	cmp.ltu		p6, p7 = r24, r16
278	add		r26 = r24, r18
279	;;	C 18
280	cmp.ltu		p8, p9 = r26, r24
281	;;	C 19
282	st8		[rp] = r26, 8
283  (p6)	add		r25 = r19, r20, 1
284  (p7)	add		r25 = r19, r20
285	;;	C 20
286  (p8)	add		r27 = r25, r21, 1
287  (p9)	add		r27 = r25, r21
288  (p6)	cmp.leu		p10, p0 = r25, r19
289  (p7)	cmp.ltu		p10, p0 = r25, r19
290	;;	C 21
291  (p10)	add		r8 = 1, r8
292  (p8)	cmp.leu		p12, p0 = r27, r25
293  (p9)	cmp.ltu		p12, p0 = r27, r25
294	;;	C 22
295	st8		[rp] = r27, 8
296	mov.i		ar.lc = r2
297  (p12)	add		r8 = 1, r8
298	br.ret.sptk.many b0
299')
300
301.Lb10:		C 03
C Feed-in for n mod 4 == 2 (dispatch predicate p11).  br.cloop branches to
C .grt2 (and decrements ar.lc) only while ar.lc is nonzero; otherwise the
C straight-line code below handles the whole product and returns.
302	br.cloop.dptk	.grt2
303		C 04
304		C 05
305		C 06
C Low and high halves of ux*v0 and uy*v0.
306	xma.l		fp0b_1 = ux, v0, f0
307	xma.hu		fp1a_1 = ux, v0, f0
308	;;	C 07
309	xma.l		fp0b_2 = uy, v0, f0
310	xma.hu		fp1a_2 = uy, v0, f0
311	;;	C 08
312		C 09
313		C 10
C The lowest result limb is low(ux*v0); it is stored straight from the FP
C register.  The v1 products take the high half of the v0 product as addend.
314	stf8		[rp] = fp0b_1, 8
315	xma.l		fp1b_1 = ux, v1, fp1a_1
316	xma.hu		fp2a_1 = ux, v1, fp1a_1
317	;;	C 11
318	getf.sig	acc0 = fp0b_2
319	xma.l		fp1b_2 = uy, v1, fp1a_2
320	xma.hu		fp2a_2 = uy, v1, fp1a_2
321	;;	C 12
322		C 13
323		C 14
324	getf.sig	pr1_1 = fp1b_1
325		C 15
326	getf.sig	acc1_1 = fp2a_1
327		C 16
328	getf.sig	pr1_2 = fp1b_2
329		C 17
C r8 starts as the high half of uy*v1 + high(uy*v0); a carry may be added
C to it just before the return.
330	getf.sig	r8 = fp2a_2
331	;;	C 18
332		C 19
C Second result limb: low(ux*v1 + high(ux*v0)) + low(uy*v0).
333	add		s0 = pr1_1, acc0
334	;;	C 20
335	st8		[rp] = s0, 8
C p8 = carry out of the add above (sum below one of its addends), p9 = none.
336	cmp.ltu		p8, p9 = s0, pr1_1
C r31 = -1 - acc1_1, the largest value that can be added to acc1_1 without
C wrapping; it is compared against pr1_2 below to predict the carry.
337	sub		r31 = -1, acc1_1
338	;;	C 21
339	.pred.rel "mutex", p8, p9
C Third result limb: pr1_2 + acc1_1 plus the carry in p8.  p10 is set when
C that sum wraps: with carry-in when r31 <= pr1_2, without when r31 < pr1_2.
340  (p8)	add		acc0 = pr1_2, acc1_1, 1
341  (p9)	add		acc0 = pr1_2, acc1_1
342  (p8)	cmp.leu		p10, p0 = r31, pr1_2
343  (p9)	cmp.ltu		p10, p0 = r31, pr1_2
344	;;	C 22
C Store the third limb, restore ar.lc, add the final carry into r8, return.
345	st8		[rp] = acc0, 8
346	mov.i		ar.lc = r2
347  (p10)	add		r8 = 1, r8
348	br.ret.sptk.many b0
349
350
C Feed-in for n mod 4 == 2 when ar.lc was nonzero at .Lb10.
C acc1_0, pr1_0 and pr0_1 are zeroed because the loop code reached through
C .LL10 reads them at .Loop before any product has been moved into them.
C Four more u limbs (u_3, u_0, u_1, u_2) are loaded along the way.
350.grt2:	ldf8		u_3 = [up], 8
351	mov		acc1_0 = 0
352	mov		pr1_0 = 0
353	;;
354	mov		pr0_1 = 0
355	xma.l		fp0b_1 = ux, v0, f0
356	ldf8		u_0 = [up], 8
357	xma.hu		fp1a_1 = ux, v0, f0
358	;;
359	xma.l		fp0b_2 = uy, v0, f0
360	xma.hu		fp1a_2 = uy, v0, f0
361	;;
362	getf.sig	acc0 = fp0b_1
363	xma.l		fp1b_1 = ux, v1, fp1a_1
364	xma.hu		fp2a_1 = ux, v1, fp1a_1
365	;;
366	ldf8		u_1 = [up], 8
367	xma.l		fp0b_3 = u_3, v0, f0
368	xma.hu		fp1a_3 = u_3, v0, f0
369	;;
370	getf.sig	pr0_2 = fp0b_2
371	xma.l		fp1b_2 = uy, v1, fp1a_2
372	xma.hu		fp2a_2 = uy, v1, fp1a_2
373	;;
374	ldf8		u_2 = [up], 8
375	getf.sig	pr1_1 = fp1b_1
376	;;
377	getf.sig	acc1_1 = fp2a_1
378	xma.l		fp0b_0 = u_0, v0, f0
C cmp.ne r0, r0 is always false: clear carry predicates p8 and p12 and set
C their complements p9 and p13, as read by the first groups of .Loop.
379	cmp.ne		p8, p9 = r0, r0
380	cmp.ne		p12, p13 = r0, r0
381	xma.hu		fp1a_0 = u_0, v0, f0
C .LL10 is the loop-closing br.cloop itself.
382	br		.LL10
384
385
385	ALIGN(32)
C Feed-in for n mod 4 == 3 (dispatch predicate p12).
C acc1_3, pr1_3 and pr0_0 are zeroed because the entry points used below
C (.LL11 and .Lcj3) read them before any product has been moved into them.
C The cmp.ne r0, r0 instructions clear the carry predicates p6 and p10 and
C set their complements p7 and p11.
386.Lb11:	mov		acc1_3 = 0
387	mov		pr1_3 = 0
388	mov		pr0_0 = 0
389	cmp.ne		p6, p7 = r0, r0
390	;;
391	ldf8		u_2 = [up], 8
C br.cloop branches (and decrements ar.lc) only while ar.lc is nonzero.
392	br.cloop.dptk	.grt3
393	;;
C ar.lc was zero: ux, uy and u_2 are all the u limbs that get used.  Issue
C the feed-in products and join the wind-down code at .Lcj3.
394	xma.l		fp0b_0 = ux, v0, f0
395	xma.hu		fp1a_0 = ux, v0, f0
396	;;
397	cmp.ne		p10, p11 = r0, r0
398	xma.l		fp0b_1 = uy, v0, f0
399	xma.hu		fp1a_1 = uy, v0, f0
400	;;
401	getf.sig	acc0 = fp0b_0
402	xma.l		fp1b_0 = ux, v1, fp1a_0
403	xma.hu		fp2a_0 = ux, v1, fp1a_0
404	;;
405	xma.l		fp0b_2 = u_2, v0, f0
406	xma.hu		fp1a_2 = u_2, v0, f0
407	;;
408	getf.sig	pr0_1 = fp0b_1
409	xma.l		fp1b_1 = uy, v1, fp1a_1
410	xma.hu		fp2a_1 = uy, v1, fp1a_1
411	;;
412	getf.sig	pr1_0 = fp1b_0
413	getf.sig	acc1_0 = fp2a_0
414	br		.Lcj3
415
C More limbs follow: same product sequence, with u_3, u_0 and u_1 loaded
C along the way, then enter the main loop at .LL11.
416.grt3:	xma.l		fp0b_0 = ux, v0, f0
417	cmp.ne		p10, p11 = r0, r0
418	ldf8		u_3 = [up], 8
419	xma.hu		fp1a_0 = ux, v0, f0
420	;;
421	xma.l		fp0b_1 = uy, v0, f0
422	xma.hu		fp1a_1 = uy, v0, f0
423	;;
424	getf.sig	acc0 = fp0b_0
425	xma.l		fp1b_0 = ux, v1, fp1a_0
426	ldf8		u_0 = [up], 8
427	xma.hu		fp2a_0 = ux, v1, fp1a_0
428	;;
429	xma.l		fp0b_2 = u_2, v0, f0
430	xma.hu		fp1a_2 = u_2, v0, f0
431	;;
432	getf.sig	pr0_1 = fp0b_1
433	xma.l		fp1b_1 = uy, v1, fp1a_1
434	xma.hu		fp2a_1 = uy, v1, fp1a_1
435	;;
436	ldf8		u_1 = [up], 8
437	getf.sig	pr1_0 = fp1b_0
438	;;
439	getf.sig	acc1_0 = fp2a_0
440	xma.l		fp0b_3 = u_3, v0, f0
441	xma.hu		fp1a_3 = u_3, v0, f0
442	br		.LL11
444
445
446C *** MAIN LOOP START ***
C The loop is unrolled four times: one pass loads four u limbs and stores
C four result limbs, one st8 per three instruction groups.  The feed-in
C blocks enter at .LL01, .LL00, .LL11 or .LL10 according to n mod 4.
C
C Each stored limb is built from pieces of three consecutive u limbs:
C   acc0 = pr0 (low of u*v0) + acc1 (high piece from two limbs back) + carry
C   s0   = pr1 (low of u*v1 + high(u*v0), one limb back) + acc0 + carry
C Carries are kept in complementary predicate pairs (p6/p7, p8/p9, p10/p11,
C p12/p13).  After an add with carry-in the carry-out test is cmp.leu
C (sum <= addend); after an add without carry-in it is cmp.ltu (sum < addend).
447	ALIGN(32)
448.Loop:						C 00
449	.pred.rel "mutex", p12, p13
450	getf.sig	pr0_3 = fp0b_3
451	xma.l		fp1b_3 = u_3, v1, fp1a_3
452  (p12)	add		s0 = pr1_0, acc0, 1
453  (p13)	add		s0 = pr1_0, acc0
454	xma.hu		fp2a_3 = u_3, v1, fp1a_3
455	;;					C 01
456	.pred.rel "mutex", p8, p9
457	.pred.rel "mutex", p12, p13
C u_3 has just been consumed by the v1 products above, so it is reloaded.
458	ldf8		u_3 = [up], 8
459	getf.sig	pr1_2 = fp1b_2
460  (p8)	cmp.leu		p6, p7 = acc0, pr0_1
461  (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
462  (p12)	cmp.leu		p10, p11 = s0, pr1_0
463  (p13)	cmp.ltu		p10, p11 = s0, pr1_0
464	;;					C 02
465	.pred.rel "mutex", p6, p7
466	getf.sig	acc1_2 = fp2a_2
467	st8		[rp] = s0, 8
468	xma.l		fp0b_1 = u_1, v0, f0
469  (p6)	add		acc0 = pr0_2, acc1_0, 1
470  (p7)	add		acc0 = pr0_2, acc1_0
471	xma.hu		fp1a_1 = u_1, v0, f0
472	;;					C 03
C Entry point from the n mod 4 == 1 feed-in (.grt5).
473.LL01:
474	.pred.rel "mutex", p10, p11
475	getf.sig	pr0_0 = fp0b_0
476	xma.l		fp1b_0 = u_0, v1, fp1a_0
477  (p10)	add		s0 = pr1_1, acc0, 1
478  (p11)	add		s0 = pr1_1, acc0
479	xma.hu		fp2a_0 = u_0, v1, fp1a_0
480	;;					C 04
481	.pred.rel "mutex", p6, p7
482	.pred.rel "mutex", p10, p11
483	ldf8		u_0 = [up], 8
484	getf.sig	pr1_3 = fp1b_3
485  (p6)	cmp.leu		p8, p9 = acc0, pr0_2
486  (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
487  (p10)	cmp.leu		p12, p13 = s0, pr1_1
488  (p11)	cmp.ltu		p12, p13 = s0, pr1_1
489	;;					C 05
490	.pred.rel "mutex", p8, p9
491	getf.sig	acc1_3 = fp2a_3
492	st8		[rp] = s0, 8
493	xma.l		fp0b_2 = u_2, v0, f0
494  (p8)	add		acc0 = pr0_3, acc1_1, 1
495  (p9)	add		acc0 = pr0_3, acc1_1
496	xma.hu		fp1a_2 = u_2, v0, f0
497	;;					C 06
C Entry point from the n mod 4 == 0 feed-in (.grt4).
498.LL00:
499	.pred.rel "mutex", p12, p13
500	getf.sig	pr0_1 = fp0b_1
501	xma.l		fp1b_1 = u_1, v1, fp1a_1
502  (p12)	add		s0 = pr1_2, acc0, 1
503  (p13)	add		s0 = pr1_2, acc0
504	xma.hu		fp2a_1 = u_1, v1, fp1a_1
505	;;					C 07
506	.pred.rel "mutex", p8, p9
507	.pred.rel "mutex", p12, p13
508	ldf8		u_1 = [up], 8
509	getf.sig	pr1_0 = fp1b_0
510  (p8)	cmp.leu		p6, p7 = acc0, pr0_3
511  (p9)	cmp.ltu		p6, p7 = acc0, pr0_3
512  (p12)	cmp.leu		p10, p11 = s0, pr1_2
513  (p13)	cmp.ltu		p10, p11 = s0, pr1_2
514	;;					C 08
515	.pred.rel "mutex", p6, p7
516	getf.sig	acc1_0 = fp2a_0
517	st8		[rp] = s0, 8
518	xma.l		fp0b_3 = u_3, v0, f0
519  (p6)	add		acc0 = pr0_0, acc1_2, 1
520  (p7)	add		acc0 = pr0_0, acc1_2
521	xma.hu		fp1a_3 = u_3, v0, f0
522	;;					C 09
C Entry point from the n mod 4 == 3 feed-in (.grt3).
523.LL11:
524	.pred.rel "mutex", p10, p11
525	getf.sig	pr0_2 = fp0b_2
526	xma.l		fp1b_2 = u_2, v1, fp1a_2
527  (p10)	add		s0 = pr1_3, acc0, 1
528  (p11)	add		s0 = pr1_3, acc0
529	xma.hu		fp2a_2 = u_2, v1, fp1a_2
530	;;					C 10
531	.pred.rel "mutex", p6, p7
532	.pred.rel "mutex", p10, p11
533	ldf8		u_2 = [up], 8
534	getf.sig	pr1_1 = fp1b_1
535  (p6)	cmp.leu		p8, p9 = acc0, pr0_0
536  (p7)	cmp.ltu		p8, p9 = acc0, pr0_0
537  (p10)	cmp.leu		p12, p13 = s0, pr1_3
538  (p11)	cmp.ltu		p12, p13 = s0, pr1_3
539	;;					C 11
540	.pred.rel "mutex", p8, p9
541	getf.sig	acc1_1 = fp2a_1
542	st8		[rp] = s0, 8
543	xma.l		fp0b_0 = u_0, v0, f0
544  (p8)	add		acc0 = pr0_1, acc1_3, 1
545  (p9)	add		acc0 = pr0_1, acc1_3
546	xma.hu		fp1a_0 = u_0, v0, f0
C Entry point from the n mod 4 == 2 feed-in (.grt2).  br.cloop repeats the
C loop while ar.lc is nonzero; otherwise control falls into the wind-down.
547.LL10:	br.cloop.dptk	.Loop			C 12
548	;;
549C *** MAIN LOOP END ***
550
C Wind-down.  .Lcj6 is reached by falling out of the main loop; .Lcj5, .Lcj4
C and .Lcj3 are also entered directly from the short feed-in paths.  The
C groups repeat the add and carry pattern of the loop but load no further u
C limbs.  The multiplies stop after the fp1b_2 / fp2a_2 pair issued at .Lcj3;
C the remaining groups only move products to integer registers, add, and
C propagate carries.
551.Lcj6:
552	.pred.rel "mutex", p12, p13
553	getf.sig	pr0_3 = fp0b_3
554	xma.l		fp1b_3 = u_3, v1, fp1a_3
555  (p12)	add		s0 = pr1_0, acc0, 1
556  (p13)	add		s0 = pr1_0, acc0
557	xma.hu		fp2a_3 = u_3, v1, fp1a_3
558	;;
559	.pred.rel "mutex", p8, p9
560	.pred.rel "mutex", p12, p13
561	getf.sig	pr1_2 = fp1b_2
562  (p8)	cmp.leu		p6, p7 = acc0, pr0_1
563  (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
564  (p12)	cmp.leu		p10, p11 = s0, pr1_0
565  (p13)	cmp.ltu		p10, p11 = s0, pr1_0
566	;;
567	.pred.rel "mutex", p6, p7
568	getf.sig	acc1_2 = fp2a_2
569	st8		[rp] = s0, 8
570	xma.l		fp0b_1 = u_1, v0, f0
571  (p6)	add		acc0 = pr0_2, acc1_0, 1
572  (p7)	add		acc0 = pr0_2, acc1_0
573	xma.hu		fp1a_1 = u_1, v0, f0
574	;;
C Entry from the n mod 4 == 1 short path (.Lb01 with ar.lc zero).
575.Lcj5:
576	.pred.rel "mutex", p10, p11
577	getf.sig	pr0_0 = fp0b_0
578	xma.l		fp1b_0 = u_0, v1, fp1a_0
579  (p10)	add		s0 = pr1_1, acc0, 1
580  (p11)	add		s0 = pr1_1, acc0
581	xma.hu		fp2a_0 = u_0, v1, fp1a_0
582	;;
583	.pred.rel "mutex", p6, p7
584	.pred.rel "mutex", p10, p11
585	getf.sig	pr1_3 = fp1b_3
586  (p6)	cmp.leu		p8, p9 = acc0, pr0_2
587  (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
588  (p10)	cmp.leu		p12, p13 = s0, pr1_1
589  (p11)	cmp.ltu		p12, p13 = s0, pr1_1
590	;;
591	.pred.rel "mutex", p8, p9
592	getf.sig	acc1_3 = fp2a_3
593	st8		[rp] = s0, 8
594	xma.l		fp0b_2 = u_2, v0, f0
595  (p8)	add		acc0 = pr0_3, acc1_1, 1
596  (p9)	add		acc0 = pr0_3, acc1_1
597	xma.hu		fp1a_2 = u_2, v0, f0
598	;;
C Entry from the n mod 4 == 0 short path (.Lb00 with ar.lc zero).
599.Lcj4:
600	.pred.rel "mutex", p12, p13
601	getf.sig	pr0_1 = fp0b_1
602	xma.l		fp1b_1 = u_1, v1, fp1a_1
603  (p12)	add		s0 = pr1_2, acc0, 1
604  (p13)	add		s0 = pr1_2, acc0
605	xma.hu		fp2a_1 = u_1, v1, fp1a_1
606	;;
607	.pred.rel "mutex", p8, p9
608	.pred.rel "mutex", p12, p13
609	getf.sig	pr1_0 = fp1b_0
610  (p8)	cmp.leu		p6, p7 = acc0, pr0_3
611  (p9)	cmp.ltu		p6, p7 = acc0, pr0_3
612  (p12)	cmp.leu		p10, p11 = s0, pr1_2
613  (p13)	cmp.ltu		p10, p11 = s0, pr1_2
614	;;
615	.pred.rel "mutex", p6, p7
616	getf.sig	acc1_0 = fp2a_0
617	st8		[rp] = s0, 8
618  (p6)	add		acc0 = pr0_0, acc1_2, 1
619  (p7)	add		acc0 = pr0_0, acc1_2
620	;;
C Entry from the n mod 4 == 3 short path (.Lb11 with ar.lc zero).  The last
C pair of multiplies is issued here.
621.Lcj3:
622	.pred.rel "mutex", p10, p11
623	getf.sig	pr0_2 = fp0b_2
624	xma.l		fp1b_2 = u_2, v1, fp1a_2
625  (p10)	add		s0 = pr1_3, acc0, 1
626  (p11)	add		s0 = pr1_3, acc0
627	xma.hu		fp2a_2 = u_2, v1, fp1a_2
628	;;
629	.pred.rel "mutex", p6, p7
630	.pred.rel "mutex", p10, p11
631	getf.sig	pr1_1 = fp1b_1
632  (p6)	cmp.leu		p8, p9 = acc0, pr0_0
633  (p7)	cmp.ltu		p8, p9 = acc0, pr0_0
634  (p10)	cmp.leu		p12, p13 = s0, pr1_3
635  (p11)	cmp.ltu		p12, p13 = s0, pr1_3
636	;;
637	.pred.rel "mutex", p8, p9
638	getf.sig	acc1_1 = fp2a_1
639	st8		[rp] = s0, 8
640  (p8)	add		acc0 = pr0_1, acc1_3, 1
641  (p9)	add		acc0 = pr0_1, acc1_3
642	;;
643	.pred.rel "mutex", p12, p13
644  (p12)	add		s0 = pr1_0, acc0, 1
645  (p13)	add		s0 = pr1_0, acc0
646	;;
647	.pred.rel "mutex", p8, p9
648	.pred.rel "mutex", p12, p13
649	getf.sig	pr1_2 = fp1b_2
650  (p8)	cmp.leu		p6, p7 = acc0, pr0_1
651  (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
652  (p12)	cmp.leu		p10, p11 = s0, pr1_0
653  (p13)	cmp.ltu		p10, p11 = s0, pr1_0
654	;;
655	.pred.rel "mutex", p6, p7
656	getf.sig	acc1_2 = fp2a_2
657	st8		[rp] = s0, 8
658  (p6)	add		acc0 = pr0_2, acc1_0, 1
659  (p7)	add		acc0 = pr0_2, acc1_0
660	;;
661	.pred.rel "mutex", p10, p11
662  (p10)	add		s0 = pr1_1, acc0, 1
663  (p11)	add		s0 = pr1_1, acc0
664	;;
665	.pred.rel "mutex", p6, p7
666	.pred.rel "mutex", p10, p11
667  (p6)	cmp.leu		p8, p9 = acc0, pr0_2
668  (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
669  (p10)	cmp.leu		p12, p13 = s0, pr1_1
670  (p11)	cmp.ltu		p12, p13 = s0, pr1_1
671	;;
C Last stored limb: pr1_2 + acc1_1 plus the two pending carries p8 and p12.
672	.pred.rel "mutex", p8, p9
673	st8		[rp] = s0, 8
674  (p8)	add		acc0 = pr1_2, acc1_1, 1
675  (p9)	add		acc0 = pr1_2, acc1_1
676	;;
C p10 = carry out of the add above.  The p12 increment of acc0 is in the same
C instruction group as the compares, so they still read the value from before
C the increment.
677	.pred.rel "mutex", p8, p9
678  (p8)	cmp.leu		p10, p11 = acc0, pr1_2
679  (p9)	cmp.ltu		p10, p11 = acc0, pr1_2
680  (p12)	add		acc0 = 1, acc0
681	;;
682	st8		[rp] = acc0, 8
C If the p12 increment wrapped acc0 to zero, that is a carry as well: OR it
C into p10.
683  (p12)	cmp.eq.or	p10, p0 = 0, acc0
C The top limb, left in r8, starts as acc1_2 and takes the final carry.
684	mov		r8 = acc1_2
685	;;
686	.pred.rel "mutex", p10, p11
687  (p10)	add		r8 = 1, r8
C Restore the loop counter saved in the prologue and return.
688	mov.i		ar.lc = r2
689	br.ret.sptk.many b0
690EPILOGUE()
691ASM_END()
692