xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/submul_1.asm (revision 8450a7c42673d65e3b1f6560d3b6ecd317a6cbe8)
1dnl  IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
2dnl  result from a second limb vector.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund.
5
6dnl  Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of the GNU Lesser General Public License as published
12dnl  by the Free Software Foundation; either version 3 of the License, or (at
13dnl  your option) any later version.
14
15dnl  The GNU MP Library is distributed in the hope that it will be useful, but
16dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
18dnl  License for more details.
19
20dnl  You should have received a copy of the GNU Lesser General Public License
21dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
22
23include(`../config.m4')
24
25C         cycles/limb
26C Itanium:    4.0
27C Itanium 2:  2.25 (alignment dependent, sometimes it seems to need 3 c/l)
28
29C TODO
30C  * Optimize feed-in and wind-down code, both for speed and code size.
31C  * Handle low limb input and results specially, using a common stf8 in the
32C    epilogue.
33C  * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
34C    2nd bundle.  This will allow the bbb bundle to be one cycle earlier and
35C    save a cycle.
36
37C INPUT PARAMETERS
C rp: address of the limb vector that is read, reduced and written back
C     (the code below stores results through r10, a copy of rp).
C up: address of the limb vector whose limbs are multiplied by vl.
C n:  limb count; (n - 1) >> 2 seeds ar.lc and n & 3 picks the feed-in path.
C vl: the single multiplier limb; it is negated on entry.
38define(`rp', `r32')
39define(`up', `r33')
40define(`n',  `r34')
41define(`vl', `r35')
42
43ASM_START()
C mpn_submul_1 -- for each of the n limbs, multiply up[i] by vl, subtract the
C product and the incoming borrow limb from rp[i], and store the low limb of
C the difference back through r10 (a copy of rp).
C
C Method, as coded below:
C  * vl is negated once (sub vl = r0, vl) and moved to f6, so every
C    xma.l/xma.hu pair yields the low and high limb of up[i] * (-vl) + rp[i].
C    For nonzero vl, -vl is 2^64 - vl, hence
C      up[i] * (2^64 - vl) + rp[i] = up[i] * 2^64 - (up[i] * vl - rp[i]),
C    so the low limb is the wanted difference limb and up[i] minus the high
C    limb is the borrow limb passed to the next position.
C  * The low limb minus the running borrow limb in r8 is what gets stored;
C    cmp.ltu records in p6 whether that subtraction wrapped.
C  * The next borrow limb is up[i] (reloaded as an integer through r9) minus
C    the high limb, plus 1 when p6 is set.
C  * vl == 0 branches straight to .Ldone with r8 = 0 and nothing stored.
C
C Register roles:
C   r10       store pointer (copy of rp)
C   r9        integer load pointer (copy of up)
C   rp, up    post-incremented by the ldf8 loads that feed the multiplier
C   r8        borrow limb (cylimb); holds the final value at br.ret
C   r2        saved ar.lc, restored before each br.ret
C   r19       (n - 1) >> 2, moved into ar.lc for the br.cloop branches
C   r14       n & 3 during dispatch, afterwards the limb about to be stored
C   f6        negated vl
C   f7, f8    first up limb and first rp limb
C   f32-f35   up limbs              f44-f47   rp limbs
C   f36-f39   low products          f40-f43   high products
C   r24-r27   low limbs (integer)   r28-r31   high limbs (integer)
C   r20-r23   up limbs (integer)
C
C NOTE(review): n == 0 gets no special handling here; presumably callers
C guarantee n >= 1 -- confirm against the mpn interface documentation.
44PROLOGUE(mpn_submul_1)
45	.prologue
C Unwind annotation: ar.lc is kept in r2 (the mov itself is in the body).
46	.save	ar.lc, r2
47	.body
48
C With the 32-bit ABI, widen both pointer arguments with addp4 and
C zero-extend n before anything else uses them.
49ifdef(`HAVE_ABI_32',
50`	addp4		rp = 0, rp		C M I
51	addp4		up = 0, up		C M I
52	zxt4		n = n			C I
53	;;
54')
C Setup: copy the pointers, negate vl, load the first limb pair, save ar.lc,
C and derive the loop count and the n & 3 residue.
55{.mmi
56	mov		r10 = rp		C M I
57	mov		r9 = up			C M I
58	sub		vl = r0, vl		C M I	negate vl
59}
60{.mmi
61	ldf8		f8 = [rp], 8		C M
62	ldf8		f7 = [up], 8		C M
63	add		r19 = -1, n		C M I	n - 1
64	;;
65}
66{.mmi
67	cmp.eq		p6, p0 = 0, vl		C M I
68	mov		r8 = 0			C M I	zero cylimb
69	mov		r2 = ar.lc		C I0
70}
71{.mmi
72	setf.sig	f6 = vl			C M2 M3
73	and		r14 = 3, n		C M I
74	shr.u		r19 = r19, 2		C I0
75	;;
76}
C Dispatch on n & 3: 0 -> .Lb00, 2 -> .Lb10, 3 -> .Lb11, and 1 falls
C through into .Lb01.  The vl == 0 exit is taken first.
77{.mmb
78	nop		0
79	cmp.eq		p10, p0 = 0, r14	C M I
80   (p6)	br.spnt		.Ldone			C B	vl == 0
81}
82{.mmi
83	cmp.eq		p11, p0 = 2, r14	C M I
84	cmp.eq		p12, p0 = 3, r14	C M I
85	mov		ar.lc = r19		C I0
86}
87{.bbb
88  (p10)	br.dptk		.Lb00			C B
89  (p11)	br.dptk		.Lb10			C B
90  (p12)	br.dptk		.Lb11			C B
91	;;
92}
93
C Feed-in for n & 3 == 1.  Each br.cloop is taken while ar.lc is nonzero,
C i.e. while at least four more limbs follow.  Falling through here means
C n == 1: multiply the single limb and finish at .Lcj1.
94.Lb01:	br.cloop.dptk	.grt1
95
96	xma.l		f39 = f7, f6, f8
97	xma.hu		f43 = f7, f6, f8
98	;;
99	getf.sig	r27 = f39			C lo
100	getf.sig	r31 = f43			C hi
101	ld8		r20 = [r9], 8
102	br		.Lcj1
103
C n >= 5: load four more limb pairs and start multiplying.  Falling through
C the br.cloop below means n == 5, finished through .Lcj5.
104.grt1:	ldf8		f44 = [rp], 8
105	ldf8		f32 = [up], 8
106	;;
107	ldf8		f45 = [rp], 8
108	ldf8		f33 = [up], 8
109	;;
110	ldf8		f46 = [rp], 8
111	xma.l		f39 = f7, f6, f8
112	ldf8		f34 = [up], 8
113	xma.hu		f43 = f7, f6, f8
114	;;
115	ldf8		f47 = [rp], 8
116	xma.l		f36 = f32, f6, f44
117	ldf8		f35 = [up], 8
118	xma.hu		f40 = f32, f6, f44
119	br.cloop.dptk	.grt5
120	;;
121
122	getf.sig	r27 = f39			C lo
123	xma.l		f37 = f33, f6, f45
124	ld8		r20 = [r9], 8
125	xma.hu		f41 = f33, f6, f45
126	;;
127	getf.sig	r31 = f43			C hi
128	getf.sig	r24 = f36			C lo
129	xma.l		f38 = f34, f6, f46
130	ld8		r21 = [r9], 8
131	xma.hu		f42 = f34, f6, f46
132	;;
133	getf.sig	r28 = f40			C hi
134	getf.sig	r25 = f37			C lo
135	xma.l		f39 = f35, f6, f47
136	ld8		r22 = [r9], 8
137	xma.hu		f43 = f35, f6, f47
138	;;
139	getf.sig	r29 = f41			C hi
140	getf.sig	r26 = f38			C lo
141	ld8		r23 = [r9], 8
142	br		.Lcj5
143
C n >= 9: load the next four limb pairs, then enter .Loop at its top if
C ar.lc is still nonzero, otherwise go straight to the wind-down at .Lend.
144.grt5:	ldf8		f44 = [rp], 8
145	ldf8		f32 = [up], 8
146	;;
147	getf.sig	r27 = f39			C lo
148	xma.l		f37 = f33, f6, f45
149	ld8		r20 = [r9], 8
150	xma.hu		f41 = f33, f6, f45
151	;;
152	ldf8		f45 = [rp], 8
153	getf.sig	r31 = f43			C hi
154	ldf8		f33 = [up], 8
155	;;
156	getf.sig	r24 = f36			C lo
157	xma.l		f38 = f34, f6, f46
158	ld8		r21 = [r9], 8
159	xma.hu		f42 = f34, f6, f46
160	;;
161	ldf8		f46 = [rp], 8
162	getf.sig	r28 = f40			C hi
163	ldf8		f34 = [up], 8
164	;;
165	getf.sig	r25 = f37			C lo
166	xma.l		f39 = f35, f6, f47
167	ld8		r22 = [r9], 8
168	xma.hu		f43 = f35, f6, f47
169	;;
170	ldf8		f47 = [rp], 8
171	getf.sig	r29 = f41			C hi
172	ldf8		f35 = [up], 8
173	;;
174	getf.sig	r26 = f38			C lo
175	xma.l		f36 = f32, f6, f44
176	ld8		r23 = [r9], 8
177	xma.hu		f40 = f32, f6, f44
178	br.cloop.dptk	.Loop
179	br		.Lend
180
181
C Feed-in for n & 3 == 2: load the second limb pair.  Falling through the
C br.cloop means n == 2, finished through .Lcj2.
182.Lb10:	ldf8		f47 = [rp], 8
183	ldf8		f35 = [up], 8
184	br.cloop.dptk	.grt2
185
186	xma.l		f38 = f7, f6, f8
187	xma.hu		f42 = f7, f6, f8
188	;;
189	xma.l		f39 = f35, f6, f47
190	xma.hu		f43 = f35, f6, f47
191	;;
192	getf.sig	r26 = f38			C lo
193	getf.sig	r30 = f42			C hi
194	ld8		r23 = [r9], 8
195	;;
196	getf.sig	r27 = f39			C lo
197	getf.sig	r31 = f43			C hi
198	ld8		r20 = [r9], 8
199	br		.Lcj2
200
C n >= 6: load four more limb pairs.  Falling through the br.cloop below
C means n == 6, finished through .Lcj6.
201.grt2:	ldf8		f44 = [rp], 8
202	ldf8		f32 = [up], 8
203	;;
204	ldf8		f45 = [rp], 8
205	ldf8		f33 = [up], 8
206	xma.l		f38 = f7, f6, f8
207	xma.hu		f42 = f7, f6, f8
208	;;
209	ldf8		f46 = [rp], 8
210	ldf8		f34 = [up], 8
211	xma.l		f39 = f35, f6, f47
212	xma.hu		f43 = f35, f6, f47
213	;;
214	ldf8		f47 = [rp], 8
215	ldf8		f35 = [up], 8
216	;;
217	getf.sig	r26 = f38			C lo
218	xma.l		f36 = f32, f6, f44
219	ld8		r23 = [r9], 8
220	xma.hu		f40 = f32, f6, f44
221	br.cloop.dptk	.grt6
222
223	getf.sig	r30 = f42			C hi
224	;;
225	getf.sig	r27 = f39			C lo
226	xma.l		f37 = f33, f6, f45
227	ld8		r20 = [r9], 8
228	xma.hu		f41 = f33, f6, f45
229	;;
230	getf.sig	r31 = f43			C hi
231	getf.sig	r24 = f36			C lo
232	xma.l		f38 = f34, f6, f46
233	ld8		r21 = [r9], 8
234	xma.hu		f42 = f34, f6, f46
235	;;
236	getf.sig	r28 = f40			C hi
237	getf.sig	r25 = f37			C lo
238	xma.l		f39 = f35, f6, f47
239	ld8		r22 = [r9], 8
240	xma.hu		f43 = f35, f6, f47
241	br		.Lcj6
242
C n >= 10: prime the pipeline and join the main loop at .LL10.
243.grt6:	ldf8		f44 = [rp], 8
244	getf.sig	r30 = f42			C hi
245	ldf8		f32 = [up], 8
246	;;
247	getf.sig	r27 = f39			C lo
248	xma.l		f37 = f33, f6, f45
249	ld8		r20 = [r9], 8
250	xma.hu		f41 = f33, f6, f45
251	;;
252	ldf8		f45 = [rp], 8
253	getf.sig	r31 = f43			C hi
254	ldf8		f33 = [up], 8
255	;;
256	getf.sig	r24 = f36			C lo
257	xma.l		f38 = f34, f6, f46
258	ld8		r21 = [r9], 8
259	xma.hu		f42 = f34, f6, f46
260	;;
261	ldf8		f46 = [rp], 8
262	getf.sig	r28 = f40			C hi
263	ldf8		f34 = [up], 8
264	;;
265	getf.sig	r25 = f37			C lo
266	xma.l		f39 = f35, f6, f47
267	ld8		r22 = [r9], 8
268	xma.hu		f43 = f35, f6, f47
269	br		.LL10
270
271
C Feed-in for n & 3 == 3: load the second and third limb pairs.  Falling
C through the br.cloop means n == 3, finished through .Lcj3.
272.Lb11:	ldf8		f46 = [rp], 8
273	ldf8		f34 = [up], 8
274	;;
275	ldf8		f47 = [rp], 8
276	ldf8		f35 = [up], 8
277	br.cloop.dptk	.grt3
278
279	xma.l		f37 = f7, f6, f8
280	xma.hu		f41 = f7, f6, f8
281	;;
282	xma.l		f38 = f34, f6, f46
283	xma.hu		f42 = f34, f6, f46
284	;;
285	getf.sig	r25 = f37			C lo
286	xma.l		f39 = f35, f6, f47
287	xma.hu		f43 = f35, f6, f47
288	;;
289	getf.sig	r29 = f41			C hi
290	ld8		r22 = [r9], 8
291	;;
292	getf.sig	r26 = f38			C lo
293	getf.sig	r30 = f42			C hi
294	ld8		r23 = [r9], 8
295	;;
296	getf.sig	r27 = f39			C lo
297	getf.sig	r31 = f43			C hi
298	ld8		r20 = [r9], 8
299	br		.Lcj3
300
C n >= 7: load four more limb pairs.  Falling through the br.cloop below
C means n == 7, finished through .Lcj7.
301.grt3:	ldf8		f44 = [rp], 8
302	xma.l		f37 = f7, f6, f8
303	ldf8		f32 = [up], 8
304	xma.hu		f41 = f7, f6, f8
305	;;
306	ldf8		f45 = [rp], 8
307	xma.l		f38 = f34, f6, f46
308	ldf8		f33 = [up], 8
309	xma.hu		f42 = f34, f6, f46
310	;;
311	ldf8		f46 = [rp], 8
312	ldf8		f34 = [up], 8
313	;;
314	getf.sig	r25 = f37			C lo
315	xma.l		f39 = f35, f6, f47
316	ld8		r22 = [r9], 8
317	xma.hu		f43 = f35, f6, f47
318	;;
319	ldf8		f47 = [rp], 8
320	getf.sig	r29 = f41			C hi
321	ldf8		f35 = [up], 8
322	;;
323	getf.sig	r26 = f38			C lo
324	xma.l		f36 = f32, f6, f44
325	ld8		r23 = [r9], 8
326	xma.hu		f40 = f32, f6, f44
327	br.cloop.dptk	.grt7
328	;;
329
330	getf.sig	r30 = f42			C hi
331	getf.sig	r27 = f39			C lo
332	xma.l		f37 = f33, f6, f45
333	ld8		r20 = [r9], 8
334	xma.hu		f41 = f33, f6, f45
335	;;
336	getf.sig	r31 = f43			C hi
337	getf.sig	r24 = f36			C lo
338	xma.l		f38 = f34, f6, f46
339	ld8		r21 = [r9], 8
340	xma.hu		f42 = f34, f6, f46
341	br		.Lcj7
342
C n >= 11: prime the pipeline and join the main loop at .LL11.
343.grt7:	ldf8		f44 = [rp], 8
344	getf.sig	r30 = f42			C hi
345	ldf8		f32 = [up], 8
346	;;
347	getf.sig	r27 = f39			C lo
348	xma.l		f37 = f33, f6, f45
349	ld8		r20 = [r9], 8
350	xma.hu		f41 = f33, f6, f45
351	;;
352	ldf8		f45 = [rp], 8
353	getf.sig	r31 = f43			C hi
354	ldf8		f33 = [up], 8
355	;;
356	getf.sig	r24 = f36			C lo
357	xma.l		f38 = f34, f6, f46
358	ld8		r21 = [r9], 8
359	xma.hu		f42 = f34, f6, f46
360	br		.LL11
361
362
C Feed-in for n & 3 == 0: load limb pairs two to four.  Falling through the
C br.cloop means n == 4, finished through .Lcj4.
363.Lb00:	ldf8		f45 = [rp], 8
364	ldf8		f33 = [up], 8
365	;;
366	ldf8		f46 = [rp], 8
367	ldf8		f34 = [up], 8
368	;;
369	ldf8		f47 = [rp], 8
370	xma.l		f36 = f7, f6, f8
371	ldf8		f35 = [up], 8
372	xma.hu		f40 = f7, f6, f8
373	br.cloop.dptk	.grt4
374
375	xma.l		f37 = f33, f6, f45
376	xma.hu		f41 = f33, f6, f45
377	;;
378	getf.sig	r24 = f36			C lo
379	xma.l		f38 = f34, f6, f46
380	ld8		r21 = [r9], 8
381	xma.hu		f42 = f34, f6, f46
382	;;
383	getf.sig	r28 = f40			C hi
384	xma.l		f39 = f35, f6, f47
385	getf.sig	r25 = f37			C lo
386	ld8		r22 = [r9], 8
387	xma.hu		f43 = f35, f6, f47
388	;;
389	getf.sig	r29 = f41			C hi
390	getf.sig	r26 = f38			C lo
391	ld8		r23 = [r9], 8
392	;;
393	getf.sig	r30 = f42			C hi
394	getf.sig	r27 = f39			C lo
395	ld8		r20 = [r9], 8
396	br		.Lcj4
397
C n >= 8: load four more limb pairs.  Falling through the br.cloop below
C means n == 8, finished through .Lcj8.
398.grt4:	ldf8		f44 = [rp], 8
399	xma.l		f37 = f33, f6, f45
400	ldf8		f32 = [up], 8
401	xma.hu		f41 = f33, f6, f45
402	;;
403	ldf8		f45 = [rp], 8
404	ldf8		f33 = [up], 8
405	xma.l		f38 = f34, f6, f46
406	getf.sig	r24 = f36			C lo
407	ld8		r21 = [r9], 8
408	xma.hu		f42 = f34, f6, f46
409	;;
410	ldf8		f46 = [rp], 8
411	getf.sig	r28 = f40			C hi
412	ldf8		f34 = [up], 8
413	xma.l		f39 = f35, f6, f47
414	getf.sig	r25 = f37			C lo
415	ld8		r22 = [r9], 8
416	xma.hu		f43 = f35, f6, f47
417	;;
418	ldf8		f47 = [rp], 8
419	getf.sig	r29 = f41			C hi
420	ldf8		f35 = [up], 8
421	;;
422	getf.sig	r26 = f38			C lo
423	xma.l		f36 = f32, f6, f44
424	ld8		r23 = [r9], 8
425	xma.hu		f40 = f32, f6, f44
426	br.cloop.dptk	.grt8
427	;;
428
429	getf.sig	r30 = f42			C hi
430	getf.sig	r27 = f39			C lo
431	xma.l		f37 = f33, f6, f45
432	ld8		r20 = [r9], 8
433	xma.hu		f41 = f33, f6, f45
434	br		.Lcj8
435
C n >= 12: prime the pipeline and join the main loop at .LL00.
436.grt8:	ldf8		f44 = [rp], 8
437	getf.sig	r30 = f42			C hi
438	ldf8		f32 = [up], 8
439	;;
440	getf.sig	r27 = f39			C lo
441	xma.l		f37 = f33, f6, f45
442	ld8		r20 = [r9], 8
443	xma.hu		f41 = f33, f6, f45
444	br		.LL00
445
C Main loop: four limbs are stored per iteration.  Each quarter does, for
C one limb: compare and subtract the borrow limb r8 from the low limb, form
C the next r8 as the up limb minus the high limb, store the difference via
C r10, and add 1 to r8 if the subtraction wrapped (p6).  Loads, multiplies
C and getf.sig moves for later limbs are interleaved with that work.
C .LL00, .LL11 and .LL10 are the entry points used by .grt8, .grt7 and .grt6.
446	ALIGN(32)
447.Loop:
448{.mmi
449	ldf8		f44 = [rp], 8
450	cmp.ltu		p6, p0 = r27, r8	C lo cmp
451	sub		r14 = r27, r8		C lo sub
452}
453{.mmi
454	getf.sig	r30 = f42			C hi
455	ldf8		f32 = [up], 8
456	sub		r8 = r20, r31		C hi sub
457	;;				C 01
458}
459{.mmf
460	getf.sig	r27 = f39			C lo
461	st8		[r10] = r14, 8
462	xma.l		f37 = f33, f6, f45
463}
464{.mfi
465	ld8		r20 = [r9], 8
466	xma.hu		f41 = f33, f6, f45
467   (p6)	add		r8 = 1, r8
468	;;				C 02
469}
470{.mmi
471.LL00:	ldf8		f45 = [rp], 8
472	cmp.ltu		p6, p0 = r24, r8
473	sub		r14 = r24, r8
474}
475{.mmi
476	getf.sig	r31 = f43			C hi
477	ldf8		f33 = [up], 8
478	sub		r8 = r21, r28
479	;;				C 03
480}
481{.mmf
482	getf.sig	r24 = f36			C lo
483	st8		[r10] = r14, 8
484	xma.l		f38 = f34, f6, f46
485}
486{.mfi
487	ld8		r21 = [r9], 8
488	xma.hu		f42 = f34, f6, f46
489   (p6)	add		r8 = 1, r8
490	;;				C 04
491}
492{.mmi
493.LL11:	ldf8		f46 = [rp], 8
494	cmp.ltu		p6, p0 = r25, r8
495	sub		r14 = r25, r8
496}
497{.mmi
498	getf.sig	r28 = f40			C hi
499	ldf8		f34 = [up], 8
500	sub		r8 = r22, r29
501	;;				C 05
502}
503{.mmf
504	getf.sig	r25 = f37			C lo
505	st8		[r10] = r14, 8
506	xma.l		f39 = f35, f6, f47
507}
508{.mfi
509	ld8		r22 = [r9], 8
510	xma.hu		f43 = f35, f6, f47
511   (p6)	add		r8 = 1, r8
512	;;				C 06
513}
514{.mmi
515.LL10:	ldf8		f47 = [rp], 8
516	cmp.ltu		p6, p0 = r26, r8
517	sub		r14 = r26, r8
518}
519{.mmi
520	getf.sig	r29 = f41			C hi
521	ldf8		f35 = [up], 8
522	sub		r8 = r23, r30
523	;;				C 07
524}
525{.mmf
526	getf.sig	r26 = f38			C lo
527	st8		[r10] = r14, 8
528	xma.l		f36 = f32, f6, f44
529}
530{.mfi
531	ld8		r23 = [r9], 8
532	xma.hu		f40 = f32, f6, f44
533   (p6)	add		r8 = 1, r8
534}
535	br.cloop.dptk	.Loop
536	;;
537
C Wind-down: the same per-limb borrow/store step as in the loop, with no
C further ldf8 loads and with the multiplies and moves dropping out as the
C pipeline drains.  Control falls through from each label to the next;
C entering at .LcjN leaves N limbs to store, and .Lend leaves nine.
538.Lend:
539	cmp.ltu		p6, p0 = r27, r8
540	sub		r14 = r27, r8
541	getf.sig	r30 = f42
542	sub		r8 = r20, r31
543	;;
544	getf.sig	r27 = f39
545	st8		[r10] = r14, 8
546	xma.l		f37 = f33, f6, f45
547	ld8		r20 = [r9], 8
548	xma.hu		f41 = f33, f6, f45
549   (p6)	add		r8 = 1, r8
550	;;
551.Lcj8:
552	cmp.ltu		p6, p0 = r24, r8
553	sub		r14 = r24, r8
554	getf.sig	r31 = f43
555	sub		r8 = r21, r28
556	;;
557	getf.sig	r24 = f36
558	st8		[r10] = r14, 8
559	xma.l		f38 = f34, f6, f46
560	ld8		r21 = [r9], 8
561	xma.hu		f42 = f34, f6, f46
562   (p6)	add		r8 = 1, r8
563	;;
564.Lcj7:
565	cmp.ltu		p6, p0 = r25, r8
566	sub		r14 = r25, r8
567	getf.sig	r28 = f40
568	sub		r8 = r22, r29
569	;;
570	getf.sig	r25 = f37
571	st8		[r10] = r14, 8
572	xma.l		f39 = f35, f6, f47
573	ld8		r22 = [r9], 8
574	xma.hu		f43 = f35, f6, f47
575   (p6)	add		r8 = 1, r8
576	;;
577.Lcj6:
578	cmp.ltu		p6, p0 = r26, r8
579	sub		r14 = r26, r8
580	getf.sig	r29 = f41
581	sub		r8 = r23, r30
582	;;
583	getf.sig	r26 = f38
584	st8		[r10] = r14, 8
585	ld8		r23 = [r9], 8
586   (p6)	add		r8 = 1, r8
587	;;
588.Lcj5:
589	cmp.ltu		p6, p0 = r27, r8
590	sub		r14 = r27, r8
591	getf.sig	r30 = f42
592	sub		r8 = r20, r31
593	;;
594	getf.sig	r27 = f39
595	st8		[r10] = r14, 8
596	ld8		r20 = [r9], 8
597   (p6)	add		r8 = 1, r8
598	;;
599.Lcj4:
600	cmp.ltu		p6, p0 = r24, r8
601	sub		r14 = r24, r8
602	getf.sig	r31 = f43
603	sub		r8 = r21, r28
604	;;
605	st8		[r10] = r14, 8
606   (p6)	add		r8 = 1, r8
607	;;
608.Lcj3:
609	cmp.ltu		p6, p0 = r25, r8
610	sub		r14 = r25, r8
611	sub		r8 = r22, r29
612	;;
613	st8		[r10] = r14, 8
614   (p6)	add		r8 = 1, r8
615	;;
616.Lcj2:
617	cmp.ltu		p6, p0 = r26, r8
618	sub		r14 = r26, r8
619	sub		r8 = r23, r30
620	;;
621	st8		[r10] = r14, 8
622   (p6)	add		r8 = 1, r8
623	;;
C Last limb: store it, restore the saved ar.lc, apply the final borrow
C adjustment to r8, and return.
624.Lcj1:
625	cmp.ltu		p6, p0 = r27, r8
626	sub		r14 = r27, r8
627	sub		r8 = r20, r31
628	;;
629	st8		[r10] = r14, 8
630	mov		ar.lc = r2
631   (p6)	add		r8 = 1, r8
632	br.ret.sptk.many b0
C vl == 0 exit: nothing was stored and r8 is still 0; just restore ar.lc.
633.Ldone:	mov		ar.lc = r2
634	br.ret.sptk.many b0
635EPILOGUE()
636ASM_END()
637