xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/submul_1.asm (revision d90047b5d07facf36e6c01dcc0bded8997ce9cc2)
1dnl  IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
2dnl  result from a second limb vector.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund.
5
6dnl  Copyright 2000-2004 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9dnl
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of either:
12dnl
13dnl    * the GNU Lesser General Public License as published by the Free
14dnl      Software Foundation; either version 3 of the License, or (at your
15dnl      option) any later version.
16dnl
17dnl  or
18dnl
19dnl    * the GNU General Public License as published by the Free Software
20dnl      Foundation; either version 2 of the License, or (at your option) any
21dnl      later version.
22dnl
23dnl  or both in parallel, as here.
24dnl
25dnl  The GNU MP Library is distributed in the hope that it will be useful, but
26dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28dnl  for more details.
29dnl
30dnl  You should have received copies of the GNU General Public License and the
31dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
32dnl  see https://www.gnu.org/licenses/.
33
34include(`../config.m4')
35
36C         cycles/limb
37C Itanium:    4.0
38C Itanium 2:  2.25 (alignment dependent, sometimes it seems to need 3 c/l)
39
40C TODO
41C  * Optimize feed-in and wind-down code, both for speed and code size.
42C  * Handle low limb input and results specially, using a common stf8 in the
43C    epilogue.
44C  * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
45C    2nd bundle.  This will allow the bbb bundle to be one cycle earlier and
46C    save a cycle.
47
48C INPUT PARAMETERS
C  rp (r32): limb vector that is read with ldf8 and overwritten with the result
C  up (r33): limb vector whose limbs are multiplied by vl
C  n  (r34): number of limbs; n mod 4 picks the feed-in path and (n - 1) / 4
C            becomes the loop count.  Presumably n >= 1 is required, since the
C            first limb pair is loaded unconditionally -- confirm with callers.
C  vl (r35): the single limb multiplier; it is negated on entry
49define(`rp', `r32')
50define(`up', `r33')
51define(`n',  `r34')
52define(`vl', `r35')
53
54ASM_START()
C mpn_submul_1 (rp, up, n, vl)
C
C Per the file header: multiply the limb vector at up by the limb vl and
C subtract the result from the limb vector at rp.  The running borrow limb is
C kept in r8 (called cylimb below) and is still in r8 at the final br.ret.
C
C Method, as far as the visible code shows: vl is negated once, so every limb
C becomes one fused multiply-add pair, xma.l and xma.hu of up[i] * (-vl) + rp[i].
C The low half minus the incoming borrow is the result limb; an integer copy
C of up[i] minus the high half, plus one if the low subtraction wrapped, is
C the borrow passed on to the next limb.
C
C Register roles set up here:
C   r10     store pointer, starts as a copy of rp
C   r9      integer load pointer, starts as a copy of up
C   rp, up  post-incremented by the ldf8 floating-point loads
C   f6      negated vl;  f7, f8 hold the first limb of up and of rp
C   r2      saved ar.lc, restored before both returns
C   r19     (n - 1) >> 2, moved into ar.lc as the loop count
C   r14     n & 3 for the dispatch here; reused later for the result limb
55PROLOGUE(mpn_submul_1)
56	.prologue
57	.save	ar.lc, r2
58	.body
59
C 32-bit ABI only: widen both pointers with addp4 and zero-extend n.
60ifdef(`HAVE_ABI_32',
61`	addp4		rp = 0, rp		C M I
62	addp4		up = 0, up		C M I
63	zxt4		n = n			C I
64	;;
65')
C Copy the pointers for the integer side, negate vl, and preload limb 0 of
C both vectors into floating-point registers.
66{.mmi
67	mov		r10 = rp		C M I
68	mov		r9 = up			C M I
69	sub		vl = r0, vl		C M I	negate vl
70}
71{.mmi
72	ldf8		f8 = [rp], 8		C M
73	ldf8		f7 = [up], 8		C M
74	add		r19 = -1, n		C M I	n - 1
75	;;
76}
C p6 is set when the negated vl is zero, i.e. when vl itself was zero.
77{.mmi
78	cmp.eq		p6, p0 = 0, vl		C M I
79	mov		r8 = 0			C M I	zero cylimb
80	mov		r2 = ar.lc		C I0
81}
82{.mmi
83	setf.sig	f6 = vl			C M2 M3
84	and		r14 = 3, n		C M I
85	shr.u		r19 = r19, 2		C I0
86	;;
87}
C vl == 0: leave through .Ldone with r8 == 0 and without storing anything.
88{.mmb
89	nop		0
90	cmp.eq		p10, p0 = 0, r14	C M I
91   (p6)	br.spnt		.Ldone			C B	vl == 0
92}
93{.mmi
94	cmp.eq		p11, p0 = 2, r14	C M I
95	cmp.eq		p12, p0 = 3, r14	C M I
96	mov		ar.lc = r19		C I0
97}
C Dispatch on n mod 4: 0 goes to .Lb00, 2 to .Lb10, 3 to .Lb11; the remaining
C case, n mod 4 == 1, falls through into .Lb01.
98{.bbb
99  (p10)	br.dptk		.Lb00			C B
100  (p11)	br.dptk		.Lb10			C B
101  (p12)	br.dptk		.Lb11			C B
102	;;
103}
104
C Feed-in for n mod 4 == 1, reached by falling through the dispatch bundle.
C br.cloop branches while ar.lc is nonzero (and decrements it), so .grt1
C handles n >= 5 and the straight-line code below handles n == 1.
105.Lb01:	br.cloop.dptk	.grt1
106
C n == 1: one product from the limbs preloaded into f7 and f8, then join the
C wind-down at its last step, .Lcj1.
107	xma.l		f39 = f7, f6, f8
108	xma.hu		f43 = f7, f6, f8
109	;;
110	getf.sig	r27 = f39			C lo
111	getf.sig	r31 = f43			C hi
112	ld8		r20 = [r9], 8
113	br		.Lcj1
114
C n >= 5: load four more limb pairs and start the first two products.  The
C br.cloop below separates n == 5 (fall through) from n >= 9 (.grt5).
115.grt1:	ldf8		f44 = [rp], 8
116	ldf8		f32 = [up], 8
117	;;
118	ldf8		f45 = [rp], 8
119	ldf8		f33 = [up], 8
120	;;
121	ldf8		f46 = [rp], 8
122	xma.l		f39 = f7, f6, f8
123	ldf8		f34 = [up], 8
124	xma.hu		f43 = f7, f6, f8
125	;;
126	ldf8		f47 = [rp], 8
127	xma.l		f36 = f32, f6, f44
128	ldf8		f35 = [up], 8
129	xma.hu		f40 = f32, f6, f44
130	br.cloop.dptk	.grt5
131	;;
132
C n == 5: finish the remaining three products, move the halves to integer
C registers, fetch the integer copies of up, and join the wind-down at .Lcj5.
133	getf.sig	r27 = f39			C lo
134	xma.l		f37 = f33, f6, f45
135	ld8		r20 = [r9], 8
136	xma.hu		f41 = f33, f6, f45
137	;;
138	getf.sig	r31 = f43			C hi
139	getf.sig	r24 = f36			C lo
140	xma.l		f38 = f34, f6, f46
141	ld8		r21 = [r9], 8
142	xma.hu		f42 = f34, f6, f46
143	;;
144	getf.sig	r28 = f40			C hi
145	getf.sig	r25 = f37			C lo
146	xma.l		f39 = f35, f6, f47
147	ld8		r22 = [r9], 8
148	xma.hu		f43 = f35, f6, f47
149	;;
150	getf.sig	r29 = f41			C hi
151	getf.sig	r26 = f38			C lo
152	ld8		r23 = [r9], 8
153	br		.Lcj5
154
C n >= 9: keep loading four limb pairs ahead while completing the products
C already started.  The final br.cloop enters .Loop at its top when ar.lc is
C still nonzero (n >= 13); otherwise (n == 9) control goes to .Lend.
155.grt5:	ldf8		f44 = [rp], 8
156	ldf8		f32 = [up], 8
157	;;
158	getf.sig	r27 = f39			C lo
159	xma.l		f37 = f33, f6, f45
160	ld8		r20 = [r9], 8
161	xma.hu		f41 = f33, f6, f45
162	;;
163	ldf8		f45 = [rp], 8
164	getf.sig	r31 = f43			C hi
165	ldf8		f33 = [up], 8
166	;;
167	getf.sig	r24 = f36			C lo
168	xma.l		f38 = f34, f6, f46
169	ld8		r21 = [r9], 8
170	xma.hu		f42 = f34, f6, f46
171	;;
172	ldf8		f46 = [rp], 8
173	getf.sig	r28 = f40			C hi
174	ldf8		f34 = [up], 8
175	;;
176	getf.sig	r25 = f37			C lo
177	xma.l		f39 = f35, f6, f47
178	ld8		r22 = [r9], 8
179	xma.hu		f43 = f35, f6, f47
180	;;
181	ldf8		f47 = [rp], 8
182	getf.sig	r29 = f41			C hi
183	ldf8		f35 = [up], 8
184	;;
185	getf.sig	r26 = f38			C lo
186	xma.l		f36 = f32, f6, f44
187	ld8		r23 = [r9], 8
188	xma.hu		f40 = f32, f6, f44
189	br.cloop.dptk	.Loop
190	br		.Lend
191
192
C Feed-in for n mod 4 == 2.  Limb 1 of both vectors is loaded first; br.cloop
C then separates n == 2 (fall through) from n >= 6 (.grt2).
193.Lb10:	ldf8		f47 = [rp], 8
194	ldf8		f35 = [up], 8
195	br.cloop.dptk	.grt2
196
C n == 2: two products, then join the wind-down at .Lcj2.
197	xma.l		f38 = f7, f6, f8
198	xma.hu		f42 = f7, f6, f8
199	;;
200	xma.l		f39 = f35, f6, f47
201	xma.hu		f43 = f35, f6, f47
202	;;
203	getf.sig	r26 = f38			C lo
204	getf.sig	r30 = f42			C hi
205	ld8		r23 = [r9], 8
206	;;
207	getf.sig	r27 = f39			C lo
208	getf.sig	r31 = f43			C hi
209	ld8		r20 = [r9], 8
210	br		.Lcj2
211
C n >= 6: load four more limb pairs while starting the first products.  The
C br.cloop below separates n == 6 (fall through) from n >= 10 (.grt6).
212.grt2:	ldf8		f44 = [rp], 8
213	ldf8		f32 = [up], 8
214	;;
215	ldf8		f45 = [rp], 8
216	ldf8		f33 = [up], 8
217	xma.l		f38 = f7, f6, f8
218	xma.hu		f42 = f7, f6, f8
219	;;
220	ldf8		f46 = [rp], 8
221	ldf8		f34 = [up], 8
222	xma.l		f39 = f35, f6, f47
223	xma.hu		f43 = f35, f6, f47
224	;;
225	ldf8		f47 = [rp], 8
226	ldf8		f35 = [up], 8
227	;;
228	getf.sig	r26 = f38			C lo
229	xma.l		f36 = f32, f6, f44
230	ld8		r23 = [r9], 8
231	xma.hu		f40 = f32, f6, f44
232	br.cloop.dptk	.grt6
233
C n == 6: finish the remaining products without further ldf8 loads and join
C the wind-down at .Lcj6.
234	getf.sig	r30 = f42			C hi
235	;;
236	getf.sig	r27 = f39			C lo
237	xma.l		f37 = f33, f6, f45
238	ld8		r20 = [r9], 8
239	xma.hu		f41 = f33, f6, f45
240	;;
241	getf.sig	r31 = f43			C hi
242	getf.sig	r24 = f36			C lo
243	xma.l		f38 = f34, f6, f46
244	ld8		r21 = [r9], 8
245	xma.hu		f42 = f34, f6, f46
246	;;
247	getf.sig	r28 = f40			C hi
248	getf.sig	r25 = f37			C lo
249	xma.l		f39 = f35, f6, f47
250	ld8		r22 = [r9], 8
251	xma.hu		f43 = f35, f6, f47
252	br		.Lcj6
253
C n >= 10: same priming as above but with the next limb pairs being loaded,
C then enter the main loop at its .LL10 entry point.
254.grt6:	ldf8		f44 = [rp], 8
255	getf.sig	r30 = f42			C hi
256	ldf8		f32 = [up], 8
257	;;
258	getf.sig	r27 = f39			C lo
259	xma.l		f37 = f33, f6, f45
260	ld8		r20 = [r9], 8
261	xma.hu		f41 = f33, f6, f45
262	;;
263	ldf8		f45 = [rp], 8
264	getf.sig	r31 = f43			C hi
265	ldf8		f33 = [up], 8
266	;;
267	getf.sig	r24 = f36			C lo
268	xma.l		f38 = f34, f6, f46
269	ld8		r21 = [r9], 8
270	xma.hu		f42 = f34, f6, f46
271	;;
272	ldf8		f46 = [rp], 8
273	getf.sig	r28 = f40			C hi
274	ldf8		f34 = [up], 8
275	;;
276	getf.sig	r25 = f37			C lo
277	xma.l		f39 = f35, f6, f47
278	ld8		r22 = [r9], 8
279	xma.hu		f43 = f35, f6, f47
280	br		.LL10
281
282
C Feed-in for n mod 4 == 3.  Limbs 1 and 2 of both vectors are loaded first;
C br.cloop then separates n == 3 (fall through) from n >= 7 (.grt3).
283.Lb11:	ldf8		f46 = [rp], 8
284	ldf8		f34 = [up], 8
285	;;
286	ldf8		f47 = [rp], 8
287	ldf8		f35 = [up], 8
288	br.cloop.dptk	.grt3
289
C n == 3: three products, then join the wind-down at .Lcj3.
290	xma.l		f37 = f7, f6, f8
291	xma.hu		f41 = f7, f6, f8
292	;;
293	xma.l		f38 = f34, f6, f46
294	xma.hu		f42 = f34, f6, f46
295	;;
296	getf.sig	r25 = f37			C lo
297	xma.l		f39 = f35, f6, f47
298	xma.hu		f43 = f35, f6, f47
299	;;
300	getf.sig	r29 = f41			C hi
301	ld8		r22 = [r9], 8
302	;;
303	getf.sig	r26 = f38			C lo
304	getf.sig	r30 = f42			C hi
305	ld8		r23 = [r9], 8
306	;;
307	getf.sig	r27 = f39			C lo
308	getf.sig	r31 = f43			C hi
309	ld8		r20 = [r9], 8
310	br		.Lcj3
311
C n >= 7: load four more limb pairs while starting the first products.  The
C br.cloop below separates n == 7 (fall through) from n >= 11 (.grt7).
312.grt3:	ldf8		f44 = [rp], 8
313	xma.l		f37 = f7, f6, f8
314	ldf8		f32 = [up], 8
315	xma.hu		f41 = f7, f6, f8
316	;;
317	ldf8		f45 = [rp], 8
318	xma.l		f38 = f34, f6, f46
319	ldf8		f33 = [up], 8
320	xma.hu		f42 = f34, f6, f46
321	;;
322	ldf8		f46 = [rp], 8
323	ldf8		f34 = [up], 8
324	;;
325	getf.sig	r25 = f37			C lo
326	xma.l		f39 = f35, f6, f47
327	ld8		r22 = [r9], 8
328	xma.hu		f43 = f35, f6, f47
329	;;
330	ldf8		f47 = [rp], 8
331	getf.sig	r29 = f41			C hi
332	ldf8		f35 = [up], 8
333	;;
334	getf.sig	r26 = f38			C lo
335	xma.l		f36 = f32, f6, f44
336	ld8		r23 = [r9], 8
337	xma.hu		f40 = f32, f6, f44
338	br.cloop.dptk	.grt7
339	;;
340
C n == 7: continue without further ldf8 loads and join the wind-down at
C .Lcj7, which issues the last product itself.
341	getf.sig	r30 = f42			C hi
342	getf.sig	r27 = f39			C lo
343	xma.l		f37 = f33, f6, f45
344	ld8		r20 = [r9], 8
345	xma.hu		f41 = f33, f6, f45
346	;;
347	getf.sig	r31 = f43			C hi
348	getf.sig	r24 = f36			C lo
349	xma.l		f38 = f34, f6, f46
350	ld8		r21 = [r9], 8
351	xma.hu		f42 = f34, f6, f46
352	br		.Lcj7
353
C n >= 11: same priming with the next limb pairs being loaded, then enter the
C main loop at its .LL11 entry point.
354.grt7:	ldf8		f44 = [rp], 8
355	getf.sig	r30 = f42			C hi
356	ldf8		f32 = [up], 8
357	;;
358	getf.sig	r27 = f39			C lo
359	xma.l		f37 = f33, f6, f45
360	ld8		r20 = [r9], 8
361	xma.hu		f41 = f33, f6, f45
362	;;
363	ldf8		f45 = [rp], 8
364	getf.sig	r31 = f43			C hi
365	ldf8		f33 = [up], 8
366	;;
367	getf.sig	r24 = f36			C lo
368	xma.l		f38 = f34, f6, f46
369	ld8		r21 = [r9], 8
370	xma.hu		f42 = f34, f6, f46
371	br		.LL11
372
373
C Feed-in for n mod 4 == 0.  Limbs 1 to 3 of both vectors are loaded and the
C product for limb 0 is started; br.cloop then separates n == 4 (fall
C through) from n >= 8 (.grt4).
374.Lb00:	ldf8		f45 = [rp], 8
375	ldf8		f33 = [up], 8
376	;;
377	ldf8		f46 = [rp], 8
378	ldf8		f34 = [up], 8
379	;;
380	ldf8		f47 = [rp], 8
381	xma.l		f36 = f7, f6, f8
382	ldf8		f35 = [up], 8
383	xma.hu		f40 = f7, f6, f8
384	br.cloop.dptk	.grt4
385
C n == 4: the remaining three products, then join the wind-down at .Lcj4.
386	xma.l		f37 = f33, f6, f45
387	xma.hu		f41 = f33, f6, f45
388	;;
389	getf.sig	r24 = f36			C lo
390	xma.l		f38 = f34, f6, f46
391	ld8		r21 = [r9], 8
392	xma.hu		f42 = f34, f6, f46
393	;;
394	getf.sig	r28 = f40			C hi
395	xma.l		f39 = f35, f6, f47
396	getf.sig	r25 = f37			C lo
397	ld8		r22 = [r9], 8
398	xma.hu		f43 = f35, f6, f47
399	;;
400	getf.sig	r29 = f41			C hi
401	getf.sig	r26 = f38			C lo
402	ld8		r23 = [r9], 8
403	;;
404	getf.sig	r30 = f42			C hi
405	getf.sig	r27 = f39			C lo
406	ld8		r20 = [r9], 8
407	br		.Lcj4
408
C n >= 8: load four more limb pairs while the products proceed.  The
C br.cloop below separates n == 8 (fall through) from n >= 12 (.grt8).
409.grt4:	ldf8		f44 = [rp], 8
410	xma.l		f37 = f33, f6, f45
411	ldf8		f32 = [up], 8
412	xma.hu		f41 = f33, f6, f45
413	;;
414	ldf8		f45 = [rp], 8
415	ldf8		f33 = [up], 8
416	xma.l		f38 = f34, f6, f46
417	getf.sig	r24 = f36			C lo
418	ld8		r21 = [r9], 8
419	xma.hu		f42 = f34, f6, f46
420	;;
421	ldf8		f46 = [rp], 8
422	getf.sig	r28 = f40			C hi
423	ldf8		f34 = [up], 8
424	xma.l		f39 = f35, f6, f47
425	getf.sig	r25 = f37			C lo
426	ld8		r22 = [r9], 8
427	xma.hu		f43 = f35, f6, f47
428	;;
429	ldf8		f47 = [rp], 8
430	getf.sig	r29 = f41			C hi
431	ldf8		f35 = [up], 8
432	;;
433	getf.sig	r26 = f38			C lo
434	xma.l		f36 = f32, f6, f44
435	ld8		r23 = [r9], 8
436	xma.hu		f40 = f32, f6, f44
437	br.cloop.dptk	.grt8
438	;;
439
C n == 8: one more product here, then join the wind-down at .Lcj8, which
C issues the remaining ones itself.
440	getf.sig	r30 = f42			C hi
441	getf.sig	r27 = f39			C lo
442	xma.l		f37 = f33, f6, f45
443	ld8		r20 = [r9], 8
444	xma.hu		f41 = f33, f6, f45
445	br		.Lcj8
446
C n >= 12: same step with the next limb pair being loaded, then enter the
C main loop at its .LL00 entry point.
447.grt8:	ldf8		f44 = [rp], 8
448	getf.sig	r30 = f42			C hi
449	ldf8		f32 = [up], 8
450	;;
451	getf.sig	r27 = f39			C lo
452	xma.l		f37 = f33, f6, f45
453	ld8		r20 = [r9], 8
454	xma.hu		f41 = f33, f6, f45
455	br		.LL00
456
C Main loop, unrolled four ways: a full pass stores four result limbs through
C r10 while loading limb pairs and issuing xma pairs for limbs further ahead.
C
C Every quarter follows the same pattern:
C   cmp.ltu / sub   result limb r14 = low product half - borrow r8; p6 records
C                   whether that subtraction wrapped
C   sub r8 = ...    new borrow = integer copy of the up limb - high product half
C   (p6) add        add one to the new borrow when the low subtraction wrapped
C
C .LL00, .LL11 and .LL10 are mid-loop entry points used by the feed-in code
C at .grt8, .grt7 and .grt6 respectively; the top is entered from .grt5.
457	ALIGN(32)
458.Loop:
459{.mmi
460	ldf8		f44 = [rp], 8
461	cmp.ltu		p6, p0 = r27, r8	C lo cmp
462	sub		r14 = r27, r8		C lo sub
463}
464{.mmi
465	getf.sig	r30 = f42			C hi
466	ldf8		f32 = [up], 8
467	sub		r8 = r20, r31		C hi sub
468	;;				C 01
469}
470{.mmf
471	getf.sig	r27 = f39			C lo
472	st8		[r10] = r14, 8
473	xma.l		f37 = f33, f6, f45
474}
475{.mfi
476	ld8		r20 = [r9], 8
477	xma.hu		f41 = f33, f6, f45
478   (p6)	add		r8 = 1, r8
479	;;				C 02
480}
C Second quarter; entry point for the n mod 4 == 0 feed-in.
481{.mmi
482.LL00:	ldf8		f45 = [rp], 8
483	cmp.ltu		p6, p0 = r24, r8
484	sub		r14 = r24, r8
485}
486{.mmi
487	getf.sig	r31 = f43			C hi
488	ldf8		f33 = [up], 8
489	sub		r8 = r21, r28
490	;;				C 03
491}
492{.mmf
493	getf.sig	r24 = f36			C lo
494	st8		[r10] = r14, 8
495	xma.l		f38 = f34, f6, f46
496}
497{.mfi
498	ld8		r21 = [r9], 8
499	xma.hu		f42 = f34, f6, f46
500   (p6)	add		r8 = 1, r8
501	;;				C 04
502}
C Third quarter; entry point for the n mod 4 == 3 feed-in.
503{.mmi
504.LL11:	ldf8		f46 = [rp], 8
505	cmp.ltu		p6, p0 = r25, r8
506	sub		r14 = r25, r8
507}
508{.mmi
509	getf.sig	r28 = f40			C hi
510	ldf8		f34 = [up], 8
511	sub		r8 = r22, r29
512	;;				C 05
513}
514{.mmf
515	getf.sig	r25 = f37			C lo
516	st8		[r10] = r14, 8
517	xma.l		f39 = f35, f6, f47
518}
519{.mfi
520	ld8		r22 = [r9], 8
521	xma.hu		f43 = f35, f6, f47
522   (p6)	add		r8 = 1, r8
523	;;				C 06
524}
C Fourth quarter; entry point for the n mod 4 == 2 feed-in.
525{.mmi
526.LL10:	ldf8		f47 = [rp], 8
527	cmp.ltu		p6, p0 = r26, r8
528	sub		r14 = r26, r8
529}
530{.mmi
531	getf.sig	r29 = f41			C hi
532	ldf8		f35 = [up], 8
533	sub		r8 = r23, r30
534	;;				C 07
535}
536{.mmf
537	getf.sig	r26 = f38			C lo
538	st8		[r10] = r14, 8
539	xma.l		f36 = f32, f6, f44
540}
541{.mfi
542	ld8		r23 = [r9], 8
543	xma.hu		f40 = f32, f6, f44
544   (p6)	add		r8 = 1, r8
545}
C Repeat while ar.lc is nonzero; otherwise fall through into .Lend.
546	br.cloop.dptk	.Loop
547	;;
548
C Wind-down.  .Lend is reached when the loop count is exhausted (or directly
C from .grt5); each .LcjK label is also entered directly from the feed-in code
C and has K result limbs left to store.  Every step repeats the pattern of
C the main loop -- result limb = low half - borrow, new borrow = up limb - high
C half, plus one under p6 -- with progressively fewer multiplies and loads as
C the pipeline drains.
549.Lend:
550	cmp.ltu		p6, p0 = r27, r8
551	sub		r14 = r27, r8
552	getf.sig	r30 = f42
553	sub		r8 = r20, r31
554	;;
555	getf.sig	r27 = f39
556	st8		[r10] = r14, 8
557	xma.l		f37 = f33, f6, f45
558	ld8		r20 = [r9], 8
559	xma.hu		f41 = f33, f6, f45
560   (p6)	add		r8 = 1, r8
561	;;
562.Lcj8:
563	cmp.ltu		p6, p0 = r24, r8
564	sub		r14 = r24, r8
565	getf.sig	r31 = f43
566	sub		r8 = r21, r28
567	;;
568	getf.sig	r24 = f36
569	st8		[r10] = r14, 8
570	xma.l		f38 = f34, f6, f46
571	ld8		r21 = [r9], 8
572	xma.hu		f42 = f34, f6, f46
573   (p6)	add		r8 = 1, r8
574	;;
575.Lcj7:
576	cmp.ltu		p6, p0 = r25, r8
577	sub		r14 = r25, r8
578	getf.sig	r28 = f40
579	sub		r8 = r22, r29
580	;;
581	getf.sig	r25 = f37
582	st8		[r10] = r14, 8
583	xma.l		f39 = f35, f6, f47
584	ld8		r22 = [r9], 8
585	xma.hu		f43 = f35, f6, f47
586   (p6)	add		r8 = 1, r8
587	;;
C From here on no further products are issued; only transfers, loads of the
C integer up limbs, and the subtract/borrow steps remain.
588.Lcj6:
589	cmp.ltu		p6, p0 = r26, r8
590	sub		r14 = r26, r8
591	getf.sig	r29 = f41
592	sub		r8 = r23, r30
593	;;
594	getf.sig	r26 = f38
595	st8		[r10] = r14, 8
596	ld8		r23 = [r9], 8
597   (p6)	add		r8 = 1, r8
598	;;
599.Lcj5:
600	cmp.ltu		p6, p0 = r27, r8
601	sub		r14 = r27, r8
602	getf.sig	r30 = f42
603	sub		r8 = r20, r31
604	;;
605	getf.sig	r27 = f39
606	st8		[r10] = r14, 8
607	ld8		r20 = [r9], 8
608   (p6)	add		r8 = 1, r8
609	;;
610.Lcj4:
611	cmp.ltu		p6, p0 = r24, r8
612	sub		r14 = r24, r8
613	getf.sig	r31 = f43
614	sub		r8 = r21, r28
615	;;
616	st8		[r10] = r14, 8
617   (p6)	add		r8 = 1, r8
618	;;
619.Lcj3:
620	cmp.ltu		p6, p0 = r25, r8
621	sub		r14 = r25, r8
622	sub		r8 = r22, r29
623	;;
624	st8		[r10] = r14, 8
625   (p6)	add		r8 = 1, r8
626	;;
627.Lcj2:
628	cmp.ltu		p6, p0 = r26, r8
629	sub		r14 = r26, r8
630	sub		r8 = r23, r30
631	;;
632	st8		[r10] = r14, 8
633   (p6)	add		r8 = 1, r8
634	;;
C Last limb: store it, restore ar.lc from r2, and return with the final
C borrow limb in r8.
635.Lcj1:
636	cmp.ltu		p6, p0 = r27, r8
637	sub		r14 = r27, r8
638	sub		r8 = r20, r31
639	;;
640	st8		[r10] = r14, 8
641	mov		ar.lc = r2
642   (p6)	add		r8 = 1, r8
643	br.ret.sptk.many b0
C vl == 0 exit: restore ar.lc and return.  r8 is still the zero set at entry
C and nothing has been stored through r10.
644.Ldone:	mov		ar.lc = r2
645	br.ret.sptk.many b0
646EPILOGUE()
647ASM_END()
648