xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/addmul_1.asm (revision 92e958de60c71aa0f2452bd7074cbb006fe6546b)
1dnl  IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2dnl  result to a second limb vector.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund.
5
6dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2007 Free Software
7dnl  Foundation, Inc.
8
9dnl  This file is part of the GNU MP Library.
10
11dnl  The GNU MP Library is free software; you can redistribute it and/or modify
12dnl  it under the terms of the GNU Lesser General Public License as published
13dnl  by the Free Software Foundation; either version 3 of the License, or (at
14dnl  your option) any later version.
15
16dnl  The GNU MP Library is distributed in the hope that it will be useful, but
17dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
19dnl  License for more details.
20
21dnl  You should have received a copy of the GNU Lesser General Public License
22dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
23
24include(`../config.m4')
25
26C         cycles/limb
27C Itanium:    3.0
28C Itanium 2:  2.0
29
30C TODO
31C  * Further optimize feed-in and wind-down code, both for speed and code size.
32C  * Handle low limb input and results specially, using a common stf8 in the
33C    epilogue.
34C  * Use 1 c/l carry propagation scheme in wind-down code.
35C  * Use extra pointer registers for `up' and rp to speed up feed-in loads.
36C  * Work out final differences with mul_1.asm.  That function is 300 bytes
37C    smaller than this due to better loop scheduling and thus simpler feed-in
38C    code.
39
40C INPUT PARAMETERS
C  rp  vector that is read limb by limb and updated in place with the sums
C  up  vector whose limbs are multiplied by vl
C  n   number of limbs; the code loads the first limb of each vector
C      unconditionally, so n is presumably required to be at least 1
C      -- TODO confirm against the mpn interface
C  vl  the single limb multiplier, moved into f6 with setf.sig
41define(`rp', `r32')
42define(`up', `r33')
43define(`n', `r34')
44define(`vl', `r35')
45
46ASM_START()
C mpn_addmul_1: for each of the n limbs compute up[i] * vl + rp[i] plus the
C carry from the previous limb, store the low 64 bits back through r20 (a copy
C of the original rp) and leave the final carry limb in r8.
C
C Register usage visible in this routine:
C   f6        the multiplier vl
C   f7, f8    first limb of up and of rp, loaded ahead of the dispatch
C   f32-f35   limbs loaded from up;  f44-f47 limbs loaded from rp
C   f36-f39   low product halves (xma.l);  f40-f43 high halves (xma.hu)
C   r24-r27   integer copies of f36-f39;  r28-r31 integer copies of f40-f43
C   r14, r16  sums about to be stored
C   p6/p7 and p8/p9  alternating carry / no-carry predicate pairs
C   r2        saved ar.lc, restored before every return
47PROLOGUE(mpn_addmul_1)
48	.prologue
49	.save	ar.lc, r2
50	.body
51
C For the 32-bit ABI the pointer arguments go through addp4 and n is zero
C extended from 32 bits before anything else uses them.
52ifdef(`HAVE_ABI_32',
53`	addp4		rp = 0, rp		C M I
54	addp4		up = 0, up		C M I
55	zxt4		n = n			C I
56	;;
57')
C r15 = n-1, r20 = store pointer (copy of rp, since rp itself is advanced by
C the post-incrementing loads), r2 = incoming value of ar.lc.
58{.mmi
59	adds		r15 = -1, n		C M I
60	mov		r20 = rp		C M I
61	mov.i		r2 = ar.lc		C I0
62}
C Load the first limb of each vector; r14 = n mod 4 selects the feed-in path.
63{.mmi
64	ldf8		f7 = [up], 8		C M
65	ldf8		f8 = [rp], 8		C M
66	and		r14 = 3, n		C M I
67	;;
68}
C f6 = vl for the xma instructions; r31 = (n-1)/4 becomes the loop count.
69{.mmi
70	setf.sig	f6 = vl			C M2 M3
71	cmp.eq		p10, p0 = 0, r14	C M I
72	shr.u		r31 = r15, 2		C I0
73}
74{.mmi
75	cmp.eq		p11, p0 = 2, r14	C M I
76	cmp.eq		p12, p0 = 3, r14	C M I
77	nop.i		0			C I
78	;;
79}
C Comparing r0 with itself for inequality clears p6 and p8 and sets p7 and p9:
C no carry is pending on entry.  ar.lc is loaded with the loop count.
80{.mii
81	cmp.ne		p6, p7 = r0, r0		C M I
82	mov.i		ar.lc = r31		C I0
83	cmp.ne		p8, p9 = r0, r0		C M I
84}
C Dispatch on n mod 4: 0 -> .Lb00, 2 -> .Lb10, 3 -> .Lb11, and 1 falls through
C to .Lb01.
85{.bbb
86  (p10)	br.dptk		.Lb00			C B
87  (p11)	br.dptk		.Lb10			C B
88  (p12)	br.dptk		.Lb11			C B
89	;;
90}
91
C Feed-in for n mod 4 == 1.  ar.lc holds (n-1)/4, so this br.cloop is taken
C when n >= 5 and falls through when n == 1.
92.Lb01:	br.cloop.dptk	.grt1			C B
93
C n == 1: a single multiply-add.  The low half is stored through r20, the high
C half is returned in r8, and ar.lc is restored from r2.
94	xma.l		f39 = f7, f6, f8	C F
95	xma.hu		f43 = f7, f6, f8	C F
96	;;
97	getf.sig	r8 = f43		C M2
98	stf8		[r20] = f39		C M2 M3
99	mov.i		ar.lc = r2		C I0
100	br.ret.sptk.many b0			C B
101
C n >= 5: load four more limb pairs and start the product of the first limb.
C The br.cloop below is taken when n >= 9.
102.grt1:
103	ldf8		f32 = [up], 8
104	ldf8		f44 = [rp], 8
105	;;
106	ldf8		f33 = [up], 8
107	ldf8		f45 = [rp], 8
108	;;
109	ldf8		f34 = [up], 8
110	xma.l		f39 = f7, f6, f8
111	ldf8		f46 = [rp], 8
112	xma.hu		f43 = f7, f6, f8
113	;;
114	ldf8		f35 = [up], 8
115	ldf8		f47 = [rp], 8
116	br.cloop.dptk	.grt5
117
C n == 5: the first low limb is stored directly with stf8, the remaining four
C products are started, and the wind-down code is joined at .Lcj5.
118	xma.l		f36 = f32, f6, f44
119	xma.hu		f40 = f32, f6, f44
120	;;
121	stf8		[r20] = f39, 8
122	xma.l		f37 = f33, f6, f45
123	xma.hu		f41 = f33, f6, f45
124	;;
125	getf.sig	r31 = f43
126	getf.sig	r24 = f36
127	xma.l		f38 = f34, f6, f46
128	xma.hu		f42 = f34, f6, f46
129	;;
130	getf.sig	r28 = f40
131	getf.sig	r25 = f37
132	xma.l		f39 = f35, f6, f47
133	xma.hu		f43 = f35, f6, f47
134	;;
135	getf.sig	r29 = f41
136	getf.sig	r26 = f38
137	br		.Lcj5
138
C n >= 9: r30 is zeroed so that the first add at .Loop or .Le0 (r30 + r27 with
C no carry pending) stores the first low limb unchanged.  The final br.cloop
C enters the main loop when n >= 13; for n == 9 control goes to .Le0.
139.grt5:
140	mov		r30 = 0
141	xma.l		f36 = f32, f6, f44
142	xma.hu		f40 = f32, f6, f44
143	;;
144	ldf8		f32 = [up], 8
145	xma.l		f37 = f33, f6, f45
146	ldf8		f44 = [rp], 8
147	xma.hu		f41 = f33, f6, f45
148	;;
149	ldf8		f33 = [up], 8
150	getf.sig	r27 = f39
151	;;
152	getf.sig	r31 = f43
153	xma.l		f38 = f34, f6, f46
154	ldf8		f45 = [rp], 8
155	xma.hu		f42 = f34, f6, f46
156	;;
157	ldf8		f34 = [up], 8
158	getf.sig	r24 = f36
159	;;
160	getf.sig	r28 = f40
161	xma.l		f39 = f35, f6, f47
162	ldf8		f46 = [rp], 8
163	xma.hu		f43 = f35, f6, f47
164	;;
165	ldf8		f35 = [up], 8
166	getf.sig	r25 = f37
167	br.cloop.dptk	.Loop
168	br		.Le0
169
170
C Feed-in for n mod 4 == 2.  The second limb pair is loaded into f35/f47; the
C br.cloop is taken when n >= 6 and falls through when n == 2.
171.Lb10:	ldf8		f35 = [up], 8
172	ldf8		f47 = [rp], 8
173	br.cloop.dptk	.grt2
174
C n == 2: two multiply-adds.  The first low limb is stored with stf8; r30 gets
C the first high half, r27 the second low half and r8 the second high half
C before joining the wind-down code at .Lcj2.
175	xma.l		f38 = f7, f6, f8
176	xma.hu		f42 = f7, f6, f8
177	;;
178	xma.l		f39 = f35, f6, f47
179	xma.hu		f43 = f35, f6, f47
180	;;
181	getf.sig	r30 = f42
182	stf8		[r20] = f38, 8
183	getf.sig	r27 = f39
184	getf.sig	r8 = f43
185	br		.Lcj2
186
C n >= 6: load four more limb pairs while the first two products are started.
C The br.cloop below is taken when n >= 10.
187.grt2:
188	ldf8		f32 = [up], 8
189	ldf8		f44 = [rp], 8
190	;;
191	ldf8		f33 = [up], 8
192	xma.l		f38 = f7, f6, f8
193	ldf8		f45 = [rp], 8
194	xma.hu		f42 = f7, f6, f8
195	;;
196	ldf8		f34 = [up], 8
197	xma.l		f39 = f35, f6, f47
198	ldf8		f46 = [rp], 8
199	xma.hu		f43 = f35, f6, f47
200	;;
201	ldf8		f35 = [up], 8
202	ldf8		f47 = [rp], 8
203	br.cloop.dptk	.grt6
204
C n == 6: store the first low limb with stf8, start the remaining four
C products and join the wind-down code at .Lcj6.
205	stf8		[r20] = f38, 8
206	xma.l		f36 = f32, f6, f44
207	xma.hu		f40 = f32, f6, f44
208	;;
209	getf.sig	r30 = f42
210	getf.sig	r27 = f39
211	xma.l		f37 = f33, f6, f45
212	xma.hu		f41 = f33, f6, f45
213	;;
214	getf.sig	r31 = f43
215	getf.sig	r24 = f36
216	xma.l		f38 = f34, f6, f46
217	xma.hu		f42 = f34, f6, f46
218	;;
219	getf.sig	r28 = f40
220	getf.sig	r25 = f37
221	xma.l		f39 = f35, f6, f47
222	xma.hu		f43 = f35, f6, f47
223	br		.Lcj6
224
C n >= 10: r29 is zeroed and r26 receives the first low limb, so the add at
C .LL10 (r29 + r26 with no carry pending) stores that limb unchanged.  The
C main loop is entered in the middle, at .LL10.
225.grt6:
226	mov		r29 = 0
227	xma.l		f36 = f32, f6, f44
228	xma.hu		f40 = f32, f6, f44
229	;;
230	ldf8		f32 = [up], 8
231	getf.sig	r26 = f38
232	;;
233	getf.sig	r30 = f42
234	xma.l		f37 = f33, f6, f45
235	ldf8		f44 = [rp], 8
236	xma.hu		f41 = f33, f6, f45
237	;;
238	ldf8		f33 = [up], 8
239	getf.sig	r27 = f39
240	;;
241	getf.sig	r31 = f43
242	xma.l		f38 = f34, f6, f46
243	ldf8		f45 = [rp], 8
244	xma.hu		f42 = f34, f6, f46
245	;;
246	ldf8		f34 = [up], 8
247	getf.sig	r24 = f36
248	br		.LL10
249
250
C Feed-in for n mod 4 == 3.  The second and third limb pairs are loaded; the
C br.cloop is taken when n >= 7 and falls through when n == 3.
251.Lb11:	ldf8		f34 = [up], 8
252	ldf8		f46 = [rp], 8
253	;;
254	ldf8		f35 = [up], 8
255	ldf8		f47 = [rp], 8
256	br.cloop.dptk	.grt3
257	;;
258
C n == 3: three multiply-adds.  The first low limb is stored with stf8 and the
C remaining halves are moved to integer registers before joining the wind-down
C code at .Lcj3, with the last high half already in r8.
259	xma.l		f37 = f7, f6, f8
260	xma.hu		f41 = f7, f6, f8
261	xma.l		f38 = f34, f6, f46
262	xma.hu		f42 = f34, f6, f46
263	xma.l		f39 = f35, f6, f47
264	xma.hu		f43 = f35, f6, f47
265	;;
266	getf.sig	r29 = f41
267	stf8		[r20] = f37, 8
268	getf.sig	r26 = f38
269	getf.sig	r30 = f42
270	getf.sig	r27 = f39
271	getf.sig	r8 = f43
272	br		.Lcj3
273
C n >= 7: load four more limb pairs while the first three products are
C started.  The br.cloop below is taken when n >= 11.
C NOTE(review): the two FIXME lines fetch the first low limb into r25 and also
C store it with stf8.  On the n == 7 path r25 is rewritten at .Lcj7 before it
C is read, so the early getf.sig looks redundant there -- presumably this is
C what the FIXME refers to; confirm.
274.grt3:
275	ldf8		f32 = [up], 8
276	xma.l		f37 = f7, f6, f8
277	ldf8		f44 = [rp], 8
278	xma.hu		f41 = f7, f6, f8
279	;;
280	ldf8		f33 = [up], 8
281	xma.l		f38 = f34, f6, f46
282	ldf8		f45 = [rp], 8
283	xma.hu		f42 = f34, f6, f46
284	;;
285	ldf8		f34 = [up], 8
286	xma.l		f39 = f35, f6, f47
287	ldf8		f46 = [rp], 8
288	xma.hu		f43 = f35, f6, f47
289	;;
290	ldf8		f35 = [up], 8
291	getf.sig	r25 = f37		C FIXME
292	ldf8		f47 = [rp], 8
293	br.cloop.dptk	.grt7
294
C n == 7: store the first low limb, start the next three products and join
C the wind-down code at .Lcj7, which starts the last product.
295	getf.sig	r29 = f41
296	stf8		[r20] = f37, 8		C FIXME
297	xma.l		f36 = f32, f6, f44
298	getf.sig	r26 = f38
299	xma.hu		f40 = f32, f6, f44
300	;;
301	getf.sig	r30 = f42
302	xma.l		f37 = f33, f6, f45
303	getf.sig	r27 = f39
304	xma.hu		f41 = f33, f6, f45
305	;;
306	getf.sig	r31 = f43
307	xma.l		f38 = f34, f6, f46
308	getf.sig	r24 = f36
309	xma.hu		f42 = f34, f6, f46
310	br		.Lcj7
311
C n >= 11: r28 is zeroed so that the add at .LL11 (r28 + r25 with no carry
C pending) stores the first low limb unchanged.  The main loop is entered in
C the middle, at .LL11.
312.grt7:
313	getf.sig	r29 = f41
314	xma.l		f36 = f32, f6, f44
315	mov		r28 = 0
316	xma.hu		f40 = f32, f6, f44
317	;;
318	ldf8		f32 = [up], 8
319	getf.sig	r26 = f38
320	;;
321	getf.sig	r30 = f42
322	xma.l		f37 = f33, f6, f45
323	ldf8		f44 = [rp], 8
324	xma.hu		f41 = f33, f6, f45
325	;;
326	ldf8		f33 = [up], 8
327	getf.sig	r27 = f39
328	br		.LL11
329
330
C Feed-in for n mod 4 == 0.  Three more limb pairs are loaded and the first
C product is started; the br.cloop is taken when n >= 8 and falls through
C when n == 4.
331.Lb00:	ldf8		f33 = [up], 8
332	ldf8		f45 = [rp], 8
333	;;
334	ldf8		f34 = [up], 8
335	ldf8		f46 = [rp], 8
336	;;
337	ldf8		f35 = [up], 8
338	xma.l		f36 = f7, f6, f8
339	ldf8		f47 = [rp], 8
340	xma.hu		f40 = f7, f6, f8
341	br.cloop.dptk	.grt4
342
C n == 4: start the remaining three products, store the first low limb with
C stf8 and join the wind-down code at .Lcj4, which fetches the last high half
C into r8.
343	xma.l		f37 = f33, f6, f45
344	xma.hu		f41 = f33, f6, f45
345	xma.l		f38 = f34, f6, f46
346	xma.hu		f42 = f34, f6, f46
347	;;
348	getf.sig	r28 = f40
349	stf8		[r20] = f36, 8
350	xma.l		f39 = f35, f6, f47
351	getf.sig	r25 = f37
352	xma.hu		f43 = f35, f6, f47
353	;;
354	getf.sig	r29 = f41
355	getf.sig	r26 = f38
356	getf.sig	r30 = f42
357	getf.sig	r27 = f39
358	br		.Lcj4
359
C n >= 8: load four more limb pairs while products two to four are started.
C The br.cloop below is taken when n >= 12.
C NOTE(review): the two FIXME lines fetch the first low limb into r24 and also
C store it with stf8.  On the n == 8 path r24 is rewritten at .Lcj8 before it
C is read, so the early getf.sig looks redundant there -- presumably this is
C what the FIXME refers to; confirm.
360.grt4:
361	ldf8		f32 = [up], 8
362	xma.l		f37 = f33, f6, f45
363	ldf8		f44 = [rp], 8
364	xma.hu		f41 = f33, f6, f45
365	;;
366	ldf8		f33 = [up], 8
367	xma.l		f38 = f34, f6, f46
368	ldf8		f45 = [rp], 8
369	xma.hu		f42 = f34, f6, f46
370	;;
371	ldf8		f34 = [up], 8
372	getf.sig	r24 = f36		C FIXME
373	xma.l		f39 = f35, f6, f47
374	ldf8		f46 = [rp], 8
375	getf.sig	r28 = f40
376	xma.hu		f43 = f35, f6, f47
377	;;
378	ldf8		f35 = [up], 8
379	getf.sig	r25 = f37
380	ldf8		f47 = [rp], 8
381	br.cloop.dptk	.grt8
382
C n == 8: store the first low limb, start the next two products and join the
C wind-down code at .Lcj8, which starts the last two products.
383	getf.sig	r29 = f41
384	stf8		[r20] = f36, 8		C FIXME
385	xma.l		f36 = f32, f6, f44
386	getf.sig	r26 = f38
387	getf.sig	r30 = f42
388	xma.hu		f40 = f32, f6, f44
389	;;
390	xma.l		f37 = f33, f6, f45
391	getf.sig	r27 = f39
392	xma.hu		f41 = f33, f6, f45
393	br		.Lcj8
394
C n >= 12: r31 is zeroed so that the add at .LL00 (r31 + r24 with no carry
C pending) stores the first low limb unchanged.  The main loop is entered in
C the middle, at .LL00.
395.grt8:
396	getf.sig	r29 = f41
397	xma.l		f36 = f32, f6, f44
398	mov		r31 = 0
399	xma.hu		f40 = f32, f6, f44
400	;;
401	ldf8		f32 = [up], 8
402	getf.sig	r26 = f38
403	br		.LL00
404
405
406C *** MAIN LOOP START ***
C Each pass handles four limbs in four similar groups.  A group
C   - moves earlier product halves from floating point to integer registers
C     (getf.sig),
C   - starts the next multiply-add up[i] * vl + rp[i] (xma.l / xma.hu) and
C     loads further limbs from up and rp,
C   - adds a low product half to the high half of the preceding limb, plus 1
C     when the incoming carry predicate is set, and
C   - derives the outgoing carry with cmp.leu (carry in) or cmp.ltu (no carry
C     in) and stores the sum through r20 with st8.
C The predicate pairs p6/p7 and p8/p9 alternate between groups: one pair is
C the carry into a group, the other the carry out of it.
C .LL00, .LL11 and .LL10 are mid-loop entry points used by the feed-in code.
C The right-hand columns appear to give an instruction number, the numbers of
C the instructions it is fed by, and the issue cycle on Itanium (i1) and
C Itanium 2 (i2) -- TODO confirm the column meaning.
407	ALIGN(32)				C insn	fed	cycle #
408.Loop:
409	.pred.rel "mutex", p6, p7		C num	by	i1 i2
410	getf.sig	r29 = f41		C 00	16	0   0
411	xma.l		f36 = f32, f6, f44	C 01	06,15	0   0
412   (p6)	add		r14 = r30, r27, 1	C 02		0   0
413	ldf8		f47 = [rp], 8		C 03		0   0
414	xma.hu		f40 = f32, f6, f44	C 04	06,15	0   0
415   (p7)	add		r14 = r30, r27		C 05		0   0
416	;;
417	.pred.rel "mutex", p6, p7
418	ldf8		f32 = [up], 8		C 06		1   1
419   (p6)	cmp.leu		p8, p9 = r14, r27	C 07		1   1
420   (p7)	cmp.ltu		p8, p9 = r14, r27	C 08		1   1
421	getf.sig	r26 = f38		C 09	25	2   1
422	st8		[r20] = r14, 8		C 10		2   1
423	nop.b		0			C 11		2   1
424	;;
425.LL00:
426	.pred.rel "mutex", p8, p9
427	getf.sig	r30 = f42		C 12	28	3   2
428	xma.l		f37 = f33, f6, f45	C 13	18,27	3   2
429   (p8)	add		r16 = r31, r24, 1	C 14		3   2
430	ldf8		f44 = [rp], 8		C 15		3   2
431	xma.hu		f41 = f33, f6, f45	C 16	18,27	3   2
432   (p9)	add		r16 = r31, r24		C 17		3   2
433	;;
434	.pred.rel "mutex", p8, p9
435	ldf8		f33 = [up], 8		C 18		4   3
436   (p8)	cmp.leu		p6, p7 = r16, r24	C 19		4   3
437   (p9)	cmp.ltu		p6, p7 = r16, r24	C 20		4   3
438	getf.sig	r27 = f39		C 21	37	5   3
439	st8		[r20] = r16, 8		C 22		5   3
440	nop.b		0			C 23		5   3
441	;;
442.LL11:
443	.pred.rel "mutex", p6, p7
444	getf.sig	r31 = f43		C 24	40	6   4
445	xma.l		f38 = f34, f6, f46	C 25	30,39	6   4
446   (p6)	add		r14 = r28, r25, 1	C 26		6   4
447	ldf8		f45 = [rp], 8		C 27		6   4
448	xma.hu		f42 = f34, f6, f46	C 28	30,39	6   4
449   (p7)	add		r14 = r28, r25		C 29		6   4
450	;;
451	.pred.rel "mutex", p6, p7
452	ldf8		f34 = [up], 8		C 30		7   5
453   (p6)	cmp.leu		p8, p9 = r14, r25	C 31		7   5
454   (p7)	cmp.ltu		p8, p9 = r14, r25	C 32		7   5
455	getf.sig	r24 = f36		C 33	01	8   5
456	st8		[r20] = r14, 8		C 34		8   5
457	nop.b		0			C 35		8   5
458	;;
459.LL10:
460	.pred.rel "mutex", p8, p9
461	getf.sig	r28 = f40		C 36	04	9   6
462	xma.l		f39 = f35, f6, f47	C 37	42,03	9   6
463   (p8)	add		r16 = r29, r26, 1	C 38		9   6
464	ldf8		f46 = [rp], 8		C 39		9   6
465	xma.hu		f43 = f35, f6, f47	C 40	42,03	9   6
466   (p9)	add		r16 = r29, r26		C 41		9   6
467	;;
468	.pred.rel "mutex", p8, p9
469	ldf8		f35 = [up], 8		C 42	       10   7
470   (p8)	cmp.leu		p6, p7 = r16, r26	C 43	       10   7
471   (p9)	cmp.ltu		p6, p7 = r16, r26	C 44	       10   7
472	getf.sig	r25 = f37		C 45	13     11   7
473	st8		[r20] = r16, 8		C 46	       11   7
474	br.cloop.dptk	.Loop			C 47	       11   7
475C *** MAIN LOOP END ***
476	;;
C Wind-down code.  .Le0 is reached when the loop count runs out (or directly
C from .grt5); it repeats the steps of the main loop but loads only the one
C remaining rp limb and no further up limbs.  Each later stage starts fewer
C products, until only carry propagation and stores remain.
C The .LcjN labels are where the feed-in code joins when n == N, for N from
C 8 down to 2.
477.Le0:
478	.pred.rel "mutex", p6, p7
479	getf.sig	r29 = f41		C
480	xma.l		f36 = f32, f6, f44	C
481   (p6)	add		r14 = r30, r27, 1	C
482	ldf8		f47 = [rp], 8		C
483	xma.hu		f40 = f32, f6, f44	C
484   (p7)	add		r14 = r30, r27		C
485	;;
486	.pred.rel "mutex", p6, p7
487   (p6)	cmp.leu		p8, p9 = r14, r27	C
488   (p7)	cmp.ltu		p8, p9 = r14, r27	C
489	getf.sig	r26 = f38		C
490	st8		[r20] = r14, 8		C
491	;;
492	.pred.rel "mutex", p8, p9
493	getf.sig	r30 = f42		C
494	xma.l		f37 = f33, f6, f45	C
495   (p8)	add		r16 = r31, r24, 1	C
496	xma.hu		f41 = f33, f6, f45	C
497   (p9)	add		r16 = r31, r24		C
498	;;
499	.pred.rel "mutex", p8, p9
500   (p8)	cmp.leu		p6, p7 = r16, r24	C
501   (p9)	cmp.ltu		p6, p7 = r16, r24	C
502	getf.sig	r27 = f39		C
503	st8		[r20] = r16, 8		C
504	;;
C The last two products (f34 and f35 limbs) are started in .Lcj8 and .Lcj7.
505.Lcj8:
506	.pred.rel "mutex", p6, p7
507	getf.sig	r31 = f43		C
508	xma.l		f38 = f34, f6, f46	C
509   (p6)	add		r14 = r28, r25, 1	C
510	xma.hu		f42 = f34, f6, f46	C
511   (p7)	add		r14 = r28, r25		C
512	;;
513	.pred.rel "mutex", p6, p7
514   (p6)	cmp.leu		p8, p9 = r14, r25	C
515   (p7)	cmp.ltu		p8, p9 = r14, r25	C
516	getf.sig	r24 = f36		C
517	st8		[r20] = r14, 8		C
518	;;
519.Lcj7:
520	.pred.rel "mutex", p8, p9
521	getf.sig	r28 = f40		C
522	xma.l		f39 = f35, f6, f47	C
523   (p8)	add		r16 = r29, r26, 1	C
524	xma.hu		f43 = f35, f6, f47	C
525   (p9)	add		r16 = r29, r26		C
526	;;
527	.pred.rel "mutex", p8, p9
528   (p8)	cmp.leu		p6, p7 = r16, r26	C
529   (p9)	cmp.ltu		p6, p7 = r16, r26	C
530	getf.sig	r25 = f37		C
531	st8		[r20] = r16, 8		C
532	;;
C From here on no multiplies are started; the remaining product halves are
C fetched with getf.sig and summed with carry.
533.Lcj6:
534	.pred.rel "mutex", p6, p7
535	getf.sig	r29 = f41		C
536   (p6)	add		r14 = r30, r27, 1	C
537   (p7)	add		r14 = r30, r27		C
538	;;
539	.pred.rel "mutex", p6, p7
540   (p6)	cmp.leu		p8, p9 = r14, r27	C
541   (p7)	cmp.ltu		p8, p9 = r14, r27	C
542	getf.sig	r26 = f38		C
543	st8		[r20] = r14, 8		C
544	;;
545.Lcj5:
546	.pred.rel "mutex", p8, p9
547	getf.sig	r30 = f42		C
548   (p8)	add		r16 = r31, r24, 1	C
549   (p9)	add		r16 = r31, r24		C
550	;;
551	.pred.rel "mutex", p8, p9
552   (p8)	cmp.leu		p6, p7 = r16, r24	C
553   (p9)	cmp.ltu		p6, p7 = r16, r24	C
554	getf.sig	r27 = f39		C
555	st8		[r20] = r16, 8		C
556	;;
C r8 receives the high half of the last product; it becomes the return value
C once the final carry has been added below.
557.Lcj4:
558	.pred.rel "mutex", p6, p7
559	getf.sig	r8 = f43		C
560   (p6)	add		r14 = r28, r25, 1	C
561   (p7)	add		r14 = r28, r25		C
562	;;
563	.pred.rel "mutex", p6, p7
564	st8		[r20] = r14, 8		C
565   (p6)	cmp.leu		p8, p9 = r14, r25	C
566   (p7)	cmp.ltu		p8, p9 = r14, r25	C
567	;;
568.Lcj3:
569	.pred.rel "mutex", p8, p9
570   (p8)	add		r16 = r29, r26, 1	C
571   (p9)	add		r16 = r29, r26		C
572	;;
573	.pred.rel "mutex", p8, p9
574	st8		[r20] = r16, 8		C
575   (p8)	cmp.leu		p6, p7 = r16, r26	C
576   (p9)	cmp.ltu		p6, p7 = r16, r26	C
577	;;
C Last limb: stored without post-increment.  A carry out of this final add
C increments r8, then ar.lc is restored from r2 and the routine returns.
578.Lcj2:
579	.pred.rel "mutex", p6, p7
580   (p6)	add		r14 = r30, r27, 1	C
581   (p7)	add		r14 = r30, r27		C
582	;;
583	.pred.rel "mutex", p6, p7
584	st8		[r20] = r14		C
585   (p6)	cmp.leu		p8, p9 = r14, r27	C
586   (p7)	cmp.ltu		p8, p9 = r14, r27	C
587	;;
588   (p8)	add		r8 = 1, r8		C M I
589	mov.i		ar.lc = r2		C I0
590	br.ret.sptk.many b0			C B
591EPILOGUE()
592ASM_END()
593