xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/addmul_1.asm (revision ca453df649ce9db45b64d73678ba06cbccf9aa11)
1dnl  IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2dnl  result to a second limb vector.
3
4dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2007 Free Software
5dnl  Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C         cycles/limb
25C Itanium:    3.0
26C Itanium 2:  2.0
27
28C TODO
29C  * Further optimize feed-in and wind-down code, both for speed and code size.
30C  * Handle low limb input and results specially, using a common stf8 in the
31C    epilogue.
32C  * Use 1 c/l carry propagation scheme in wind-down code.
33C  * Use extra pointer registers for `up' and rp to speed up feed-in loads.
34C  * Work out final differences with mul_1.asm.  That function is 300 bytes
35C    smaller than this due to better loop scheduling and thus simpler feed-in
36C    code.
37
38C INPUT PARAMETERS
C rp (r32): limb vector that is read as the addend and receives the result.
39define(`rp', `r32')
C up (r33): limb vector that is multiplied by vl.
40define(`up', `r33')
C n (r34): limb count; selects the feed-in path (n mod 4) and the loop count.
41define(`n', `r34')
C vl (r35): the single multiplier limb, moved to f6 with setf.sig.
42define(`vl', `r35')
43
44ASM_START()
C mpn_addmul_1 (rp, up, n, vl)
C
C Multiplies the n limbs at up by the limb vl, adds the products to the n
C limbs at rp, stores the n low result limbs back through r20 (a copy of
C rp) and returns the most significant limb, including the last carry, in r8.
C
C NOTE(review): n = 0 is not handled here (the first limb of each operand is
C loaded unconditionally); presumably callers guarantee n >= 1.  Confirm.
C
C Register roles, as used below:
C   f6        vl as a significand
C   f7, f8    first limb of up and of rp
C   f32-f35   limbs loaded from up;   f44-f47  limbs loaded from rp
C   f36-f39   low product halves (xma.l);  f40-f43  high halves (xma.hu)
C   r24-r27   low halves moved from f36-f39 with getf.sig
C   r28-r31   high halves moved from f40-f43 with getf.sig
C   r14, r16  result limbs on their way to memory
C   r20       store pointer; rp itself is advanced by the post-increment loads
C   r2        saved ar.lc, restored before every return
C   p6/p7 and p8/p9  complementary predicate pairs; p6 (resp. p8) set means
C             the previous addition produced a carry
C
C Carry scheme: each result limb is (high half of product i-1) + (low half
C of product i) + carry-in.  With a carry-in the add uses the +1 form and
C the carry-out test is cmp.leu; without one the test is cmp.ltu.  The two
C predicate pairs alternate so that each compare feeds the following add.
45PROLOGUE(mpn_addmul_1)
46	.prologue
47	.save	ar.lc, r2
48	.body
49
C For the 32-bit ABI, widen the two pointers and zero extend n.
50ifdef(`HAVE_ABI_32',
51`	addp4		rp = 0, rp		C M I
52	addp4		up = 0, up		C M I
53	zxt4		n = n			C I
54	;;
55')
C r15 = n - 1, r20 = store pointer, r2 = incoming ar.lc.
56{.mmi
57	adds		r15 = -1, n		C M I
58	mov		r20 = rp		C M I
59	mov.i		r2 = ar.lc		C I0
60}
C Load the first limb of each operand; r14 = n mod 4.
61{.mmi
62	ldf8		f7 = [up], 8		C M
63	ldf8		f8 = [rp], 8		C M
64	and		r14 = 3, n		C M I
65	;;
66}
C f6 = vl; p10 is set when n mod 4 = 0; r31 = (n - 1) / 4 is the loop count.
67{.mmi
68	setf.sig	f6 = vl			C M2 M3
69	cmp.eq		p10, p0 = 0, r14	C M I
70	shr.u		r31 = r15, 2		C I0
71}
C p11 is set when n mod 4 = 2, p12 when n mod 4 = 3.
72{.mmi
73	cmp.eq		p11, p0 = 2, r14	C M I
74	cmp.eq		p12, p0 = 3, r14	C M I
75	nop.i		0			C I
76	;;
77}
C r0 never differs from itself, so these compares clear p6 and p8 and set
C p7 and p9: no carry is pending when the additions start.
78{.mii
79	cmp.ne		p6, p7 = r0, r0		C M I
80	mov.i		ar.lc = r31		C I0
81	cmp.ne		p8, p9 = r0, r0		C M I
82}
C Dispatch on n mod 4; the remaining case (n mod 4 = 1) falls through.
83{.bbb
84  (p10)	br.dptk		.Lb00			C B
85  (p11)	br.dptk		.Lb10			C B
86  (p12)	br.dptk		.Lb11			C B
87	;;
88}
89
C n mod 4 = 1.  br.cloop is taken (decrementing ar.lc) when n >= 5.
90.Lb01:	br.cloop.dptk	.grt1			C B
91
C n = 1: one multiply-add, store the low half, return the high half.
92	xma.l		f39 = f7, f6, f8	C F
93	xma.hu		f43 = f7, f6, f8	C F
94	;;
95	getf.sig	r8 = f43		C M2
96	stf8		[r20] = f39		C M2 M3
97	mov.i		ar.lc = r2		C I0
98	br.ret.sptk.many b0			C B
99
C n >= 5: load four more limb pairs and start the first product.
100.grt1:
101	ldf8		f32 = [up], 8
102	ldf8		f44 = [rp], 8
103	;;
104	ldf8		f33 = [up], 8
105	ldf8		f45 = [rp], 8
106	;;
107	ldf8		f34 = [up], 8
108	xma.l		f39 = f7, f6, f8
109	ldf8		f46 = [rp], 8
110	xma.hu		f43 = f7, f6, f8
111	;;
112	ldf8		f35 = [up], 8
113	ldf8		f47 = [rp], 8
114	br.cloop.dptk	.grt5
115
C n = 5: the first low limb is stored directly; the rest is finished in the
C wind-down code starting at .Lcj5.
116	xma.l		f36 = f32, f6, f44
117	xma.hu		f40 = f32, f6, f44
118	;;
119	stf8		[r20] = f39, 8
120	xma.l		f37 = f33, f6, f45
121	xma.hu		f41 = f33, f6, f45
122	;;
123	getf.sig	r31 = f43
124	getf.sig	r24 = f36
125	xma.l		f38 = f34, f6, f46
126	xma.hu		f42 = f34, f6, f46
127	;;
128	getf.sig	r28 = f40
129	getf.sig	r25 = f37
130	xma.l		f39 = f35, f6, f47
131	xma.hu		f43 = f35, f6, f47
132	;;
133	getf.sig	r29 = f41
134	getf.sig	r26 = f38
135	br		.Lcj5
136
C n >= 9: r30 = 0 stands in for the high half of a preceding product, so the
C first addition at .Loop or .Le0 passes the first low limb (r27) through.
137.grt5:
138	mov		r30 = 0
139	xma.l		f36 = f32, f6, f44
140	xma.hu		f40 = f32, f6, f44
141	;;
142	ldf8		f32 = [up], 8
143	xma.l		f37 = f33, f6, f45
144	ldf8		f44 = [rp], 8
145	xma.hu		f41 = f33, f6, f45
146	;;
147	ldf8		f33 = [up], 8
148	getf.sig	r27 = f39
149	;;
150	getf.sig	r31 = f43
151	xma.l		f38 = f34, f6, f46
152	ldf8		f45 = [rp], 8
153	xma.hu		f42 = f34, f6, f46
154	;;
155	ldf8		f34 = [up], 8
156	getf.sig	r24 = f36
157	;;
158	getf.sig	r28 = f40
159	xma.l		f39 = f35, f6, f47
160	ldf8		f46 = [rp], 8
161	xma.hu		f43 = f35, f6, f47
162	;;
163	ldf8		f35 = [up], 8
164	getf.sig	r25 = f37
C Enter the main loop when ar.lc is still nonzero, else go to the wind-down.
165	br.cloop.dptk	.Loop
166	br		.Le0
167
168
C n mod 4 = 2.  br.cloop is taken when n >= 6.
169.Lb10:	ldf8		f35 = [up], 8
170	ldf8		f47 = [rp], 8
171	br.cloop.dptk	.grt2
172
C n = 2: both products computed here; the last addition is done at .Lcj2.
173	xma.l		f38 = f7, f6, f8
174	xma.hu		f42 = f7, f6, f8
175	;;
176	xma.l		f39 = f35, f6, f47
177	xma.hu		f43 = f35, f6, f47
178	;;
179	getf.sig	r30 = f42
180	stf8		[r20] = f38, 8
181	getf.sig	r27 = f39
182	getf.sig	r8 = f43
183	br		.Lcj2
184
C n >= 6.
185.grt2:
186	ldf8		f32 = [up], 8
187	ldf8		f44 = [rp], 8
188	;;
189	ldf8		f33 = [up], 8
190	xma.l		f38 = f7, f6, f8
191	ldf8		f45 = [rp], 8
192	xma.hu		f42 = f7, f6, f8
193	;;
194	ldf8		f34 = [up], 8
195	xma.l		f39 = f35, f6, f47
196	ldf8		f46 = [rp], 8
197	xma.hu		f43 = f35, f6, f47
198	;;
199	ldf8		f35 = [up], 8
200	ldf8		f47 = [rp], 8
201	br.cloop.dptk	.grt6
202
C n = 6: first low limb stored directly, remainder finished from .Lcj6.
203	stf8		[r20] = f38, 8
204	xma.l		f36 = f32, f6, f44
205	xma.hu		f40 = f32, f6, f44
206	;;
207	getf.sig	r30 = f42
208	getf.sig	r27 = f39
209	xma.l		f37 = f33, f6, f45
210	xma.hu		f41 = f33, f6, f45
211	;;
212	getf.sig	r31 = f43
213	getf.sig	r24 = f36
214	xma.l		f38 = f34, f6, f46
215	xma.hu		f42 = f34, f6, f46
216	;;
217	getf.sig	r28 = f40
218	getf.sig	r25 = f37
219	xma.l		f39 = f35, f6, f47
220	xma.hu		f43 = f35, f6, f47
221	br		.Lcj6
222
C n >= 10: r29 = 0 plays the role of a preceding high half; the loop is
C entered at .LL10, whose addition is r29 + r26.
223.grt6:
224	mov		r29 = 0
225	xma.l		f36 = f32, f6, f44
226	xma.hu		f40 = f32, f6, f44
227	;;
228	ldf8		f32 = [up], 8
229	getf.sig	r26 = f38
230	;;
231	getf.sig	r30 = f42
232	xma.l		f37 = f33, f6, f45
233	ldf8		f44 = [rp], 8
234	xma.hu		f41 = f33, f6, f45
235	;;
236	ldf8		f33 = [up], 8
237	getf.sig	r27 = f39
238	;;
239	getf.sig	r31 = f43
240	xma.l		f38 = f34, f6, f46
241	ldf8		f45 = [rp], 8
242	xma.hu		f42 = f34, f6, f46
243	;;
244	ldf8		f34 = [up], 8
245	getf.sig	r24 = f36
246	br		.LL10
247
248
C n mod 4 = 3.  br.cloop is taken when n >= 7.
249.Lb11:	ldf8		f34 = [up], 8
250	ldf8		f46 = [rp], 8
251	;;
252	ldf8		f35 = [up], 8
253	ldf8		f47 = [rp], 8
254	br.cloop.dptk	.grt3
255	;;
256
C n = 3: all three products computed here, additions done from .Lcj3.
257	xma.l		f37 = f7, f6, f8
258	xma.hu		f41 = f7, f6, f8
259	xma.l		f38 = f34, f6, f46
260	xma.hu		f42 = f34, f6, f46
261	xma.l		f39 = f35, f6, f47
262	xma.hu		f43 = f35, f6, f47
263	;;
264	getf.sig	r29 = f41
265	stf8		[r20] = f37, 8
266	getf.sig	r26 = f38
267	getf.sig	r30 = f42
268	getf.sig	r27 = f39
269	getf.sig	r8 = f43
270	br		.Lcj3
271
C n >= 7.
272.grt3:
273	ldf8		f32 = [up], 8
274	xma.l		f37 = f7, f6, f8
275	ldf8		f44 = [rp], 8
276	xma.hu		f41 = f7, f6, f8
277	;;
278	ldf8		f33 = [up], 8
279	xma.l		f38 = f34, f6, f46
280	ldf8		f45 = [rp], 8
281	xma.hu		f42 = f34, f6, f46
282	;;
283	ldf8		f34 = [up], 8
284	xma.l		f39 = f35, f6, f47
285	ldf8		f46 = [rp], 8
286	xma.hu		f43 = f35, f6, f47
287	;;
288	ldf8		f35 = [up], 8
289	getf.sig	r25 = f37		C FIXME
290	ldf8		f47 = [rp], 8
291	br.cloop.dptk	.grt7
292
C n = 7: first low limb stored directly, remainder finished from .Lcj7.
293	getf.sig	r29 = f41
294	stf8		[r20] = f37, 8		C FIXME
295	xma.l		f36 = f32, f6, f44
296	getf.sig	r26 = f38
297	xma.hu		f40 = f32, f6, f44
298	;;
299	getf.sig	r30 = f42
300	xma.l		f37 = f33, f6, f45
301	getf.sig	r27 = f39
302	xma.hu		f41 = f33, f6, f45
303	;;
304	getf.sig	r31 = f43
305	xma.l		f38 = f34, f6, f46
306	getf.sig	r24 = f36
307	xma.hu		f42 = f34, f6, f46
308	br		.Lcj7
309
C n >= 11: r28 = 0 plays the role of a preceding high half; the loop is
C entered at .LL11, whose addition is r28 + r25.
310.grt7:
311	getf.sig	r29 = f41
312	xma.l		f36 = f32, f6, f44
313	mov		r28 = 0
314	xma.hu		f40 = f32, f6, f44
315	;;
316	ldf8		f32 = [up], 8
317	getf.sig	r26 = f38
318	;;
319	getf.sig	r30 = f42
320	xma.l		f37 = f33, f6, f45
321	ldf8		f44 = [rp], 8
322	xma.hu		f41 = f33, f6, f45
323	;;
324	ldf8		f33 = [up], 8
325	getf.sig	r27 = f39
326	br		.LL11
327
328
C n mod 4 = 0.  br.cloop is taken when n >= 8.
329.Lb00:	ldf8		f33 = [up], 8
330	ldf8		f45 = [rp], 8
331	;;
332	ldf8		f34 = [up], 8
333	ldf8		f46 = [rp], 8
334	;;
335	ldf8		f35 = [up], 8
336	xma.l		f36 = f7, f6, f8
337	ldf8		f47 = [rp], 8
338	xma.hu		f40 = f7, f6, f8
339	br.cloop.dptk	.grt4
340
C n = 4: first low limb stored directly, remainder finished from .Lcj4.
341	xma.l		f37 = f33, f6, f45
342	xma.hu		f41 = f33, f6, f45
343	xma.l		f38 = f34, f6, f46
344	xma.hu		f42 = f34, f6, f46
345	;;
346	getf.sig	r28 = f40
347	stf8		[r20] = f36, 8
348	xma.l		f39 = f35, f6, f47
349	getf.sig	r25 = f37
350	xma.hu		f43 = f35, f6, f47
351	;;
352	getf.sig	r29 = f41
353	getf.sig	r26 = f38
354	getf.sig	r30 = f42
355	getf.sig	r27 = f39
356	br		.Lcj4
357
C n >= 8.
358.grt4:
359	ldf8		f32 = [up], 8
360	xma.l		f37 = f33, f6, f45
361	ldf8		f44 = [rp], 8
362	xma.hu		f41 = f33, f6, f45
363	;;
364	ldf8		f33 = [up], 8
365	xma.l		f38 = f34, f6, f46
366	ldf8		f45 = [rp], 8
367	xma.hu		f42 = f34, f6, f46
368	;;
369	ldf8		f34 = [up], 8
370	getf.sig	r24 = f36		C FIXME
371	xma.l		f39 = f35, f6, f47
372	ldf8		f46 = [rp], 8
373	getf.sig	r28 = f40
374	xma.hu		f43 = f35, f6, f47
375	;;
376	ldf8		f35 = [up], 8
377	getf.sig	r25 = f37
378	ldf8		f47 = [rp], 8
379	br.cloop.dptk	.grt8
380
C n = 8: first low limb stored directly, remainder finished from .Lcj8.
381	getf.sig	r29 = f41
382	stf8		[r20] = f36, 8		C FIXME
383	xma.l		f36 = f32, f6, f44
384	getf.sig	r26 = f38
385	getf.sig	r30 = f42
386	xma.hu		f40 = f32, f6, f44
387	;;
388	xma.l		f37 = f33, f6, f45
389	getf.sig	r27 = f39
390	xma.hu		f41 = f33, f6, f45
391	br		.Lcj8
392
C n >= 12: r31 = 0 plays the role of a preceding high half; the loop is
C entered at .LL00, whose addition is r31 + r24.
393.grt8:
394	getf.sig	r29 = f41
395	xma.l		f36 = f32, f6, f44
396	mov		r31 = 0
397	xma.hu		f40 = f32, f6, f44
398	;;
399	ldf8		f32 = [up], 8
400	getf.sig	r26 = f38
401	br		.LL00
402
403
C Each pass through the loop stores four result limbs.  It consists of four
C stages of the same shape (starting at .Loop, .LL00, .LL11 and .LL10):
C move a high half to an integer register, start the next product, add a
C high half to the following low half under the incoming carry predicate,
C load the next limbs, compute the outgoing carry predicate, and store the
C sum.  The feed-in code enters at whichever stage matches n mod 4.
404C *** MAIN LOOP START ***
405	ALIGN(32)				C insn	fed	cycle #
406.Loop:
407	.pred.rel "mutex", p6, p7		C num	by	i1 i2
408	getf.sig	r29 = f41		C 00	16	0   0
409	xma.l		f36 = f32, f6, f44	C 01	06,15	0   0
410   (p6)	add		r14 = r30, r27, 1	C 02		0   0
411	ldf8		f47 = [rp], 8		C 03		0   0
412	xma.hu		f40 = f32, f6, f44	C 04	06,15	0   0
413   (p7)	add		r14 = r30, r27		C 05		0   0
414	;;
415	.pred.rel "mutex", p6, p7
416	ldf8		f32 = [up], 8		C 06		1   1
417   (p6)	cmp.leu		p8, p9 = r14, r27	C 07		1   1
418   (p7)	cmp.ltu		p8, p9 = r14, r27	C 08		1   1
419	getf.sig	r26 = f38		C 09	25	2   1
420	st8		[r20] = r14, 8		C 10		2   1
421	nop.b		0			C 11		2   1
422	;;
423.LL00:
424	.pred.rel "mutex", p8, p9
425	getf.sig	r30 = f42		C 12	28	3   2
426	xma.l		f37 = f33, f6, f45	C 13	18,27	3   2
427   (p8)	add		r16 = r31, r24, 1	C 14		3   2
428	ldf8		f44 = [rp], 8		C 15		3   2
429	xma.hu		f41 = f33, f6, f45	C 16	18,27	3   2
430   (p9)	add		r16 = r31, r24		C 17		3   2
431	;;
432	.pred.rel "mutex", p8, p9
433	ldf8		f33 = [up], 8		C 18		4   3
434   (p8)	cmp.leu		p6, p7 = r16, r24	C 19		4   3
435   (p9)	cmp.ltu		p6, p7 = r16, r24	C 20		4   3
436	getf.sig	r27 = f39		C 21	37	5   3
437	st8		[r20] = r16, 8		C 22		5   3
438	nop.b		0			C 23		5   3
439	;;
440.LL11:
441	.pred.rel "mutex", p6, p7
442	getf.sig	r31 = f43		C 24	40	6   4
443	xma.l		f38 = f34, f6, f46	C 25	30,39	6   4
444   (p6)	add		r14 = r28, r25, 1	C 26		6   4
445	ldf8		f45 = [rp], 8		C 27		6   4
446	xma.hu		f42 = f34, f6, f46	C 28	30,39	6   4
447   (p7)	add		r14 = r28, r25		C 29		6   4
448	;;
449	.pred.rel "mutex", p6, p7
450	ldf8		f34 = [up], 8		C 30		7   5
451   (p6)	cmp.leu		p8, p9 = r14, r25	C 31		7   5
452   (p7)	cmp.ltu		p8, p9 = r14, r25	C 32		7   5
453	getf.sig	r24 = f36		C 33	01	8   5
454	st8		[r20] = r14, 8		C 34		8   5
455	nop.b		0			C 35		8   5
456	;;
457.LL10:
458	.pred.rel "mutex", p8, p9
459	getf.sig	r28 = f40		C 36	04	9   6
460	xma.l		f39 = f35, f6, f47	C 37	42,03	9   6
461   (p8)	add		r16 = r29, r26, 1	C 38		9   6
462	ldf8		f46 = [rp], 8		C 39		9   6
463	xma.hu		f43 = f35, f6, f47	C 40	42,03	9   6
464   (p9)	add		r16 = r29, r26		C 41		9   6
465	;;
466	.pred.rel "mutex", p8, p9
467	ldf8		f35 = [up], 8		C 42	       10   7
468   (p8)	cmp.leu		p6, p7 = r16, r26	C 43	       10   7
469   (p9)	cmp.ltu		p6, p7 = r16, r26	C 44	       10   7
470	getf.sig	r25 = f37		C 45	13     11   7
471	st8		[r20] = r16, 8		C 46	       11   7
472	br.cloop.dptk	.Loop			C 47	       11   7
473C *** MAIN LOOP END ***
474	;;
C Wind-down.  The same add / compare / store stages as in the loop, with no
C further loads from up, finishing the products that are already in flight.
C The feed-in paths for n = 2 .. 8 branch straight to .Lcj2 .. .Lcj8.
475.Le0:
476	.pred.rel "mutex", p6, p7
477	getf.sig	r29 = f41		C
478	xma.l		f36 = f32, f6, f44	C
479   (p6)	add		r14 = r30, r27, 1	C
480	ldf8		f47 = [rp], 8		C
481	xma.hu		f40 = f32, f6, f44	C
482   (p7)	add		r14 = r30, r27		C
483	;;
484	.pred.rel "mutex", p6, p7
485   (p6)	cmp.leu		p8, p9 = r14, r27	C
486   (p7)	cmp.ltu		p8, p9 = r14, r27	C
487	getf.sig	r26 = f38		C
488	st8		[r20] = r14, 8		C
489	;;
490	.pred.rel "mutex", p8, p9
491	getf.sig	r30 = f42		C
492	xma.l		f37 = f33, f6, f45	C
493   (p8)	add		r16 = r31, r24, 1	C
494	xma.hu		f41 = f33, f6, f45	C
495   (p9)	add		r16 = r31, r24		C
496	;;
497	.pred.rel "mutex", p8, p9
498   (p8)	cmp.leu		p6, p7 = r16, r24	C
499   (p9)	cmp.ltu		p6, p7 = r16, r24	C
500	getf.sig	r27 = f39		C
501	st8		[r20] = r16, 8		C
502	;;
503.Lcj8:
504	.pred.rel "mutex", p6, p7
505	getf.sig	r31 = f43		C
506	xma.l		f38 = f34, f6, f46	C
507   (p6)	add		r14 = r28, r25, 1	C
508	xma.hu		f42 = f34, f6, f46	C
509   (p7)	add		r14 = r28, r25		C
510	;;
511	.pred.rel "mutex", p6, p7
512   (p6)	cmp.leu		p8, p9 = r14, r25	C
513   (p7)	cmp.ltu		p8, p9 = r14, r25	C
514	getf.sig	r24 = f36		C
515	st8		[r20] = r14, 8		C
516	;;
517.Lcj7:
518	.pred.rel "mutex", p8, p9
519	getf.sig	r28 = f40		C
520	xma.l		f39 = f35, f6, f47	C
521   (p8)	add		r16 = r29, r26, 1	C
522	xma.hu		f43 = f35, f6, f47	C
523   (p9)	add		r16 = r29, r26		C
524	;;
525	.pred.rel "mutex", p8, p9
526   (p8)	cmp.leu		p6, p7 = r16, r26	C
527   (p9)	cmp.ltu		p6, p7 = r16, r26	C
528	getf.sig	r25 = f37		C
529	st8		[r20] = r16, 8		C
530	;;
C From here on no new products are started; only moves, adds and stores.
531.Lcj6:
532	.pred.rel "mutex", p6, p7
533	getf.sig	r29 = f41		C
534   (p6)	add		r14 = r30, r27, 1	C
535   (p7)	add		r14 = r30, r27		C
536	;;
537	.pred.rel "mutex", p6, p7
538   (p6)	cmp.leu		p8, p9 = r14, r27	C
539   (p7)	cmp.ltu		p8, p9 = r14, r27	C
540	getf.sig	r26 = f38		C
541	st8		[r20] = r14, 8		C
542	;;
543.Lcj5:
544	.pred.rel "mutex", p8, p9
545	getf.sig	r30 = f42		C
546   (p8)	add		r16 = r31, r24, 1	C
547   (p9)	add		r16 = r31, r24		C
548	;;
549	.pred.rel "mutex", p8, p9
550   (p8)	cmp.leu		p6, p7 = r16, r24	C
551   (p9)	cmp.ltu		p6, p7 = r16, r24	C
552	getf.sig	r27 = f39		C
553	st8		[r20] = r16, 8		C
554	;;
C r8 receives the high half of the last product; the final carry is added
C to it just before returning.
555.Lcj4:
556	.pred.rel "mutex", p6, p7
557	getf.sig	r8 = f43		C
558   (p6)	add		r14 = r28, r25, 1	C
559   (p7)	add		r14 = r28, r25		C
560	;;
561	.pred.rel "mutex", p6, p7
562	st8		[r20] = r14, 8		C
563   (p6)	cmp.leu		p8, p9 = r14, r25	C
564   (p7)	cmp.ltu		p8, p9 = r14, r25	C
565	;;
566.Lcj3:
567	.pred.rel "mutex", p8, p9
568   (p8)	add		r16 = r29, r26, 1	C
569   (p9)	add		r16 = r29, r26		C
570	;;
571	.pred.rel "mutex", p8, p9
572	st8		[r20] = r16, 8		C
573   (p8)	cmp.leu		p6, p7 = r16, r26	C
574   (p9)	cmp.ltu		p6, p7 = r16, r26	C
575	;;
576.Lcj2:
577	.pred.rel "mutex", p6, p7
578   (p6)	add		r14 = r30, r27, 1	C
579   (p7)	add		r14 = r30, r27		C
580	;;
581	.pred.rel "mutex", p6, p7
C Last result limb: stored without post-increment.
582	st8		[r20] = r14		C
583   (p6)	cmp.leu		p8, p9 = r14, r27	C
584   (p7)	cmp.ltu		p8, p9 = r14, r27	C
585	;;
C Fold the last carry into the return value, restore ar.lc and return.
586   (p8)	add		r8 = 1, r8		C M I
587	mov.i		ar.lc = r2		C I0
588	br.ret.sptk.many b0			C B
589EPILOGUE()
590ASM_END()
591