xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/mul_1.asm (revision b757af438b42b93f8c6571f026d8b8ef3eaf5fc9)
1dnl  IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
2dnl  store the result in a second limb vector.
3
4dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2006, 2007 Free Software
5dnl  Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C         cycles/limb
25C Itanium:    4.0
26C Itanium 2:  2.0
27
28C TODO
29C  * Further optimize feed-in and wind-down code, both for speed and code size.
30C  * Handle low limb input and results specially, using a common stf8 in the
31C    epilogue.
32C  * Use 1 c/l carry propagation scheme in wind-down code.
33C  * Use extra pointer register for `up' to speed up feed-in loads.
34C  * Work out final differences with addmul_1.asm.
35
36C INPUT PARAMETERS
37define(`rp', `r32')	C destination pointer; product limbs are stored through it
38define(`up', `r33')	C source pointer; limbs are loaded through it, stepping by 8
39define(`n', `r34')	C limb count
40define(`vl', `r35')	C multiplier limb, moved to f6
41define(`cy', `r36')	C carry-in limb, moved to f9; for mpn_mul_1c
42
43ASM_START()
C mpn_mul_1 (rp, up, n, vl)
C
C Multiplies the n-limb vector at up by the single limb vl and stores the n
C least significant limbs of the product at rp.  The most significant limb
C is left in r8 before each br.ret.
C
C Register roles, as used by the code below:
C   f6       multiplier vl
C   f7       first limb loaded from up
C   f9       addend of the first product (f0 here; mpn_mul_1c loads cy)
C   f32-f35  further limbs loaded from up
C   f36-f39  low product halves (xma.l)
C   f40-f43  high product halves (xma.hu)
C   r16-r19  low halves moved to integer registers
C   r20-r23  high halves moved to integer registers (one of them is preset
C            to 0 by each feed-in path for the very first sum)
C   r24      limb sum that is stored through rp
C   p6-p9    carry state: p6 or p8 set means 1 is added to the next sum,
C            p7 or p9 is the complementary predicate
C   r2       saved ar.lc, restored before every return
C   r31      (n - 1) >> 2, copied to ar.lc as the br.cloop count
C
C NOTE(review): the feed-in code appears to assume n >= 1, since n - 1 is
C shifted as an unsigned value to form the loop count; confirm with callers.
44PROLOGUE(mpn_mul_1)
45	.prologue
46	.save	ar.lc, r2
47	.body
48
C With HAVE_ABI_32 defined, rp and up are converted with addp4 and n is
C zero-extended with zxt4 before use.
49ifdef(`HAVE_ABI_32',
50`	addp4		rp = 0, rp		C M I
51	addp4		up = 0, up		C M I
52	zxt4		n = n			C I
53	;;
54')
C r15 = n - 1, f9 = f0 as addend of the first product, r2 = saved ar.lc.
55{.mfi
56	adds		r15 = -1, n		C M I
57	mov		f9 = f0			C F
58	mov.i		r2 = ar.lc		C I0
59}
C Load the first limb into f7, post-incrementing up, and form r14 = n & 3.
60{.mmi
61	ldf8		f7 = [up], 8		C M
62	nop.m		0			C M
63	and		r14 = 3, n		C M I
64	;;
65}
C Shared with mpn_mul_1c, which branches here after loading f9 from cy.
C f6 = vl, r31 = (n - 1) >> 2; p10, p11, p12 flag n & 3 equal to 0, 2, 3.
66.Lcommon:
67{.mii
68	setf.sig	f6 = vl			C M2 M3
69	shr.u		r31 = r15, 2		C I0
70	cmp.eq		p10, p0 = 0, r14	C M I
71}
72{.mii
73	cmp.eq		p11, p0 = 2, r14	C M I
74	cmp.eq		p12, p0 = 3, r14	C M I
75	nop.i		0			C I
76	;;
77}
C Comparing r0 with itself for inequality clears p6 and p8 and sets p7 and
C p9, so the carry state starts out as no carry.  ar.lc receives r31.
78{.mii
79	cmp.ne		p6, p7 = r0, r0		C M I
80	mov.i		ar.lc = r31		C I0
81	cmp.ne		p8, p9 = r0, r0		C M I
82}
C Dispatch on n & 3; the remaining case n & 3 = 1 falls through to .Lb01.
83{.bbb
84  (p10)	br.dptk		.Lb00			C B
85  (p11)	br.dptk		.Lb10			C B
86  (p12)	br.dptk		.Lb11			C B
87	;;
88}
89
C n & 3 = 1.  r20 = 0 is the high-half term added to the first low half
C when the loop is entered at .LL01.
90.Lb01:	mov		r20 = 0
91	br.cloop.dptk	.grt1			C B
92
C n = 1 (loop count was 0): single product f7 * f6 + f9, low half stored
C through rp, high half returned in r8.
93	xma.l		f39 = f7, f6, f9	C F
94	xma.hu		f43 = f7, f6, f9	C F
95	;;
96	getf.sig	r8 = f43		C M2
97	stf8		[rp] = f39		C M2 M3
98	mov.i		ar.lc = r2		C I0
99	br.ret.sptk.many b0			C B
100
C n >= 5: load four more limbs while the first product is formed.
101.grt1:
102	ldf8		f32 = [up], 8
103	;;
104	ldf8		f33 = [up], 8
105	;;
106	ldf8		f34 = [up], 8
107	xma.l		f39 = f7, f6, f9
108	xma.hu		f43 = f7, f6, f9
109	;;
110	ldf8		f35 = [up], 8
111	br.cloop.dptk	.grt5
112
C n = 5: the first low half is stored directly; finish at .Lcj5.
113	xma.l		f36 = f32, f6, f0
114	xma.hu		f40 = f32, f6, f0
115	;;
116	stf8		[rp] = f39, 8
117	xma.l		f37 = f33, f6, f0
118	xma.hu		f41 = f33, f6, f0
119	;;
120	getf.sig	r21 = f43
121	getf.sig	r18 = f36
122	xma.l		f38 = f34, f6, f0
123	xma.hu		f42 = f34, f6, f0
124	;;
125	getf.sig	r22 = f40
126	getf.sig	r19 = f37
127	xma.l		f39 = f35, f6, f0
128	xma.hu		f43 = f35, f6, f0
129	;;
130	getf.sig	r23 = f41
131	getf.sig	r16 = f38
132	br		.Lcj5
133
C n >= 9: prime the pipeline and enter the main loop at .LL01.
134.grt5:
135	xma.l		f36 = f32, f6, f0
136	xma.hu		f40 = f32, f6, f0
137	;;
138	getf.sig	r17 = f39
139	ldf8		f32 = [up], 8
140	xma.l		f37 = f33, f6, f0
141	xma.hu		f41 = f33, f6, f0
142	;;
143	getf.sig	r21 = f43
144	ldf8		f33 = [up], 8
145	xma.l		f38 = f34, f6, f0
146	;;
147	getf.sig	r18 = f36
148	xma.hu		f42 = f34, f6, f0
149	;;
150	getf.sig	r22 = f40
151	ldf8		f34 = [up], 8
152	xma.l		f39 = f35, f6, f0
153	;;
154	getf.sig	r19 = f37
155	xma.hu		f43 = f35, f6, f0
156	br		.LL01
157
158
C n & 3 = 2.  r23 = 0 is the high-half term added to the first low half
C when the loop is entered at .LL10.
159.Lb10:	ldf8		f35 = [up], 8
160	mov		r23 = 0
161	br.cloop.dptk	.grt2
162
C n = 2: the high half of the first product (f42) is the addend of the
C second product, so no integer carry handling is needed.
163	xma.l		f38 = f7, f6, f9
164	xma.hu		f42 = f7, f6, f9
165	;;
166	stf8		[rp] = f38, 8
167	xma.l		f39 = f35, f6, f42
168	xma.hu		f43 = f35, f6, f42
169	;;
170	getf.sig	r8 = f43
171	stf8		[rp] = f39
172	mov.i		ar.lc = r2
173	br.ret.sptk.many b0
174
175
C n >= 6.
176.grt2:
177	ldf8		f32 = [up], 8
178	;;
179	ldf8		f33 = [up], 8
180	xma.l		f38 = f7, f6, f9
181	xma.hu		f42 = f7, f6, f9
182	;;
183	ldf8		f34 = [up], 8
184	xma.l		f39 = f35, f6, f0
185	xma.hu		f43 = f35, f6, f0
186	;;
187	ldf8		f35 = [up], 8
188	br.cloop.dptk	.grt6
189
C n = 6: the first low half is stored directly; finish at .Lcj6.
190	stf8		[rp] = f38, 8
191	xma.l		f36 = f32, f6, f0
192	xma.hu		f40 = f32, f6, f0
193	;;
194	getf.sig	r20 = f42
195	getf.sig	r17 = f39
196	xma.l		f37 = f33, f6, f0
197	xma.hu		f41 = f33, f6, f0
198	;;
199	getf.sig	r21 = f43
200	getf.sig	r18 = f36
201	xma.l		f38 = f34, f6, f0
202	xma.hu		f42 = f34, f6, f0
203	;;
204	getf.sig	r22 = f40
205	getf.sig	r19 = f37
206	xma.l		f39 = f35, f6, f0
207	xma.hu		f43 = f35, f6, f0
208	br		.Lcj6
209
C n >= 10: prime the pipeline and enter the main loop at .LL10.
210.grt6:
211	getf.sig	r16 = f38
212	xma.l		f36 = f32, f6, f0
213	xma.hu		f40 = f32, f6, f0
214	;;
215	getf.sig	r20 = f42
216	ldf8		f32 = [up], 8
217	xma.l		f37 = f33, f6, f0
218	;;
219	getf.sig	r17 = f39
220	xma.hu		f41 = f33, f6, f0
221	;;
222	getf.sig	r21 = f43
223	ldf8		f33 = [up], 8
224	xma.l		f38 = f34, f6, f0
225	;;
226	getf.sig	r18 = f36
227	xma.hu		f42 = f34, f6, f0
228	br		.LL10
229
230
C n & 3 = 3.  r22 = 0 is the high-half term added to the first low half
C when the loop is entered at .LL11.
231.Lb11:	ldf8		f34 = [up], 8
232	mov		r22 = 0
233	;;
234	ldf8		f35 = [up], 8
235	br.cloop.dptk	.grt3
236	;;
237
C n = 3: all three products are formed here; the first low half is stored
C directly and the remaining sums are done at .Lcj3.
238	xma.l		f37 = f7, f6, f9
239	xma.hu		f41 = f7, f6, f9
240	xma.l		f38 = f34, f6, f0
241	xma.hu		f42 = f34, f6, f0
242	xma.l		f39 = f35, f6, f0
243	xma.hu		f43 = f35, f6, f0
244	;;
245	getf.sig	r23 = f41
246	stf8		[rp] = f37, 8
247	getf.sig	r16 = f38
248	getf.sig	r20 = f42
249	getf.sig	r17 = f39
250	getf.sig	r8 = f43
251	br		.Lcj3
252
C n >= 7.
253.grt3:
254	ldf8		f32 = [up], 8
255	xma.l		f37 = f7, f6, f9
256	xma.hu		f41 = f7, f6, f9
257	;;
258	ldf8		f33 = [up], 8
259	xma.l		f38 = f34, f6, f0
260	xma.hu		f42 = f34, f6, f0
261	;;
262	getf.sig	r19 = f37
263	ldf8		f34 = [up], 8
264	xma.l		f39 = f35, f6, f0
265	xma.hu		f43 = f35, f6, f0
266	;;
267	getf.sig	r23 = f41
268	ldf8		f35 = [up], 8
269	br.cloop.dptk	.grt7
270
C n = 7: the first low half (r19) is stored directly; finish at .Lcj7.
271	getf.sig	r16 = f38
272	xma.l		f36 = f32, f6, f0
273	getf.sig	r20 = f42
274	xma.hu		f40 = f32, f6, f0
275	;;
276	getf.sig	r17 = f39
277	xma.l		f37 = f33, f6, f0
278	getf.sig	r21 = f43
279	xma.hu		f41 = f33, f6, f0
280	;;
281	getf.sig	r18 = f36
282	st8		[rp] = r19, 8
283	xma.l		f38 = f34, f6, f0
284	xma.hu		f42 = f34, f6, f0
285	br		.Lcj7
286
C n >= 11: prime the pipeline and enter the main loop at .LL11.
287.grt7:
288	getf.sig	r16 = f38
289	xma.l		f36 = f32, f6, f0
290	xma.hu		f40 = f32, f6, f0
291	;;
292	getf.sig	r20 = f42
293	ldf8		f32 = [up], 8
294	xma.l		f37 = f33, f6, f0
295	;;
296	getf.sig	r17 = f39
297	xma.hu		f41 = f33, f6, f0
298	br		.LL11
299
300
C n & 3 = 0.  r21 = 0 is the high-half term added to the first low half
C when the loop is entered at .LL00.
301.Lb00:	ldf8		f33 = [up], 8
302	mov		r21 = 0
303	;;
304	ldf8		f34 = [up], 8
305	;;
306	ldf8		f35 = [up], 8
307	xma.l		f36 = f7, f6, f9
308	xma.hu		f40 = f7, f6, f9
309	br.cloop.dptk	.grt4
310
C n = 4: the first low half is stored directly; finish at .Lcj4.
311	xma.l		f37 = f33, f6, f0
312	xma.hu		f41 = f33, f6, f0
313	xma.l		f38 = f34, f6, f0
314	xma.hu		f42 = f34, f6, f0
315	;;
316	getf.sig	r22 = f40
317	stf8		[rp] = f36, 8
318	xma.l		f39 = f35, f6, f0
319	getf.sig	r19 = f37
320	xma.hu		f43 = f35, f6, f0
321	;;
322	getf.sig	r23 = f41
323	getf.sig	r16 = f38
324	getf.sig	r20 = f42
325	getf.sig	r17 = f39
326	br		.Lcj4
327
C n >= 8.
328.grt4:
329	ldf8		f32 = [up], 8
330	xma.l		f37 = f33, f6, f0
331	xma.hu		f41 = f33, f6, f0
332	;;
333	getf.sig	r18 = f36
334	ldf8		f33 = [up], 8
335	xma.l		f38 = f34, f6, f0
336	xma.hu		f42 = f34, f6, f0
337	;;
338	getf.sig	r22 = f40
339	ldf8		f34 = [up], 8
340	xma.l		f39 = f35, f6, f0
341	;;
342	getf.sig	r19 = f37
343	getf.sig	r23 = f41
344	xma.hu		f43 = f35, f6, f0
345	ldf8		f35 = [up], 8
346	br.cloop.dptk	.grt8
347
C n = 8: the first low half (r18) is stored directly; finish at .Lcj8.
348	getf.sig	r16 = f38
349	xma.l		f36 = f32, f6, f0
350	getf.sig	r20 = f42
351	xma.hu		f40 = f32, f6, f0
352	;;
353	getf.sig	r17 = f39
354	st8		[rp] = r18, 8
355	xma.l		f37 = f33, f6, f0
356	xma.hu		f41 = f33, f6, f0
357	br		.Lcj8
358
C n >= 12: enter the main loop at .LL00.
359.grt8:
360	getf.sig	r16 = f38
361	xma.l		f36 = f32, f6, f0
362	xma.hu		f40 = f32, f6, f0
363	br		.LL00
364
365
C Each iteration handles four limbs.  Every step forms
C   r24 = low half + previous high half (+ 1 when p6 or p8 is set),
C stores r24 through rp in the following instruction group, and derives the
C next predicate pair by comparing r24 with the low half: leu when 1 was
C added, ltu otherwise.  The carry state alternates between the pairs
C p6/p7 and p8/p9.  .LL00, .LL11, .LL10 and .LL01 are the entry points used
C by the feed-in code above.
366C *** MAIN LOOP START ***
367	ALIGN(32)
368.Loop:
369	.pred.rel "mutex",p6,p7
370	getf.sig	r16 = f38
371	xma.l		f36 = f32, f6, f0
372   (p6)	cmp.leu		p8, p9 = r24, r17
373	st8		[rp] = r24, 8
374	xma.hu		f40 = f32, f6, f0
375   (p7)	cmp.ltu		p8, p9 = r24, r17
376	;;
377.LL00:
378	.pred.rel "mutex",p8,p9
379	getf.sig	r20 = f42
380   (p8)	add		r24 = r18, r21, 1
381	nop.b		0
382	ldf8		f32 = [up], 8
383   (p9)	add		r24 = r18, r21
384	nop.b		0
385	;;
386	.pred.rel "mutex",p8,p9
387	getf.sig	r17 = f39
388	xma.l		f37 = f33, f6, f0
389   (p8)	cmp.leu		p6, p7 = r24, r18
390	st8		[rp] = r24, 8
391	xma.hu		f41 = f33, f6, f0
392   (p9)	cmp.ltu		p6, p7 = r24, r18
393	;;
394.LL11:
395	.pred.rel "mutex",p6,p7
396	getf.sig	r21 = f43
397   (p6)	add		r24 = r19, r22, 1
398	nop.b		0
399	ldf8		f33 = [up], 8
400   (p7)	add		r24 = r19, r22
401	nop.b		0
402	;;
403	.pred.rel "mutex",p6,p7
404	getf.sig	r18 = f36
405	xma.l		f38 = f34, f6, f0
406   (p6)	cmp.leu		p8, p9 = r24, r19
407	st8		[rp] = r24, 8
408	xma.hu		f42 = f34, f6, f0
409   (p7)	cmp.ltu		p8, p9 = r24, r19
410	;;
411.LL10:
412	.pred.rel "mutex",p8,p9
413	getf.sig	r22 = f40
414   (p8)	add		r24 = r16, r23, 1
415	nop.b		0
416	ldf8		f34 = [up], 8
417   (p9)	add		r24 = r16, r23
418	nop.b		0
419	;;
420	.pred.rel "mutex",p8,p9
421	getf.sig	r19 = f37
422	xma.l		f39 = f35, f6, f0
423   (p8)	cmp.leu		p6, p7 = r24, r16
424	st8		[rp] = r24, 8
425	xma.hu		f43 = f35, f6, f0
426   (p9)	cmp.ltu		p6, p7 = r24, r16
427	;;
428.LL01:
429	.pred.rel "mutex",p6,p7
430	getf.sig	r23 = f41
431   (p6)	add		r24 = r17, r20, 1
432	nop.b		0
433	ldf8		f35 = [up], 8
434   (p7)	add		r24 = r17, r20
435	br.cloop.dptk	.Loop
436C *** MAIN LOOP END ***
437	;;
438
C Wind-down: the same sum, store and carry steps as in the loop, but with no
C further loads from up.  .Lcj9 is reached by falling out of the loop;
C .Lcj8 down to .Lcj3 are also branch targets of the feed-in code.
439.Lcj9:
440	.pred.rel "mutex",p6,p7
441	getf.sig	r16 = f38
442	xma.l		f36 = f32, f6, f0
443   (p6)	cmp.leu		p8, p9 = r24, r17
444	st8		[rp] = r24, 8
445	xma.hu		f40 = f32, f6, f0
446   (p7)	cmp.ltu		p8, p9 = r24, r17
447	;;
448	.pred.rel "mutex",p8,p9
449	getf.sig	r20 = f42
450   (p8)	add		r24 = r18, r21, 1
451   (p9)	add		r24 = r18, r21
452	;;
453	.pred.rel "mutex",p8,p9
454	getf.sig	r17 = f39
455	xma.l		f37 = f33, f6, f0
456   (p8)	cmp.leu		p6, p7 = r24, r18
457	st8		[rp] = r24, 8
458	xma.hu		f41 = f33, f6, f0
459   (p9)	cmp.ltu		p6, p7 = r24, r18
460	;;
461.Lcj8:
462	.pred.rel "mutex",p6,p7
463	getf.sig	r21 = f43
464   (p6)	add		r24 = r19, r22, 1
465   (p7)	add		r24 = r19, r22
466	;;
467	.pred.rel "mutex",p6,p7
468	getf.sig	r18 = f36
469	xma.l		f38 = f34, f6, f0
470   (p6)	cmp.leu		p8, p9 = r24, r19
471	st8		[rp] = r24, 8
472	xma.hu		f42 = f34, f6, f0
473   (p7)	cmp.ltu		p8, p9 = r24, r19
474	;;
475.Lcj7:
476	.pred.rel "mutex",p8,p9
477	getf.sig	r22 = f40
478   (p8)	add		r24 = r16, r23, 1
479   (p9)	add		r24 = r16, r23
480	;;
481	.pred.rel "mutex",p8,p9
482	getf.sig	r19 = f37
483	xma.l		f39 = f35, f6, f0
484   (p8)	cmp.leu		p6, p7 = r24, r16
485	st8		[rp] = r24, 8
486	xma.hu		f43 = f35, f6, f0
487   (p9)	cmp.ltu		p6, p7 = r24, r16
488	;;
489.Lcj6:
490	.pred.rel "mutex",p6,p7
491	getf.sig	r23 = f41
492   (p6)	add		r24 = r17, r20, 1
493   (p7)	add		r24 = r17, r20
494	;;
495	.pred.rel "mutex",p6,p7
496   (p6)	cmp.leu		p8, p9 = r24, r17
497   (p7)	cmp.ltu		p8, p9 = r24, r17
498	getf.sig	r16 = f38
499	st8		[rp] = r24, 8
500	;;
501.Lcj5:
502	.pred.rel "mutex",p8,p9
503	getf.sig	r20 = f42
504   (p8)	add		r24 = r18, r21, 1
505   (p9)	add		r24 = r18, r21
506	;;
507	.pred.rel "mutex",p8,p9
508   (p8)	cmp.leu		p6, p7 = r24, r18
509   (p9)	cmp.ltu		p6, p7 = r24, r18
510	getf.sig	r17 = f39
511	st8		[rp] = r24, 8
512	;;
C From here on r8 holds the high half of the last product (f43).
513.Lcj4:
514	.pred.rel "mutex",p6,p7
515	getf.sig	r8 = f43
516   (p6)	add		r24 = r19, r22, 1
517   (p7)	add		r24 = r19, r22
518	;;
519	.pred.rel "mutex",p6,p7
520	st8		[rp] = r24, 8
521   (p6)	cmp.leu		p8, p9 = r24, r19
522   (p7)	cmp.ltu		p8, p9 = r24, r19
523	;;
524.Lcj3:
525	.pred.rel "mutex",p8,p9
526   (p8)	add		r24 = r16, r23, 1
527   (p9)	add		r24 = r16, r23
528	;;
529	.pred.rel "mutex",p8,p9
530	st8		[rp] = r24, 8
531   (p8)	cmp.leu		p6, p7 = r24, r16
532   (p9)	cmp.ltu		p6, p7 = r24, r16
533	;;
534.Lcj2:
535	.pred.rel "mutex",p6,p7
536   (p6)	add		r24 = r17, r20, 1
537   (p7)	add		r24 = r17, r20
538	;;
539	.pred.rel "mutex",p6,p7
540	st8		[rp] = r24, 8
541   (p6)	cmp.leu		p8, p9 = r24, r17
542   (p7)	cmp.ltu		p8, p9 = r24, r17
543	;;
C Add the final carry to the high limb in r8, restore ar.lc and return.
544	.pred.rel "mutex",p8,p9
545   (p8)	add		r8 = 1, r8
546	mov.i		ar.lc = r2
547	br.ret.sptk.many b0
548EPILOGUE()
549
C mpn_mul_1c (rp, up, n, vl, cy)
C
C Entry point taking an extra carry-in limb cy.  It does the same setup as
C the first two bundles of mpn_mul_1 (r15 = n - 1, r2 = saved ar.lc, first
C limb loaded into f7, r14 = n & 3), except that f9 is loaded from cy rather
C than copied from f0.  It then branches to .Lcommon inside mpn_mul_1, where
C f9 is used as the addend of the first product.
550PROLOGUE(mpn_mul_1c)
551	.prologue
552	.save	ar.lc, r2
553	.body
554
C With HAVE_ABI_32 defined, rp and up are converted with addp4 and n is
C zero-extended with zxt4 before use.
555ifdef(`HAVE_ABI_32',
556`	addp4		rp = 0, rp		C M I
557	addp4		up = 0, up		C M I
558	zxt4		n = n			C I
559	;;
560')
561{.mmi
562	adds		r15 = -1, n		C M I
563	setf.sig	f9 = cy			C M2 M3
564	mov.i		r2 = ar.lc		C I0
565}
566{.mmb
567	ldf8		f7 = [up], 8		C M
568	and		r14 = 3, n		C M I
569	br.sptk		.Lcommon
570	;;
571}
572EPILOGUE()
573ASM_END()
574