dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/mul_1.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
dnl  IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
dnl  store the result in a second limb vector.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    4.0
C Itanium 2:  2.0

C TODO
C  * Further optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Use 1 c/l carry propagation scheme in wind-down code.
C  * Use extra pointer register for `up' to speed up feed-in loads.
C  * Work out final differences with addmul_1.asm.

C INPUT PARAMETERS
C m4 names for the incoming argument registers r32-r36.
define(`rp', `r32')	C destination pointer, stored through with post-increment
define(`up', `r33')	C source pointer, loaded through with post-increment
define(`n', `r34')	C limb count
define(`vl', `r35')	C multiplier limb
define(`cy', `r36')	C for mpn_mul_1c

ASM_START()
C mpn_mul_1 (rp, up, n, vl)
C
C Multiply the n limbs at up by the single limb vl and store the n low
C result limbs at rp.  The most significant limb of the product is left in
C r8 when the function returns.
C
C NOTE(review): n = 0 is not handled here (n - 1 would wrap before the
C unsigned shift); callers are presumably required to pass n >= 1.
C
C State set up by the entry code, shared with mpn_mul_1c which joins at
C .Lcommon:
C   f9     addend for the first product (f0, i.e. zero, in this entry point)
C   f7     first source limb, already loaded; up has been advanced by 8
C   r15    n - 1
C   r14    n mod 4, selects one of the four feed-in sequences
C   r2     saved ar.lc, restored before every return
PROLOGUE(mpn_mul_1)
	.prologue
	.save	ar.lc, r2
	.body

C With the 32-bit ABI, widen the two pointers (addp4) and zero-extend the
C count (zxt4) before using them.
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
{.mfi
	adds		r15 = -1, n		C M I
	mov		f9 = f0			C F
	mov.i		r2 = ar.lc		C I0
}
{.mmi
	ldf8		f7 = [up], 8		C M
	nop.m		0			C M
	and		r14 = 3, n		C M I
	;;
}
C Common path.  f6 receives the multiplier, r31 = (n-1)/4 becomes the loop
C count, and p10/p11/p12 flag n mod 4 equal to 0/2/3.
.Lcommon:
{.mii
	setf.sig	f6 = vl			C M2 M3
	shr.u		r31 = r15, 2		C I0
	cmp.eq		p10, p0 = 0, r14	C M I
}
{.mii
	cmp.eq		p11, p0 = 2, r14	C M I
	cmp.eq		p12, p0 = 3, r14	C M I
	nop.i		0			C I
	;;
}
C cmp.ne r0, r0 is never true, so both carry predicate pairs start out in
C the "no carry" state: p6 = p8 = 0 and p7 = p9 = 1.
{.mii
	cmp.ne		p6, p7 = r0, r0		C M I
	mov.i		ar.lc = r31		C I0
	cmp.ne		p8, p9 = r0, r0		C M I
}
C Dispatch on n mod 4: 0 -> .Lb00, 2 -> .Lb10, 3 -> .Lb11.  When none of
C the three branches is taken (n mod 4 = 1) execution falls through to the
C code that follows this bundle.
{.bbb
  (p10)	br.dptk		.Lb00			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C Feed-in for n mod 4 = 1.
C
C r20 = 0 stands in for the high half of a previous product that does not
C exist; it is consumed by the first add when the pipeline is entered at
C .LL01.  Each br.cloop is taken only while ar.lc is non-zero, so the two
C fall-through paths below are the n = 1 and n = 5 cases.
.Lb01:	mov		r20 = 0
	br.cloop.dptk	.grt1			C B

C n = 1: one product (plus the addend in f9); low half to memory, high half
C to r8, restore ar.lc and return.
	xma.l		f39 = f7, f6, f9	C F
	xma.hu		f43 = f7, f6, f9	C F
	;;
	getf.sig	r8 = f43		C M2
	stf8		[rp] = f39		C M2 M3
	mov.i		ar.lc = r2		C I0
	br.ret.sptk.many b0			C B

C More than one limb: load four more source limbs and form the first
C product, which is the only one that uses the f9 addend.
.grt1:
	ldf8		f32 = [up], 8
	;;
	ldf8		f33 = [up], 8
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f7, f6, f9
	xma.hu		f43 = f7, f6, f9
	;;
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt5

C n = 5: the first low half is stored straight from f39; the remaining
C four products are formed and handed to the wind-down code at .Lcj5.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	stf8		[rp] = f39, 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	getf.sig	r16 = f38
	br		.Lcj5

C More than five limbs: keep loading while the products are formed, then
C join the main loop at .LL01.  The first low half travels in r17 and is
C added to the zero in r20 there.
.grt5:
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r17 = f39
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	;;
	getf.sig	r18 = f36
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	;;
	getf.sig	r19 = f37
	xma.hu		f43 = f35, f6, f0
	br		.LL01


C Feed-in for n mod 4 = 2.
C
C The second source limb goes to f35.  r23 = 0 stands in for the high half
C of a previous product that does not exist; it is consumed by the first
C add when the pipeline is entered at .LL10.
.Lb10:	ldf8		f35 = [up], 8
	mov		r23 = 0
	br.cloop.dptk	.grt2

C n = 2: done entirely in the floating-point unit.  The high half of the
C first product (f42) is the addend of the second product, so no integer
C carry handling is needed.  High half of the second product goes to r8.
	xma.l		f38 = f7, f6, f9
	xma.hu		f42 = f7, f6, f9
	;;
	stf8		[rp] = f38, 8
	xma.l		f39 = f35, f6, f42
	xma.hu		f43 = f35, f6, f42
	;;
	getf.sig	r8 = f43
	stf8		[rp] = f39
	mov.i		ar.lc = r2
	br.ret.sptk.many b0


C More than two limbs: load four more source limbs while the first two
C products are formed.  Only the first product uses the f9 addend.
.grt2:
	ldf8		f32 = [up], 8
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f7, f6, f9
	xma.hu		f42 = f7, f6, f9
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt6

C n = 6: the first low half is stored straight from f38; the rest is handed
C to the wind-down code at .Lcj6.
	stf8		[rp] = f38, 8
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r20 = f42
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	br		.Lcj6

C More than six limbs: join the main loop at .LL10.  The first low half
C travels in r16 and is added to the zero in r23 there.
.grt6:
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r20 = f42
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	;;
	getf.sig	r17 = f39
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r21 = f43
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	;;
	getf.sig	r18 = f36
	xma.hu		f42 = f34, f6, f0
	br		.LL10


C Feed-in for n mod 4 = 3.
C
C The second and third source limbs go to f34 and f35.  r22 = 0 stands in
C for the high half of a previous product that does not exist; it is
C consumed by the first add when the pipeline is entered at .LL11.
.Lb11:	ldf8		f34 = [up], 8
	mov		r22 = 0
	;;
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt3
	;;

C n = 3: form all three products, store the first low half directly, leave
C the top high half in r8 and let the wind-down code at .Lcj3 do the two
C remaining additions.
	xma.l		f37 = f7, f6, f9
	xma.hu		f41 = f7, f6, f9
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	stf8		[rp] = f37, 8
	getf.sig	r16 = f38
	getf.sig	r20 = f42
	getf.sig	r17 = f39
	getf.sig	r8 = f43
	br		.Lcj3

C More than three limbs: load four more source limbs while the first three
C products are formed.  Only the first product uses the f9 addend; its low
C half is moved to r19.
.grt3:
	ldf8		f32 = [up], 8
	xma.l		f37 = f7, f6, f9
	xma.hu		f41 = f7, f6, f9
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r19 = f37
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt7

C n = 7: the first low half is stored from r19; the rest is handed to the
C wind-down code at .Lcj7.
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	getf.sig	r20 = f42
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
	getf.sig	r21 = f43
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r18 = f36
	st8		[rp] = r19, 8
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	br		.Lcj7

C More than seven limbs: join the main loop at .LL11, where r19 is added
C to the zero in r22.
.grt7:
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r20 = f42
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	;;
	getf.sig	r17 = f39
	xma.hu		f41 = f33, f6, f0
	br		.LL11


C Feed-in for n mod 4 = 0.
C
C Source limbs two to four go to f33, f34 and f35.  r21 = 0 stands in for
C the high half of a previous product that does not exist; it is consumed
C by the first add when the pipeline is entered at .LL00.  Only the first
C product (f36/f40) uses the f9 addend.
.Lb00:	ldf8		f33 = [up], 8
	mov		r21 = 0
	;;
	ldf8		f34 = [up], 8
	;;
	ldf8		f35 = [up], 8
	xma.l		f36 = f7, f6, f9
	xma.hu		f40 = f7, f6, f9
	br.cloop.dptk	.grt4

C n = 4: form the other three products, store the first low half straight
C from f36 and hand over to the wind-down code at .Lcj4.
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	stf8		[rp] = f36, 8
	xma.l		f39 = f35, f6, f0
	getf.sig	r19 = f37
	xma.hu		f43 = f35, f6, f0
	;;
	getf.sig	r23 = f41
	getf.sig	r16 = f38
	getf.sig	r20 = f42
	getf.sig	r17 = f39
	br		.Lcj4

C More than four limbs: load four more source limbs while products two to
C four are formed.  The first low half is moved to r18.
.grt4:
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	;;
	getf.sig	r18 = f36
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	;;
	getf.sig	r22 = f40
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f0
	;;
	getf.sig	r19 = f37
	getf.sig	r23 = f41
	xma.hu		f43 = f35, f6, f0
	ldf8		f35 = [up], 8
	br.cloop.dptk	.grt8

C n = 8: the first low half is stored from r18; the rest is handed to the
C wind-down code at .Lcj8.
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	getf.sig	r20 = f42
	xma.hu		f40 = f32, f6, f0
	;;
	getf.sig	r17 = f39
	st8		[rp] = r18, 8
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	br		.Lcj8

C More than eight limbs: join the main loop at .LL00, where r18 is added
C to the zero in r21.
.grt8:
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	br		.LL00


C *** MAIN LOOP START ***
C
C One pass handles four limbs.  The feed-in code joins the pass at .LL00,
C .LL11, .LL10 or .LL01 depending on n mod 4.
C
C Register use:
C   f32-f35   source limbs, reloaded as soon as they have been multiplied
C   f36-f39   low product halves,  moved to r18, r19, r16, r17
C   f40-f43   high product halves, moved to r22, r23, r20, r21
C   r24       result limb: low half of this product + high half of the
C             previous product + carry
C
C Carry rule: when a carry came in (the add used the "+ 1" form) the sum
C wrapped iff r24 <= low half (cmp.leu); without a carry in it wrapped iff
C r24 < low half (cmp.ltu).  The carry state alternates between the
C predicate pairs p6/p7 and p8/p9, first predicate of a pair meaning
C "carry", second meaning "no carry".
	ALIGN(32)
.Loop:
	.pred.rel "mutex",p6,p7
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r17
	st8		[rp] = r24, 8
	xma.hu		f40 = f32, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r17
	;;
.LL00:
	.pred.rel "mutex",p8,p9
	getf.sig	r20 = f42
   (p8)	add		r24 = r18, r21, 1
	nop.b		0
	ldf8		f32 = [up], 8
   (p9)	add		r24 = r18, r21
	nop.b		0
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r18
	st8		[rp] = r24, 8
	xma.hu		f41 = f33, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r18
	;;
.LL11:
	.pred.rel "mutex",p6,p7
	getf.sig	r21 = f43
   (p6)	add		r24 = r19, r22, 1
	nop.b		0
	ldf8		f33 = [up], 8
   (p7)	add		r24 = r19, r22
	nop.b		0
	;;
	.pred.rel "mutex",p6,p7
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r19
	st8		[rp] = r24, 8
	xma.hu		f42 = f34, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r19
	;;
.LL10:
	.pred.rel "mutex",p8,p9
	getf.sig	r22 = f40
   (p8)	add		r24 = r16, r23, 1
	nop.b		0
	ldf8		f34 = [up], 8
   (p9)	add		r24 = r16, r23
	nop.b		0
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r16
	st8		[rp] = r24, 8
	xma.hu		f43 = f35, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r16
	;;
.LL01:
	.pred.rel "mutex",p6,p7
	getf.sig	r23 = f41
   (p6)	add		r24 = r17, r20, 1
	nop.b		0
	ldf8		f35 = [up], 8
   (p7)	add		r24 = r17, r20
	br.cloop.dptk	.Loop
C *** MAIN LOOP END ***
	;;

C Wind-down.
C
C Drains the pipeline without issuing further loads.  Execution falls into
C .Lcj9 when the main loop count runs out; the .LcjK labels are entry
C points for the short feed-in paths that bypass the main loop.  The stages
C use the same add / compare / store pattern and the same carry rule as the
C main loop (cmp.leu after an add with carry in, cmp.ltu otherwise), with
C the carry state alternating between p6/p7 and p8/p9.
.Lcj9:
	.pred.rel "mutex",p6,p7
	getf.sig	r16 = f38
	xma.l		f36 = f32, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r17
	st8		[rp] = r24, 8
	xma.hu		f40 = f32, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r17
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r20 = f42
   (p8)	add		r24 = r18, r21, 1
   (p9)	add		r24 = r18, r21
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r17 = f39
	xma.l		f37 = f33, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r18
	st8		[rp] = r24, 8
	xma.hu		f41 = f33, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r18
	;;
.Lcj8:
	.pred.rel "mutex",p6,p7
	getf.sig	r21 = f43
   (p6)	add		r24 = r19, r22, 1
   (p7)	add		r24 = r19, r22
	;;
	.pred.rel "mutex",p6,p7
	getf.sig	r18 = f36
	xma.l		f38 = f34, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r19
	st8		[rp] = r24, 8
	xma.hu		f42 = f34, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r19
	;;
.Lcj7:
	.pred.rel "mutex",p8,p9
	getf.sig	r22 = f40
   (p8)	add		r24 = r16, r23, 1
   (p9)	add		r24 = r16, r23
	;;
	.pred.rel "mutex",p8,p9
	getf.sig	r19 = f37
	xma.l		f39 = f35, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r16
	st8		[rp] = r24, 8
	xma.hu		f43 = f35, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r16
	;;
C From here on every product has been issued; only moves out of the
C floating-point registers, additions and stores remain.
.Lcj6:
	.pred.rel "mutex",p6,p7
	getf.sig	r23 = f41
   (p6)	add		r24 = r17, r20, 1
   (p7)	add		r24 = r17, r20
	;;
	.pred.rel "mutex",p6,p7
   (p6)	cmp.leu		p8, p9 = r24, r17
   (p7)	cmp.ltu		p8, p9 = r24, r17
	getf.sig	r16 = f38
	st8		[rp] = r24, 8
	;;
.Lcj5:
	.pred.rel "mutex",p8,p9
	getf.sig	r20 = f42
   (p8)	add		r24 = r18, r21, 1
   (p9)	add		r24 = r18, r21
	;;
	.pred.rel "mutex",p8,p9
   (p8)	cmp.leu		p6, p7 = r24, r18
   (p9)	cmp.ltu		p6, p7 = r24, r18
	getf.sig	r17 = f39
	st8		[rp] = r24, 8
	;;
C r8 receives the high half of the last product; the final carry is added
C to it just before the return.
.Lcj4:
	.pred.rel "mutex",p6,p7
	getf.sig	r8 = f43
   (p6)	add		r24 = r19, r22, 1
   (p7)	add		r24 = r19, r22
	;;
	.pred.rel "mutex",p6,p7
	st8		[rp] = r24, 8
   (p6)	cmp.leu		p8, p9 = r24, r19
   (p7)	cmp.ltu		p8, p9 = r24, r19
	;;
.Lcj3:
	.pred.rel "mutex",p8,p9
   (p8)	add		r24 = r16, r23, 1
   (p9)	add		r24 = r16, r23
	;;
	.pred.rel "mutex",p8,p9
	st8		[rp] = r24, 8
   (p8)	cmp.leu		p6, p7 = r24, r16
   (p9)	cmp.ltu		p6, p7 = r24, r16
	;;
.Lcj2:
	.pred.rel "mutex",p6,p7
   (p6)	add		r24 = r17, r20, 1
   (p7)	add		r24 = r17, r20
	;;
	.pred.rel "mutex",p6,p7
	st8		[rp] = r24, 8
   (p6)	cmp.leu		p8, p9 = r24, r17
   (p7)	cmp.ltu		p8, p9 = r24, r17
	;;
C Fold the last carry into the returned limb, restore ar.lc and return.
   (p8)	add		r8 = 1, r8
	mov.i		ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE()

C mpn_mul_1c (rp, up, n, vl, cy)
C
C Same operation as mpn_mul_1, except that the limb cy is added into the
C first product: it is placed in f9, the addend of the first xma pair.
C After the same register setup as the mpn_mul_1 entry code (r15 = n - 1,
C r2 = saved ar.lc, f7 = first source limb, r14 = n mod 4) control joins
C mpn_mul_1 at .Lcommon, so the return sequence is the one found there.
PROLOGUE(mpn_mul_1c)
	.prologue
	.save	ar.lc, r2
	.body

C With the 32-bit ABI, widen the two pointers (addp4) and zero-extend the
C count (zxt4) before using them.
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
{.mmi
	adds		r15 = -1, n		C M I
	setf.sig	f9 = cy			C M2 M3
	mov.i		r2 = ar.lc		C I0
}
{.mmb
	ldf8		f7 = [up], 8		C M
	and		r14 = 3, n		C M I
	br.sptk		.Lcommon
	;;
}
EPILOGUE()
ASM_END()