xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/mul_1.asm (revision 75f6d617e282811cb173c2ccfbf5df0dd71f7045)
1dnl  IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
2dnl  store the result in a second limb vector.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund.
5
6dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2006, 2007 Free Software
7dnl  Foundation, Inc.
8
9dnl  This file is part of the GNU MP Library.
10
11dnl  The GNU MP Library is free software; you can redistribute it and/or modify
12dnl  it under the terms of the GNU Lesser General Public License as published
13dnl  by the Free Software Foundation; either version 3 of the License, or (at
14dnl  your option) any later version.
15
16dnl  The GNU MP Library is distributed in the hope that it will be useful, but
17dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
19dnl  License for more details.
20
21dnl  You should have received a copy of the GNU Lesser General Public License
22dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
23
24include(`../config.m4')
25
26C         cycles/limb
27C Itanium:    4.0
28C Itanium 2:  2.0
29
30C TODO
31C  * Further optimize feed-in and wind-down code, both for speed and code size.
32C  * Handle low limb input and results specially, using a common stf8 in the
33C    epilogue.
34C  * Use 1 c/l carry propagation scheme in wind-down code.
35C  * Use extra pointer register for `up' to speed up feed-in loads.
36C  * Work out final differences with addmul_1.asm.
37
38C INPUT PARAMETERS
C Symbolic m4 names for the argument registers used throughout this file.
39define(`rp', `r32')	C address the result limbs are stored to
40define(`up', `r33')	C address the source limbs are loaded from
41define(`n', `r34')	C limb count; n-1 and n mod 4 are derived from it
42define(`vl', `r35')	C multiplier limb, moved into f6
43define(`cy', `r36')	C for mpn_mul_1c
44
45ASM_START()
C mpn_mul_1 entry.  Sets the carry-in addend f9 to zero, then falls into
C .Lcommon, which is also the branch target of the mpn_mul_1c entry point.
46PROLOGUE(mpn_mul_1)
47	.prologue
48	.save	ar.lc, r2
49	.body
50
C With the 32-bit ABI, widen both pointers with addp4 and zero-extend n.
51ifdef(`HAVE_ABI_32',
52`	addp4		rp = 0, rp		C M I
53	addp4		up = 0, up		C M I
54	zxt4		n = n			C I
55	;;
56')
C r15 = n-1, f9 = 0 (f0 is used as zero), r2 = caller value of ar.lc, which is
C restored before every return.
57{.mfi
58	adds		r15 = -1, n		C M I
59	mov		f9 = f0			C F
60	mov.i		r2 = ar.lc		C I0
61}
C f7 = first source limb (up is post-incremented by 8), r14 = n mod 4.
62{.mmi
63	ldf8		f7 = [up], 8		C M
64	nop.m		0			C M
65	and		r14 = 3, n		C M I
66	;;
67}
C Shared by both entry points.  Expects f7 = first limb, f9 = carry-in,
C r15 = n-1, r14 = n mod 4 and r2 = saved ar.lc.
68.Lcommon:
C f6 = multiplier, r31 = (n-1)/4, p10 set when n mod 4 is 0.
69{.mii
70	setf.sig	f6 = vl			C M2 M3
71	shr.u		r31 = r15, 2		C I0
72	cmp.eq		p10, p0 = 0, r14	C M I
73}
C p11 set when n mod 4 is 2, p12 set when n mod 4 is 3.
74{.mii
75	cmp.eq		p11, p0 = 2, r14	C M I
76	cmp.eq		p12, p0 = 3, r14	C M I
77	nop.i		0			C I
78	;;
79}
C Comparing r0 with itself for inequality clears p6 and p8 and sets p7 and p9,
C so both carry predicate pairs start in the no-carry state.
C ar.lc = (n-1)/4 drives every br.cloop below.
80{.mii
81	cmp.ne		p6, p7 = r0, r0		C M I
82	mov.i		ar.lc = r31		C I0
83	cmp.ne		p8, p9 = r0, r0		C M I
84}
C Dispatch on n mod 4; the remaining case (n mod 4 is 1) falls through to .Lb01.
85{.bbb
86  (p10)	br.dptk		.Lb00			C B
87  (p11)	br.dptk		.Lb10			C B
88  (p12)	br.dptk		.Lb11			C B
89	;;
90}
91
C Feed-in for n mod 4 equal to 1.  r20 = 0 acts as the (nonexistent) previous
C high limb that .LL01 adds to the first low limb.
92.Lb01:	mov		r20 = 0
93	br.cloop.dptk	.grt1			C B
94
C ar.lc was zero, so n is 1: one product with the carry-in as addend.  The low
C half is stored, the high half is left in r8, ar.lc is restored, and we return.
95	xma.l		f39 = f7, f6, f9	C F
96	xma.hu		f43 = f7, f6, f9	C F
97	;;
98	getf.sig	r8 = f43		C M2
99	stf8		[rp] = f39		C M2 M3
100	mov.i		ar.lc = r2		C I0
101	br.ret.sptk.many b0			C B
102
C n is at least 5.  Load the next four limbs into f32-f35 and form the first
C product (f7 times f6 plus carry-in f9) as low half f39 and high half f43.
103.grt1:
104	ldf8		f32 = [up], 8
105	;;
106	ldf8		f33 = [up], 8
107	;;
108	ldf8		f34 = [up], 8
109	xma.l		f39 = f7, f6, f9
110	xma.hu		f43 = f7, f6, f9
111	;;
112	ldf8		f35 = [up], 8
113	br.cloop.dptk	.grt5
114
C ar.lc reached zero here, so n is 5: no more loads.  The first low limb is
C stored straight from f39, the four remaining products are started, and the
C wind-down code is entered at .Lcj5.
115	xma.l		f36 = f32, f6, f0
116	xma.hu		f40 = f32, f6, f0
117	;;
118	stf8		[rp] = f39, 8
119	xma.l		f37 = f33, f6, f0
120	xma.hu		f41 = f33, f6, f0
121	;;
122	getf.sig	r21 = f43
123	getf.sig	r18 = f36
124	xma.l		f38 = f34, f6, f0
125	xma.hu		f42 = f34, f6, f0
126	;;
127	getf.sig	r22 = f40
128	getf.sig	r19 = f37
129	xma.l		f39 = f35, f6, f0
130	xma.hu		f43 = f35, f6, f0
131	;;
132	getf.sig	r23 = f41
133	getf.sig	r16 = f38
134	br		.Lcj5
135
C n is at least 9: keep loading while the products are formed, then join the
C main loop at .LL01.
136.grt5:
137	xma.l		f36 = f32, f6, f0
138	xma.hu		f40 = f32, f6, f0
139	;;
140	getf.sig	r17 = f39
141	ldf8		f32 = [up], 8
142	xma.l		f37 = f33, f6, f0
143	xma.hu		f41 = f33, f6, f0
144	;;
145	getf.sig	r21 = f43
146	ldf8		f33 = [up], 8
147	xma.l		f38 = f34, f6, f0
148	;;
149	getf.sig	r18 = f36
150	xma.hu		f42 = f34, f6, f0
151	;;
152	getf.sig	r22 = f40
153	ldf8		f34 = [up], 8
154	xma.l		f39 = f35, f6, f0
155	;;
156	getf.sig	r19 = f37
157	xma.hu		f43 = f35, f6, f0
158	br		.LL01
159
160
C Feed-in for n mod 4 equal to 2.  f35 = second limb; r23 = 0 acts as the
C previous high limb that .LL10 adds to the first low limb.
161.Lb10:	ldf8		f35 = [up], 8
162	mov		r23 = 0
163	br.cloop.dptk	.grt2
164
C ar.lc was zero, so n is 2.  The high half of the first product (f42) is used
C directly as the addend of the second product, so no integer carry chain is
C needed.  r8 receives the final high half.
165	xma.l		f38 = f7, f6, f9
166	xma.hu		f42 = f7, f6, f9
167	;;
168	stf8		[rp] = f38, 8
169	xma.l		f39 = f35, f6, f42
170	xma.hu		f43 = f35, f6, f42
171	;;
172	getf.sig	r8 = f43
173	stf8		[rp] = f39
174	mov.i		ar.lc = r2
175	br.ret.sptk.many b0
176
177
C n is at least 6.  Load four more limbs; the first product includes carry-in f9.
178.grt2:
179	ldf8		f32 = [up], 8
180	;;
181	ldf8		f33 = [up], 8
182	xma.l		f38 = f7, f6, f9
183	xma.hu		f42 = f7, f6, f9
184	;;
185	ldf8		f34 = [up], 8
186	xma.l		f39 = f35, f6, f0
187	xma.hu		f43 = f35, f6, f0
188	;;
189	ldf8		f35 = [up], 8
190	br.cloop.dptk	.grt6
191
C ar.lc reached zero here, so n is 6: store the first low limb from f38, start
C the remaining products, and enter the wind-down code at .Lcj6.
192	stf8		[rp] = f38, 8
193	xma.l		f36 = f32, f6, f0
194	xma.hu		f40 = f32, f6, f0
195	;;
196	getf.sig	r20 = f42
197	getf.sig	r17 = f39
198	xma.l		f37 = f33, f6, f0
199	xma.hu		f41 = f33, f6, f0
200	;;
201	getf.sig	r21 = f43
202	getf.sig	r18 = f36
203	xma.l		f38 = f34, f6, f0
204	xma.hu		f42 = f34, f6, f0
205	;;
206	getf.sig	r22 = f40
207	getf.sig	r19 = f37
208	xma.l		f39 = f35, f6, f0
209	xma.hu		f43 = f35, f6, f0
210	br		.Lcj6
211
C n is at least 10: continue loading and join the main loop at .LL10.
212.grt6:
213	getf.sig	r16 = f38
214	xma.l		f36 = f32, f6, f0
215	xma.hu		f40 = f32, f6, f0
216	;;
217	getf.sig	r20 = f42
218	ldf8		f32 = [up], 8
219	xma.l		f37 = f33, f6, f0
220	;;
221	getf.sig	r17 = f39
222	xma.hu		f41 = f33, f6, f0
223	;;
224	getf.sig	r21 = f43
225	ldf8		f33 = [up], 8
226	xma.l		f38 = f34, f6, f0
227	;;
228	getf.sig	r18 = f36
229	xma.hu		f42 = f34, f6, f0
230	br		.LL10
231
232
C Feed-in for n mod 4 equal to 3.  f34 and f35 = second and third limbs;
C r22 = 0 acts as the previous high limb that .LL11 adds to the first low limb.
233.Lb11:	ldf8		f34 = [up], 8
234	mov		r22 = 0
235	;;
236	ldf8		f35 = [up], 8
237	br.cloop.dptk	.grt3
238	;;
239
C ar.lc was zero, so n is 3: form all three products, store the first low limb
C from f37, put the last high half in r8, and finish through .Lcj3.
240	xma.l		f37 = f7, f6, f9
241	xma.hu		f41 = f7, f6, f9
242	xma.l		f38 = f34, f6, f0
243	xma.hu		f42 = f34, f6, f0
244	xma.l		f39 = f35, f6, f0
245	xma.hu		f43 = f35, f6, f0
246	;;
247	getf.sig	r23 = f41
248	stf8		[rp] = f37, 8
249	getf.sig	r16 = f38
250	getf.sig	r20 = f42
251	getf.sig	r17 = f39
252	getf.sig	r8 = f43
253	br		.Lcj3
254
C n is at least 7.  Load four more limbs while the first three products are
C formed; the first product includes carry-in f9.
255.grt3:
256	ldf8		f32 = [up], 8
257	xma.l		f37 = f7, f6, f9
258	xma.hu		f41 = f7, f6, f9
259	;;
260	ldf8		f33 = [up], 8
261	xma.l		f38 = f34, f6, f0
262	xma.hu		f42 = f34, f6, f0
263	;;
264	getf.sig	r19 = f37
265	ldf8		f34 = [up], 8
266	xma.l		f39 = f35, f6, f0
267	xma.hu		f43 = f35, f6, f0
268	;;
269	getf.sig	r23 = f41
270	ldf8		f35 = [up], 8
271	br.cloop.dptk	.grt7
272
C ar.lc reached zero here, so n is 7: the first low limb (r19) is stored with
C st8 and the wind-down code is entered at .Lcj7.
273	getf.sig	r16 = f38
274	xma.l		f36 = f32, f6, f0
275	getf.sig	r20 = f42
276	xma.hu		f40 = f32, f6, f0
277	;;
278	getf.sig	r17 = f39
279	xma.l		f37 = f33, f6, f0
280	getf.sig	r21 = f43
281	xma.hu		f41 = f33, f6, f0
282	;;
283	getf.sig	r18 = f36
284	st8		[rp] = r19, 8
285	xma.l		f38 = f34, f6, f0
286	xma.hu		f42 = f34, f6, f0
287	br		.Lcj7
288
C n is at least 11: continue loading and join the main loop at .LL11.
289.grt7:
290	getf.sig	r16 = f38
291	xma.l		f36 = f32, f6, f0
292	xma.hu		f40 = f32, f6, f0
293	;;
294	getf.sig	r20 = f42
295	ldf8		f32 = [up], 8
296	xma.l		f37 = f33, f6, f0
297	;;
298	getf.sig	r17 = f39
299	xma.hu		f41 = f33, f6, f0
300	br		.LL11
301
302
C Feed-in for n mod 4 equal to 0.  f33-f35 = second to fourth limbs; r21 = 0
C acts as the previous high limb that .LL00 adds to the first low limb.
303.Lb00:	ldf8		f33 = [up], 8
304	mov		r21 = 0
305	;;
306	ldf8		f34 = [up], 8
307	;;
308	ldf8		f35 = [up], 8
309	xma.l		f36 = f7, f6, f9
310	xma.hu		f40 = f7, f6, f9
311	br.cloop.dptk	.grt4
312
C ar.lc was zero, so n is 4: form the other three products, store the first
C low limb from f36, and enter the wind-down code at .Lcj4.
313	xma.l		f37 = f33, f6, f0
314	xma.hu		f41 = f33, f6, f0
315	xma.l		f38 = f34, f6, f0
316	xma.hu		f42 = f34, f6, f0
317	;;
318	getf.sig	r22 = f40
319	stf8		[rp] = f36, 8
320	xma.l		f39 = f35, f6, f0
321	getf.sig	r19 = f37
322	xma.hu		f43 = f35, f6, f0
323	;;
324	getf.sig	r23 = f41
325	getf.sig	r16 = f38
326	getf.sig	r20 = f42
327	getf.sig	r17 = f39
328	br		.Lcj4
329
C n is at least 8: load four more limbs while forming the pending products.
330.grt4:
331	ldf8		f32 = [up], 8
332	xma.l		f37 = f33, f6, f0
333	xma.hu		f41 = f33, f6, f0
334	;;
335	getf.sig	r18 = f36
336	ldf8		f33 = [up], 8
337	xma.l		f38 = f34, f6, f0
338	xma.hu		f42 = f34, f6, f0
339	;;
340	getf.sig	r22 = f40
341	ldf8		f34 = [up], 8
342	xma.l		f39 = f35, f6, f0
343	;;
344	getf.sig	r19 = f37
345	getf.sig	r23 = f41
346	xma.hu		f43 = f35, f6, f0
347	ldf8		f35 = [up], 8
348	br.cloop.dptk	.grt8
349
C ar.lc reached zero here, so n is 8: the first low limb (r18) is stored with
C st8 and the wind-down code is entered at .Lcj8.
350	getf.sig	r16 = f38
351	xma.l		f36 = f32, f6, f0
352	getf.sig	r20 = f42
353	xma.hu		f40 = f32, f6, f0
354	;;
355	getf.sig	r17 = f39
356	st8		[rp] = r18, 8
357	xma.l		f37 = f33, f6, f0
358	xma.hu		f41 = f33, f6, f0
359	br		.Lcj8
360
C n is at least 12: join the main loop at .LL00.
361.grt8:
362	getf.sig	r16 = f38
363	xma.l		f36 = f32, f6, f0
364	xma.hu		f40 = f32, f6, f0
365	br		.LL00
366
367
368C *** MAIN LOOP START ***
C One pass handles four limbs; ar.lc counts the passes.  Register roles:
C   f32-f35  source limbs loaded from up
C   f36-f39  low product halves,  copied to r18, r19, r16, r17 respectively
C   f40-f43  high product halves, copied to r22, r23, r20, r21 respectively
C   r24      result limb = low half + previous high half (+1 on carry-in)
C   p6/p7 and p8/p9 alternate as carry / no-carry flags for successive limbs
C Carry-out test: with a carry-in the sum wrapped iff sum <= low half
C (cmp.leu); without a carry-in it wrapped iff sum < low half (cmp.ltu).
C Each st8 writes the r24 computed in the preceding instruction group.
C The .pred.rel "mutex" directives tell the assembler that the two predicates
C are never both set, so the paired predicated writes are not a conflict.
C The feed-in code enters at .LL00, .LL11, .LL10 or .LL01 depending on n mod 4.
369	ALIGN(32)
370.Loop:
371	.pred.rel "mutex",p6,p7
372	getf.sig	r16 = f38
373	xma.l		f36 = f32, f6, f0
374   (p6)	cmp.leu		p8, p9 = r24, r17
375	st8		[rp] = r24, 8
376	xma.hu		f40 = f32, f6, f0
377   (p7)	cmp.ltu		p8, p9 = r24, r17
378	;;
379.LL00:
380	.pred.rel "mutex",p8,p9
381	getf.sig	r20 = f42
382   (p8)	add		r24 = r18, r21, 1
383	nop.b		0
384	ldf8		f32 = [up], 8
385   (p9)	add		r24 = r18, r21
386	nop.b		0
387	;;
388	.pred.rel "mutex",p8,p9
389	getf.sig	r17 = f39
390	xma.l		f37 = f33, f6, f0
391   (p8)	cmp.leu		p6, p7 = r24, r18
392	st8		[rp] = r24, 8
393	xma.hu		f41 = f33, f6, f0
394   (p9)	cmp.ltu		p6, p7 = r24, r18
395	;;
396.LL11:
397	.pred.rel "mutex",p6,p7
398	getf.sig	r21 = f43
399   (p6)	add		r24 = r19, r22, 1
400	nop.b		0
401	ldf8		f33 = [up], 8
402   (p7)	add		r24 = r19, r22
403	nop.b		0
404	;;
405	.pred.rel "mutex",p6,p7
406	getf.sig	r18 = f36
407	xma.l		f38 = f34, f6, f0
408   (p6)	cmp.leu		p8, p9 = r24, r19
409	st8		[rp] = r24, 8
410	xma.hu		f42 = f34, f6, f0
411   (p7)	cmp.ltu		p8, p9 = r24, r19
412	;;
413.LL10:
414	.pred.rel "mutex",p8,p9
415	getf.sig	r22 = f40
416   (p8)	add		r24 = r16, r23, 1
417	nop.b		0
418	ldf8		f34 = [up], 8
419   (p9)	add		r24 = r16, r23
420	nop.b		0
421	;;
422	.pred.rel "mutex",p8,p9
423	getf.sig	r19 = f37
424	xma.l		f39 = f35, f6, f0
425   (p8)	cmp.leu		p6, p7 = r24, r16
426	st8		[rp] = r24, 8
427	xma.hu		f43 = f35, f6, f0
428   (p9)	cmp.ltu		p6, p7 = r24, r16
429	;;
430.LL01:
431	.pred.rel "mutex",p6,p7
432	getf.sig	r23 = f41
433   (p6)	add		r24 = r17, r20, 1
434	nop.b		0
435	ldf8		f35 = [up], 8
436   (p7)	add		r24 = r17, r20
C Loops while ar.lc is nonzero; otherwise falls into the wind-down code.
437	br.cloop.dptk	.Loop
438C *** MAIN LOOP END ***
439	;;
440
C Wind-down.  Same add / compare / store chain as the main loop but without
C any further loads from up; products are only formed for limbs already held
C in f32-f35.  The main loop falls into .Lcj9, and the short-operand feed-in
C paths enter at .Lcj8 down to .Lcj3.  No branch to .Lcj2 is visible in this
C file; it is reached by falling through from .Lcj3.
441.Lcj9:
442	.pred.rel "mutex",p6,p7
443	getf.sig	r16 = f38
444	xma.l		f36 = f32, f6, f0
445   (p6)	cmp.leu		p8, p9 = r24, r17
446	st8		[rp] = r24, 8
447	xma.hu		f40 = f32, f6, f0
448   (p7)	cmp.ltu		p8, p9 = r24, r17
449	;;
450	.pred.rel "mutex",p8,p9
451	getf.sig	r20 = f42
452   (p8)	add		r24 = r18, r21, 1
453   (p9)	add		r24 = r18, r21
454	;;
455	.pred.rel "mutex",p8,p9
456	getf.sig	r17 = f39
457	xma.l		f37 = f33, f6, f0
458   (p8)	cmp.leu		p6, p7 = r24, r18
459	st8		[rp] = r24, 8
460	xma.hu		f41 = f33, f6, f0
461   (p9)	cmp.ltu		p6, p7 = r24, r18
462	;;
463.Lcj8:
464	.pred.rel "mutex",p6,p7
465	getf.sig	r21 = f43
466   (p6)	add		r24 = r19, r22, 1
467   (p7)	add		r24 = r19, r22
468	;;
469	.pred.rel "mutex",p6,p7
470	getf.sig	r18 = f36
471	xma.l		f38 = f34, f6, f0
472   (p6)	cmp.leu		p8, p9 = r24, r19
473	st8		[rp] = r24, 8
474	xma.hu		f42 = f34, f6, f0
475   (p7)	cmp.ltu		p8, p9 = r24, r19
476	;;
477.Lcj7:
478	.pred.rel "mutex",p8,p9
479	getf.sig	r22 = f40
480   (p8)	add		r24 = r16, r23, 1
481   (p9)	add		r24 = r16, r23
482	;;
483	.pred.rel "mutex",p8,p9
484	getf.sig	r19 = f37
485	xma.l		f39 = f35, f6, f0
486   (p8)	cmp.leu		p6, p7 = r24, r16
487	st8		[rp] = r24, 8
488	xma.hu		f43 = f35, f6, f0
489   (p9)	cmp.ltu		p6, p7 = r24, r16
490	;;
C From here on no new products are started; only pending halves are drained.
491.Lcj6:
492	.pred.rel "mutex",p6,p7
493	getf.sig	r23 = f41
494   (p6)	add		r24 = r17, r20, 1
495   (p7)	add		r24 = r17, r20
496	;;
497	.pred.rel "mutex",p6,p7
498   (p6)	cmp.leu		p8, p9 = r24, r17
499   (p7)	cmp.ltu		p8, p9 = r24, r17
500	getf.sig	r16 = f38
501	st8		[rp] = r24, 8
502	;;
503.Lcj5:
504	.pred.rel "mutex",p8,p9
505	getf.sig	r20 = f42
506   (p8)	add		r24 = r18, r21, 1
507   (p9)	add		r24 = r18, r21
508	;;
509	.pred.rel "mutex",p8,p9
510   (p8)	cmp.leu		p6, p7 = r24, r18
511   (p9)	cmp.ltu		p6, p7 = r24, r18
512	getf.sig	r17 = f39
513	st8		[rp] = r24, 8
514	;;
C r8 = high half of the last product (f43); the final carry is added below.
515.Lcj4:
516	.pred.rel "mutex",p6,p7
517	getf.sig	r8 = f43
518   (p6)	add		r24 = r19, r22, 1
519   (p7)	add		r24 = r19, r22
520	;;
521	.pred.rel "mutex",p6,p7
522	st8		[rp] = r24, 8
523   (p6)	cmp.leu		p8, p9 = r24, r19
524   (p7)	cmp.ltu		p8, p9 = r24, r19
525	;;
526.Lcj3:
527	.pred.rel "mutex",p8,p9
528   (p8)	add		r24 = r16, r23, 1
529   (p9)	add		r24 = r16, r23
530	;;
531	.pred.rel "mutex",p8,p9
532	st8		[rp] = r24, 8
533   (p8)	cmp.leu		p6, p7 = r24, r16
534   (p9)	cmp.ltu		p6, p7 = r24, r16
535	;;
536.Lcj2:
537	.pred.rel "mutex",p6,p7
538   (p6)	add		r24 = r17, r20, 1
539   (p7)	add		r24 = r17, r20
540	;;
541	.pred.rel "mutex",p6,p7
542	st8		[rp] = r24, 8
543   (p6)	cmp.leu		p8, p9 = r24, r17
544   (p7)	cmp.ltu		p8, p9 = r24, r17
545	;;
C Fold the carry out of the last stored limb into r8, restore the caller
C value of ar.lc from r2, and return with the top limb in r8.
546   (p8)	add		r8 = 1, r8
547	mov.i		ar.lc = r2
548	br.ret.sptk.many b0
549EPILOGUE()
550
C mpn_mul_1c entry.  Same setup as mpn_mul_1 except that the carry-in addend
C f9 is loaded from the cy argument; all remaining work is done by branching
C to .Lcommon inside mpn_mul_1.
551PROLOGUE(mpn_mul_1c)
552	.prologue
553	.save	ar.lc, r2
554	.body
555
C With the 32-bit ABI, widen both pointers with addp4 and zero-extend n.
556ifdef(`HAVE_ABI_32',
557`	addp4		rp = 0, rp		C M I
558	addp4		up = 0, up		C M I
559	zxt4		n = n			C I
560	;;
561')
C r15 = n-1, f9 = carry-in limb, r2 = caller value of ar.lc.
562{.mmi
563	adds		r15 = -1, n		C M I
564	setf.sig	f9 = cy			C M2 M3
565	mov.i		r2 = ar.lc		C I0
566}
C f7 = first source limb (up is post-incremented by 8), r14 = n mod 4, then
C continue in the shared code.
567{.mmb
568	ldf8		f7 = [up], 8		C M
569	and		r14 = 3, n		C M I
570	br.sptk		.Lcommon
571	;;
572}
573EPILOGUE()
574ASM_END()
575