xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/bdiv_dbm1c.asm (revision 63aea4bd5b445e491ff0389fe27ec78b3099dba3)
1dnl  IA-64 mpn_bdiv_dbm1.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2008, 2009 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C         cycles/limb
25C Itanium:    4
26C Itanium 2:  2
27
28C TODO
29C  * Optimize feed-in and wind-down code, both for speed and code size.
30
31C INPUT PARAMETERS
C   rp (r32)  address the result limbs are stored to, advanced by 8 per store
C   up (r33)  address the source limbs are loaded from, advanced by 8 per load
C   n  (r34)  limb count; n mod 4 picks the feed-in path, (n - 1) / 4 goes
C             into ar.lc
C   bd (r35)  multiplier applied to every source limb
C A fifth input, r36, has no symbolic name; the entry code copies it to r15,
C where it is the initial value of the running word.
C Most of the code below refers to r32 and r33 directly instead of rp and up.
32define(`rp', `r32')
33define(`up', `r33')
34define(`n', `r34')
35define(`bd', `r35')
36
37ASM_START()
C mpn_bdiv_dbm1c
C
C Inputs: r32 = destination pointer, r33 = source pointer, r34 = limb count,
C r35 = multiplier, r36 = initial value of the running word h (kept in r15).
C
C For each source limb u, at increasing addresses, the code forms the two
C words of u * multiplier (xma.l gives the low word lo, xma.hu the high word
C hi) and then performs
C
C	borrow = (h < lo)	unsigned compare, cmp.ltu into p6/p7
C	t = h - lo		t is stored as the next destination limb
C	h = t - hi - borrow
C
C The final h is left in r8 at the return.
C
C Register roles:
C   f6		multiplier, moved to the FP side with setf.sig
C   f9-f13	source limbs loaded with ldf8
C   f32-f39	product words; even registers hold lo, odd registers hold hi
C   r20-r27	the same product words after getf.sig (r20 = f32 ... r27 = f39)
C   r16-r19	result limbs t waiting to be stored
C   r15		running word h
C   r14, r31	n mod 4 and (n - 1) / 4, used only for dispatch and ar.lc
C   r2		caller value of ar.lc, restored before returning
C   p6, p7	borrow / no borrow from the most recent cmp.ltu
C   p10-p12	feed-in selection on n mod 4
C
C Layout: four feed-in paths (.Lb01, .Lb10, .Lb11, .Lb00) chosen by n mod 4,
C a main loop handling four limbs per iteration with one entry label per
C feed-in path, and a wind-down chain .Lcj8 ... .Lcj1 that short operands
C enter directly.
38PROLOGUE(mpn_bdiv_dbm1c)
39	.prologue
C Unwind annotation: ar.lc is preserved in r2 (copied at .Lcommon, restored
C just before the return).
40	.save		ar.lc, r2
41	.body
42
C Only when HAVE_ABI_32 is defined: form full-width pointers with addp4 and
C zero-extend the count with zxt4.
43ifdef(`HAVE_ABI_32',
44`	addp4		rp = 0, rp		C M I
45	addp4		up = 0, up		C M I
46	zxt4		n = n			C I
47	;;
48')
C r15 = initial running word h (from r36); f9 = first source limb.
49{.mmb
50	mov		r15 = r36		C M I
51	ldf8		f9 = [up], 8		C M
52	nop.b		0			C B
53}
C .Lcommon is not referenced by any branch in this function.
C r16 = n - 1, r2 = caller ar.lc, r14 = n mod 4.
54.Lcommon:
55{.mii
56	adds		r16 = -1, n		C M I
57	mov		r2 = ar.lc		C I0
58	and		r14 = 3, n		C M I
59	;;
60}
C f6 = multiplier, r31 = (n - 1) / 4 (unsigned shift), p10 = (n mod 4 == 0).
C NOTE(review): n = 0 is not special-cased here; presumably callers always
C pass n >= 1 -- confirm against the callers.
61{.mii
62	setf.sig	f6 = bd			C M2 M3
63	shr.u		r31 = r16, 2		C I0
64	cmp.eq		p10, p0 = 0, r14	C M I
65}
C p11 = (n mod 4 == 2), p12 = (n mod 4 == 3).
66{.mii
67	nop.m		0			C M
68	cmp.eq		p11, p0 = 2, r14	C M I
69	cmp.eq		p12, p0 = 3, r14	C M I
70	;;
71}
C r0 != r0 is never true, so these compares clear p6 and p8 and set p7 and
C p9.  p8 and p9 are not referenced anywhere else in this function.
C ar.lc = (n - 1) / 4, consumed by the br.cloop instructions below.
72{.mii
73	cmp.ne		p6, p7 = r0, r0		C M I
74	mov.i		ar.lc = r31		C I0
75	cmp.ne		p8, p9 = r0, r0		C M I
76}
C Dispatch on n mod 4; the remaining case n mod 4 == 1 falls through to .Lb01.
77{.bbb
78  (p10)	br.dptk		.Lb00			C B
79  (p11)	br.dptk		.Lb10			C B
80  (p12)	br.dptk		.Lb11			C B
81	;;
82}
83
C ---- Feed-in for n mod 4 == 1 ----
C br.cloop branches and decrements ar.lc while ar.lc is nonzero, so the
C fall-through path is n = 1: one product, then straight to .Lcj1.
84.Lb01:	br.cloop.dptk	.grt1
85	;;
86	xma.l		f38 = f9, f6, f0
87	xma.hu		f39 = f9, f6, f0
88	;;
89	getf.sig	r26 = f38
90	getf.sig	r27 = f39
91	br		.Lcj1
92
C n >= 5: load four more limbs (f10-f13) and start the products of the first
C two.  When ar.lc has reached 0 (n = 5) the remaining products are started
C here and the code continues at .Lcj5.
93.grt1:	ldf8		f10 = [r33], 8
94	;;
95	ldf8		f11 = [r33], 8
96	;;
97	ldf8		f12 = [r33], 8
98	;;
99	xma.l		f38 = f9, f6, f0
100	xma.hu		f39 = f9, f6, f0
101	;;
102	ldf8		f13 = [r33], 8
103	;;
104	xma.l		f32 = f10, f6, f0
105	xma.hu		f33 = f10, f6, f0
106	br.cloop.dptk	.grt5
107
108	;;
109	getf.sig	r26 = f38
110	xma.l		f34 = f11, f6, f0
111	xma.hu		f35 = f11, f6, f0
112	;;
113	getf.sig	r27 = f39
114	;;
115	getf.sig	r20 = f32
116	xma.l		f36 = f12, f6, f0
117	xma.hu		f37 = f12, f6, f0
118	;;
119	getf.sig	r21 = f33
120	;;
121	getf.sig	r22 = f34
122	xma.l		f38 = f13, f6, f0
123	xma.hu		f39 = f13, f6, f0
124	br		.Lcj5
125
C n >= 9: same product sequence as above, interleaved with loads of the next
C three limbs, then join the main loop at .LL01.
126.grt5:	ldf8		f10 = [r33], 8
127	;;
128	getf.sig	r26 = f38
129	xma.l		f34 = f11, f6, f0
130	xma.hu		f35 = f11, f6, f0
131	;;
132	getf.sig	r27 = f39
133	ldf8		f11 = [r33], 8
134	;;
135	getf.sig	r20 = f32
136	xma.l		f36 = f12, f6, f0
137	xma.hu		f37 = f12, f6, f0
138	;;
139	getf.sig	r21 = f33
140	ldf8		f12 = [r33], 8
141	;;
142	getf.sig	r22 = f34
143	xma.l		f38 = f13, f6, f0
144	xma.hu		f39 = f13, f6, f0
145	br		.LL01
146
C ---- Feed-in for n mod 4 == 2 ----
C Load the second limb into f13.  The fall-through path is n = 2: two
C products, then .Lcj2.
147.Lb10:	ldf8		f13 = [r33], 8
148	br.cloop.dptk	.grt2
149	;;
150
151	xma.l		f36 = f9, f6, f0
152	xma.hu		f37 = f9, f6, f0
153	;;
154	xma.l		f38 = f13, f6, f0
155	xma.hu		f39 = f13, f6, f0
156	;;
157	getf.sig	r24 = f36
158	;;
159	getf.sig	r25 = f37
160	;;
161	getf.sig	r26 = f38
162	;;
163	getf.sig	r27 = f39
164	br		.Lcj2
165
C n >= 6: load four more limbs while starting products.  When ar.lc has
C reached 0 (n = 6) the code continues at .Lcj6.
166.grt2:	ldf8		f10 = [r33], 8
167	;;
168	ldf8		f11 = [r33], 8
169	;;
170	xma.l		f36 = f9, f6, f0
171	xma.hu		f37 = f9, f6, f0
172	;;
173	ldf8		f12 = [r33], 8
174	;;
175	xma.l		f38 = f13, f6, f0
176	xma.hu		f39 = f13, f6, f0
177	;;
178	ldf8		f13 = [r33], 8
179	;;
180	getf.sig	r24 = f36
181	xma.l		f32 = f10, f6, f0
182	xma.hu		f33 = f10, f6, f0
183	br.cloop.dptk	.grt6
184
185	getf.sig	r25 = f37
186	;;
187	getf.sig	r26 = f38
188	xma.l		f34 = f11, f6, f0
189	xma.hu		f35 = f11, f6, f0
190	;;
191	getf.sig	r27 = f39
192	;;
193	getf.sig	r20 = f32
194	xma.l		f36 = f12, f6, f0
195	xma.hu		f37 = f12, f6, f0
196	br		.Lcj6
197
C n >= 10: same product sequence with two more loads, then join the main
C loop at .LL10.
198.grt6:	getf.sig	r25 = f37
199	ldf8		f10 = [r33], 8
200	;;
201	getf.sig	r26 = f38
202	xma.l		f34 = f11, f6, f0
203	xma.hu		f35 = f11, f6, f0
204	;;
205	getf.sig	r27 = f39
206	ldf8		f11 = [r33], 8
207	;;
208	getf.sig	r20 = f32
209	xma.l		f36 = f12, f6, f0
210	xma.hu		f37 = f12, f6, f0
211	br		.LL10
212
213
C ---- Feed-in for n mod 4 == 3 ----
C Load the second and third limbs into f12 and f13.  The fall-through path
C is n = 3: three products, then .Lcj3.
214.Lb11:	ldf8		f12 = [r33], 8
215	;;
216	ldf8		f13 = [r33], 8
217	br.cloop.dptk	.grt3
218	;;
219
220	xma.l		f34 = f9, f6, f0
221	xma.hu		f35 = f9, f6, f0
222	;;
223	xma.l		f36 = f12, f6, f0
224	xma.hu		f37 = f12, f6, f0
225	;;
226	getf.sig	r22 = f34
227	xma.l		f38 = f13, f6, f0
228	xma.hu		f39 = f13, f6, f0
229	;;
230	getf.sig	r23 = f35
231	;;
232	getf.sig	r24 = f36
233	;;
234	getf.sig	r25 = f37
235	;;
236	getf.sig	r26 = f38
237	br		.Lcj3
238
C n >= 7: load four more limbs while starting products.  When ar.lc has
C reached 0 (n = 7) the code continues at .Lcj7.
239.grt3:	ldf8		f10 = [r33], 8
240	;;
241	xma.l		f34 = f9, f6, f0
242	xma.hu		f35 = f9, f6, f0
243	;;
244	ldf8		f11 = [r33], 8
245	;;
246	xma.l		f36 = f12, f6, f0
247	xma.hu		f37 = f12, f6, f0
248	;;
249	ldf8		f12 = [r33], 8
250	;;
251	getf.sig	r22 = f34
252	xma.l		f38 = f13, f6, f0
253	xma.hu		f39 = f13, f6, f0
254	;;
255	getf.sig	r23 = f35
256	ldf8		f13 = [r33], 8
257	;;
258	getf.sig	r24 = f36
259	xma.l		f32 = f10, f6, f0
260	xma.hu		f33 = f10, f6, f0
261	br.cloop.dptk	.grt7
262
263	getf.sig	r25 = f37
264	;;
265	getf.sig	r26 = f38
266	xma.l		f34 = f11, f6, f0
267	xma.hu		f35 = f11, f6, f0
268	br		.Lcj7
269
C n >= 11: one more load, then join the main loop at .LL11.
270.grt7:	getf.sig	r25 = f37
271	ldf8		f10 = [r33], 8
272	;;
273	getf.sig	r26 = f38
274	xma.l		f34 = f11, f6, f0
275	xma.hu		f35 = f11, f6, f0
276	br		.LL11
277
278
C ---- Feed-in for n mod 4 == 0 ----
C Load limbs two to four into f11-f13.  The fall-through path is n = 4:
C four products, then .Lcj4.
279.Lb00:	ldf8		f11 = [r33], 8
280	;;
281	ldf8		f12 = [r33], 8
282	;;
283	ldf8		f13 = [r33], 8
284	br.cloop.dptk	.grt4
285	;;
286
287	xma.l		f32 = f9, f6, f0
288	xma.hu		f33 = f9, f6, f0
289	;;
290	xma.l		f34 = f11, f6, f0
291	xma.hu		f35 = f11, f6, f0
292	;;
293	getf.sig	r20 = f32
294	xma.l		f36 = f12, f6, f0
295	xma.hu		f37 = f12, f6, f0
296	;;
297	getf.sig	r21 = f33
298	;;
299	getf.sig	r22 = f34
300	xma.l		f38 = f13, f6, f0
301	xma.hu		f39 = f13, f6, f0
302	;;
303	getf.sig	r23 = f35
304	;;
305	getf.sig	r24 = f36
306	br		.Lcj4
307
C n >= 8: load four more limbs while starting products.  The br.cloop at the
C end enters the main loop at .LL00 when iterations remain (n >= 12);
C otherwise (n = 8) the plain branch after it goes to .Lcj8.
308.grt4:	xma.l		f32 = f9, f6, f0
309	xma.hu		f33 = f9, f6, f0
310	;;
311	ldf8		f10 = [r33], 8
312	;;
313	xma.l		f34 = f11, f6, f0
314	xma.hu		f35 = f11, f6, f0
315	;;
316	ldf8		f11 = [r33], 8
317	;;
318	getf.sig	r20 = f32
319	xma.l		f36 = f12, f6, f0
320	xma.hu		f37 = f12, f6, f0
321	;;
322	getf.sig	r21 = f33
323	ldf8		f12 = [r33], 8
324	;;
325	getf.sig	r22 = f34
326	xma.l		f38 = f13, f6, f0
327	xma.hu		f39 = f13, f6, f0
328	;;
329	getf.sig	r23 = f35
330	ldf8		f13 = [r33], 8
331	;;
332	getf.sig	r24 = f36
333	xma.l		f32 = f10, f6, f0
334	xma.hu		f33 = f10, f6, f0
335	br.cloop.dptk	.LL00
336	br		.Lcj8
337
338C *** MAIN LOOP START ***
C Four limbs per iteration; ar.lc counts the remaining iterations.  Each
C quarter of the loop is two instruction groups:
C   group 1: store the previous result limb, start xma.l/xma.hu for a limb
C            loaded earlier, and finish h = t - hi - borrow with the paired
C            (p6)/(p7) subtracts (the p6 form subtracts one more);
C   group 2: cmp.ltu h against the next low word to set p6/p7, form the next
C            result limb t = h - lo, and load a new source limb.
C Every group also moves one product word from f32-f39 to r20-r27, so product
C words are consumed several groups after their xma was issued.
C .LL00, .LL11, .LL10 and .LL01 are the entry points used by the feed-in
C code; each sits at a group 2, so p6/p7 are always set by a cmp.ltu before
C the predicated subtracts use them.
339	ALIGN(32)
340.Ltop:
C Tells the assembler that p6 and p7 are never both true; the (p6) and (p7)
C subtracts write r15 within the same instruction group.
341	.pred.rel "mutex",p6,p7
342C	.mfi
343	getf.sig	r24 = f36
344	xma.l		f32 = f10, f6, f0
345  (p6)	sub		r15 = r19, r27, 1
346C	.mfi
347	st8		[r32] = r19, 8
348	xma.hu		f33 = f10, f6, f0
349  (p7)	sub		r15 = r19, r27
350	;;
351.LL00:
352C	.mfi
353	getf.sig	r25 = f37
354	nop.f 0
355	cmp.ltu		p6, p7 = r15, r20
356C	.mib
357	ldf8		f10 = [r33], 8
358	sub		r16 = r15, r20
359	nop.b 0
360	;;
361
362C	.mfi
363	getf.sig	r26 = f38
364	xma.l		f34 = f11, f6, f0
365  (p6)	sub		r15 = r16, r21, 1
366C	.mfi
367	st8		[r32] = r16, 8
368	xma.hu		f35 = f11, f6, f0
369  (p7)	sub		r15 = r16, r21
370	;;
371.LL11:
372C	.mfi
373	getf.sig	r27 = f39
374	nop.f 0
375	cmp.ltu		p6, p7 = r15, r22
376C	.mib
377	ldf8		f11 = [r33], 8
378	sub		r17 = r15, r22
379	nop.b 0
380	;;
381
382C	.mfi
383	getf.sig	r20 = f32
384	xma.l		f36 = f12, f6, f0
385  (p6)	sub		r15 = r17, r23, 1
386C	.mfi
387	st8		[r32] = r17, 8
388	xma.hu		f37 = f12, f6, f0
389  (p7)	sub		r15 = r17, r23
390	;;
391.LL10:
392C	.mfi
393	getf.sig	r21 = f33
394	nop.f 0
395	cmp.ltu		p6, p7 = r15, r24
396C	.mib
397	ldf8		f12 = [r33], 8
398	sub		r18 = r15, r24
399	nop.b 0
400	;;
401
402C	.mfi
403	getf.sig	r22 = f34
404	xma.l		f38 = f13, f6, f0
405  (p6)	sub		r15 = r18, r25, 1
406C	.mfi
407	st8		[r32] = r18, 8
408	xma.hu		f39 = f13, f6, f0
409  (p7)	sub		r15 = r18, r25
410	;;
411.LL01:
412C	.mfi
413	getf.sig	r23 = f35
414	nop.f 0
415	cmp.ltu		p6, p7 = r15, r26
416C	.mib
417	ldf8		f13 = [r33], 8
418	sub		r19 = r15, r26
419	br.cloop.sptk.few .Ltop
420C *** MAIN LOOP END ***
421	;;
422
C ---- Wind-down ----
C Same per-limb steps as the loop but without further loads.  The xma pairs
C stop after .Lcj6 and the getf.sig moves stop after .Lcj3, as the pipeline
C drains.  On entry to .LcjK exactly K limbs remain to be subtracted and
C stored; the feed-in code branches here directly for n <= 8.
423	getf.sig	r24 = f36
424	xma.l		f32 = f10, f6, f0
425  (p6)	sub		r15 = r19, r27, 1
426	st8		[r32] = r19, 8
427	xma.hu		f33 = f10, f6, f0
428  (p7)	sub		r15 = r19, r27
429	;;
430.Lcj8:	getf.sig	r25 = f37
431	cmp.ltu		p6, p7 = r15, r20
432	sub		r16 = r15, r20
433	;;
434	getf.sig	r26 = f38
435	xma.l		f34 = f11, f6, f0
436  (p6)	sub		r15 = r16, r21, 1
437	st8		[r32] = r16, 8
438	xma.hu		f35 = f11, f6, f0
439  (p7)	sub		r15 = r16, r21
440	;;
441.Lcj7:	getf.sig	r27 = f39
442	cmp.ltu		p6, p7 = r15, r22
443	sub		r17 = r15, r22
444	;;
445	getf.sig	r20 = f32
446	xma.l		f36 = f12, f6, f0
447  (p6)	sub		r15 = r17, r23, 1
448	st8		[r32] = r17, 8
449	xma.hu		f37 = f12, f6, f0
450  (p7)	sub		r15 = r17, r23
451	;;
452.Lcj6:	getf.sig	r21 = f33
453	cmp.ltu		p6, p7 = r15, r24
454	sub		r18 = r15, r24
455	;;
456	getf.sig	r22 = f34
457	xma.l		f38 = f13, f6, f0
458  (p6)	sub		r15 = r18, r25, 1
459	st8		[r32] = r18, 8
460	xma.hu		f39 = f13, f6, f0
461  (p7)	sub		r15 = r18, r25
462	;;
463.Lcj5:	getf.sig	r23 = f35
464	cmp.ltu		p6, p7 = r15, r26
465	sub		r19 = r15, r26
466	;;
467	getf.sig	r24 = f36
468  (p6)	sub		r15 = r19, r27, 1
469	st8		[r32] = r19, 8
470  (p7)	sub		r15 = r19, r27
471	;;
472.Lcj4:	getf.sig	r25 = f37
473	cmp.ltu		p6, p7 = r15, r20
474	sub		r16 = r15, r20
475	;;
476	getf.sig	r26 = f38
477  (p6)	sub		r15 = r16, r21, 1
478	st8		[r32] = r16, 8
479  (p7)	sub		r15 = r16, r21
480	;;
481.Lcj3:	getf.sig	r27 = f39
482	cmp.ltu		p6, p7 = r15, r22
483	sub		r17 = r15, r22
484	;;
485  (p6)	sub		r15 = r17, r23, 1
486	st8		[r32] = r17, 8
487  (p7)	sub		r15 = r17, r23
488	;;
489.Lcj2:	cmp.ltu		p6, p7 = r15, r24
490	sub		r18 = r15, r24
491	;;
492  (p6)	sub		r15 = r18, r25, 1
493	st8		[r32] = r18, 8
494  (p7)	sub		r15 = r18, r25
495	;;
C Last limb: the store has no post-increment, and the final running word is
C written to r8 instead of r15.  ar.lc is restored from r2 before returning.
496.Lcj1:	cmp.ltu		p6, p7 = r15, r26
497	sub		r19 = r15, r26
498	;;
499  (p6)	sub		r8 = r19, r27, 1
500	st8		[r32] = r19
501  (p7)	sub		r8 = r19, r27
502	mov ar.lc = r2
503	br.ret.sptk.many b0
504EPILOGUE()
505ASM_END()
506