xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/divrem_1.asm (revision 6d322f2f4598f0d8a138f10ea648ec4fabe41f8b)
1dnl  IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
2dnl  unnormalized limb.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund.
5
6dnl  Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of the GNU Lesser General Public License as published
12dnl  by the Free Software Foundation; either version 3 of the License, or (at
13dnl  your option) any later version.
14
15dnl  The GNU MP Library is distributed in the hope that it will be useful, but
16dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
18dnl  License for more details.
19
20dnl  You should have received a copy of the GNU Lesser General Public License
21dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
22
23include(`../config.m4')
24
25
26C         cycles/limb
27C Itanium:    40-42
28C Itanium 2:  29-30
29
30C This was generated by gcc, then the loops were optimized.  The preinv entry
31C point was shoehorned into the file.  Lots of things outside the loops could
32C be streamlined.  It would probably be a good idea to merge the loops for
33C normalized and unnormalized divisor, since the shifting stuff is done for
34C free in parallel with other operations.  It would even be possible to merge
35C all loops, if the ld8 were made conditional.
36
37C TODO
38C  * Consider delaying inversion for normalized mpn_divrem_1 entry till after
39C    computing leading limb.
40C  * Inline and interleave limb inversion code with loop setup code.
41
42ASM_START()
43
44C HP's assembler requires these declarations for importing mpn_invert_limb
45	.global	mpn_invert_limb
46	.type	mpn_invert_limb,@function
47
48C INPUT PARAMETERS
49C rp    = r32
50C qxn   = r33
51C up    = r34
52C n     = r35
53C vl    = r36
54C vlinv = r37  (preinv only)
55C cnt   = r38  (preinv only)
56
C mpn_preinv_divrem_1 entry point.
C
C Takes the five mpn_divrem_1 arguments plus a caller-supplied inverse
C (vlinv, r37) and shift count (cnt, r38); this entry does not call
C mpn_invert_limb.  After its own setup it joins the code inside
C mpn_divrem_1 at .Lpn (top bit of vl set), at .Lpu (top bit of vl clear and
C integer limbs remain) or at .L435 (only the qxn fraction limbs remain).
C
C ar.pfs, ar.lc and b0 are saved in r42, r44 and r41, the same registers
C mpn_divrem_1 uses, because the shared exit code at .Lret restores them
C from there.
57PROLOGUE(mpn_preinv_divrem_1)
58	.prologue
59	.save	ar.pfs, r42
60	alloc		r42 = ar.pfs, 7, 8, 1, 0
61	.save	ar.lc, r44
62	mov		r44 = ar.lc
63	.save	rp, r41
64	mov		r41 = b0
65	.body
66ifdef(`HAVE_ABI_32',
67`	addp4		r32 = 0, r32
68	sxt4		r33 = r33
69	addp4		r34 = 0, r34
70	sxt4		r35 = r35
71	;;
72')
C r40 = cnt.  Point r34 at up[n-1], load that limb into r39 and leave r34
C post-decremented to up[n-2].
73	mov		r40 = r38
74	shladd		r34 = r35, 3, r34
75	;;
76	adds		r34 = -8, r34
77	;;
78	ld8		r39 = [r34], -8
79	;;
80
C r8 = vlinv, r32 = address of the most significant quotient limb.
C p8 = (0 <= vl) as a signed compare, i.e. the top bit of vl is clear.
C p6/p7 = (vl <= high limb) as an unsigned compare.
81	add		r15 = r35, r33
82	;;
83	mov		r8 = r37
84	shladd		r32 = r15, 3, r32	C r32 = rp + n + qxn
85	cmp.le		p8, p0 = 0, r36
86	;;
87	adds		r32 = -8, r32		C r32 = rp + n + qxn - 1
88	cmp.leu		p6, p7 = r36, r39
89   (p8)	br.cond.dpnt	.Lpunnorm
90	;;
91
C Top bit of vl set: the high quotient limb is 1 or 0 according to p6, and
C the running remainder r38 is the high limb, reduced by vl when p6 holds.
C r35 = n - 2 is the value that the code at .Lpn copies into ar.lc.
92   (p6)	addl		r15 = 1, r0
93   (p7)	mov		r15 = r0
94	;;
95   (p6)	sub		r38 = r39, r36
96   (p7)	mov		r38 = r39
97	st8		[r32] = r15, -8
98	adds		r35 = -2, r35		C un -= 2
99	br	.Lpn
100
C Top bit of vl clear.  vl is shifted left by cnt.  If vl <= high limb (p6,
C compared before the shift), undo the post-decrement of r34 so that .Lpu
C loads the high limb again, and enter .Lpu with r = 0 and r35 = n.
C Otherwise the high quotient limb is stored as 0, r = high limb << cnt and
C r35 = n - 1.
101.Lpunnorm:
102   (p6)	add		r34 = 8, r34
103	mov		r38 = 0			C r = 0
104	shl		r36 = r36, r40
105   (p6)	br.cond.dptk	.Lpu
106	;;
107	shl		r38 = r39, r40		C r = ahigh << cnt
108	cmp.ne		p8, p0 = 1, r35
109	st8		[r32] = r0, -8
110	adds		r35 = -1, r35		C un--
111   (p8)	br.cond.dpnt	.Lpu
112
C n was 1, so no integer limbs are left.  Set up f6 = vlinv and f12 = 1 the
C same way .Lpn and .Lpu do, then go straight to the fraction limbs.
113	mov		r23 = 1
114	;;
115	setf.sig	f6 = r8
116	setf.sig	f12 = r23
117	br		.L435
118EPILOGUE()
119
120
C mpn_divrem_1 entry point.
C
C Register use, as set up by the code below:
C   r32  quotient store pointer, starts at rp[n+qxn-1] and moves downwards
C   r34  source load pointer, moves downwards through up[]
C   r35  limb counter, copied to ar.lc before each loop
C   r36  divisor (shifted left by r40 on the .Lunnorm/.Lpu path)
C   r38  running remainder, mirrored in f7 inside the loops
C   r40  shift count, 0 on the .Lpn path
C   f6   inverse: r8 as returned by mpn_invert_limb, or vlinv when entered
C        from mpn_preinv_divrem_1
C   f10  divisor,  f12 = 1
C   r41, r42, r44  saved b0, ar.pfs, ar.lc
C The remainder is returned in r8.
121PROLOGUE(mpn_divrem_1)
122	.prologue
123	.save	ar.pfs, r42
124	alloc		r42 = ar.pfs, 5, 8, 1, 0
125	.save	ar.lc, r44
126	mov		r44 = ar.lc
127	.save	rp, r41
128	mov		r41 = b0
129	.body
130ifdef(`HAVE_ABI_32',
131`	addp4		r32 = 0, r32
132	sxt4		r33 = r33
133	addp4		r34 = 0, r34
134	sxt4		r35 = r35
135	;;
136')
C Remainder r38 = 0.  If n + qxn == 0 there is nothing to do: return 0.
137	mov		r38 = r0
138	add		r15 = r35, r33
139	;;
140	cmp.ne		p6, p7 = 0, r15
141	;;
142   (p7)	mov		r8 = r0
143   (p7)	br.cond.dpnt	.Lret
C p6 = (0 <= vl) as a signed compare, i.e. the top bit of vl is clear; in
C that case take the .Lunnorm path.
144	shladd		r14 = r15, 3, r32	C r14 = rp + n + qxn
145	cmp.le		p6, p7 = 0, r36
146	;;
147	adds		r32 = -8, r14		C r32 = rp + n + qxn - 1
148   (p6)	br.cond.dpnt	.Lunnorm
C Top bit of vl set.  With n == 0 skip the high limb handling.  Otherwise
C load up[n-1] into r38; the high quotient limb is 1 if vl <= up[n-1] (and
C vl is then subtracted from r38), else 0.
149	cmp.eq		p6, p7 = 0, r35
150   (p6)	br.cond.dpnt	.L179
151	shladd		r14 = r35, 3, r34
152	;;
153	adds		r14 = -8, r14
154	adds		r35 = -1, r35
155	;;
156	ld8		r38 = [r14]
157	;;
158	cmp.leu		p6, p7 = r36, r38
159	;;
160   (p6)	addl		r15 = 1, r0
161   (p7)	mov		r15 = r0
162	;;
163	st8		[r32] = r15, -8
164  (p6)	sub		r38 = r38, r36
165
C Pass vl in the output register r45; the inverse comes back in r8.  r35 is
C decremented once more (n - 2, or -1 when n was 0) and r34 is then advanced
C by r35 limbs from up.
166.L179:
167	mov		r45 = r36
168	adds		r35 = -1, r35
169	br.call.sptk.many b0 = mpn_invert_limb
170	;;
171	shladd		r34 = r35, 3, r34
C Common entry for a divisor with its top bit set (also reached from
C mpn_preinv_divrem_1).  f6 = inverse, f12 = 1, r40 = 0 (no shift).  If
C r35 < 0 no integer limbs remain, so go to the fraction limbs at .L435.
172.Lpn:
173	mov		r23 = 1
174	;;
175	setf.sig	f6 = r8
176	setf.sig	f12 = r23
177	cmp.le		p6, p7 = 0, r35
178	mov		r40 = 0
179   (p7)	br.cond.dpnt	.L435
180	setf.sig	f10 = r36
181	mov		ar.lc = r35
182	setf.sig	f7 = r38
183	;;
184	sub		r28 = -1, r36
185C Develop quotient limbs for normalized divisor
C Each iteration: load the next lower limb nl (r20), form the quotient
C estimate q = hi(nh * inverse) + nh (r18), compute q * d as the pair
C r16:r15, subtract it from nh:nl, then apply up to three conditional
C corrections (q + 1, remainder - d) before storing q.
186.Loop1:		C 00				C q=r18 nh=r38/f7
187	ld8		r20 = [r34], -8
188	xma.hu		f11 = f7, f6, f0
189	;;	C 04
190	xma.l		f8 = f11, f12, f7	C q = q + nh
191	;;	C 08
192	getf.sig	r18 = f8
193	xma.hu		f9 = f8, f10, f0
194	xma.l		f8 = f8, f10, f0
195	;;	C 12
196	getf.sig	r16 = f9
197		C 13
198	getf.sig	r15 = f8
199	;;	C 18
C p6 = borrow out of the low word subtraction nl - lo(q*d).
200	cmp.ltu		p6, p7 = r20, r15
201	sub		r15 = r20, r15
202	sub		r16 = r38, r16
203	;;	C 19
C p8 = the high word of the difference, after the borrow, is non-zero.
C The cmp.ne.unc on r0, r0 then clears p6 and sets p7.
204   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
205   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
206   (p6)	add		r16 = -1, r16
207   (p0)	cmp.ne.unc	p6, p7 = r0, r0
208	;;	C 20
209   (p8)	cmp.ltu		p6, p7 = r15, r36
210   (p8)	sub		r15 = r15, r36
211   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
212	;;	C 21
213	.pred.rel "mutex",p6,p7
214   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
215   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
216	cmp.ltu		p6, p7 = r15, r36	C speculative
217	sub		r28 = r15, r36		C speculative, just for cmp
218	;;	C 22
219   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
220   (p8)	mov		r15 = r28
221   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
222	;;	C 23
C p7 = (remainder >= d): one last correction.  Either way f7 receives the
C new remainder for the next iteration.
223   (p6)	setf.sig	f7 = r15
224   (p7)	sub		r15 = r15, r36
225   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
226	;;	C 24
227   (p7)	setf.sig	f7 = r15
228	st8		[r32] = r18, -8
229	mov		r38 = r15
230	br.cloop.dptk	.Loop1
231		C 29/30
232	br.sptk		.L435
233	;;
C Top bit of vl clear.  r16 = vl with its bytes reversed, used at .L322.
C With n == 0 go straight to .L322.  Otherwise load up[n-1] into r39.  If
C vl <= up[n-1] the limb is left to be divided later (r38 stays 0, r34 still
C points at it).  If it is smaller than vl it becomes the initial remainder,
C the high quotient limb is stored as 0 and r35 = n - 1; when that was the
C only limb (n + qxn == 1) it is returned as the remainder directly.
234.Lunnorm:
235	mux1		r16 = r36, @rev
236	cmp.eq		p6, p7 = 0, r35
237   (p6)	br.cond.dpnt	.L322
238	shladd		r34 = r35, 3, r34
239	;;
240	adds		r34 = -8, r34
241	;;
242	ld8		r39 = [r34]
243	;;
244	cmp.leu		p6, p7 = r36, r39
245   (p6)	br.cond.dptk	.L322
246	adds		r34 = -8, r34
247	;;
248	mov		r38 = r39
249	;;
250	cmp.ne		p6, p7 = 1, r15
251	st8		[r32] = r0, -8
252	;;
253   (p7)	mov		r8 = r38
254   (p7)	br.cond.dpnt	.Lret
255	adds		r35 = -1, r35
C Compute the shift count r40 used below to shift vl left (a leading zero
C count of vl): first at byte granularity from the byte-reversed value via
C czx1.l, then refined in 4-bit, 2-bit and 1-bit steps on r14.
256.L322:
257	sub		r14 = r0, r16
258	;;
259	or		r14 = r16, r14
260	;;
261	mov		r16 = -8
262	czx1.l		r14 = r14
263	;;
264	shladd		r16 = r14, 3, r16
265	;;
266	shr.u		r14 = r36, r16
267	;;
268	cmp.geu		p6, p7 = 15, r14
269	;;
270   (p7)	shr.u		r14 = r14, 4
271   (p7)	adds		r16 = 4, r16
272	;;
273	cmp.geu		p6, p7 = 3, r14
274	;;
275   (p7)	shr.u		r14 = r14, 2
276   (p7)	adds		r16 = 2, r16
277	;;
278	tbit.nz		p6, p7 = r14, 1
279	;;
280	.pred.rel "mutex",p6,p7
281  (p6)	sub		r40 = 62, r16
282  (p7)	sub		r40 = 63, r16
283	;;
C Shift the divisor and the initial remainder left by r40.  The shifted
C divisor is passed to mpn_invert_limb in r45; the inverse comes back in r8.
284	shl		r45 = r36, r40
285	shl		r36 = r36, r40
286	shl		r38 = r38, r40
287	br.call.sptk.many b0 = mpn_invert_limb
288	;;
C Common entry for a shifted divisor (also reached from
C mpn_preinv_divrem_1).  If r35 == 0 only fraction limbs remain.  Otherwise
C r16 = 64 - r40, the top remaining limb is loaded into r39 and its bits
C shifted right by r16 are ORed into the remainder.  r35 - 2 < 0 means that
C was the only limb: skip the loop and finish it at .Lend3.
289.Lpu:
290	mov		r23 = 1
291	;;
292	setf.sig	f6 = r8
293	setf.sig	f12 = r23
294	cmp.eq		p6, p7 = 0, r35
295   (p6)	br.cond.dpnt	.L435
296	sub		r16 = 64, r40
297	adds		r35 = -2, r35
298	;;
299	ld8		r39 = [r34], -8
300	cmp.le		p6, p7 = 0, r35
301	;;
302	shr.u		r14 = r39, r16
303	;;
304	or		r38 = r14, r38
305   (p7)	br.cond.dpnt	.Lend3
306	;;
307	mov		r22 = r16
308	setf.sig	f10 = r36
309	setf.sig	f7 = r38
310	mov		ar.lc = r35
311	;;
312C Develop quotient limbs for unnormalized divisor
C Same step as .Loop1, except that the low numerator limb r20 is assembled
C from two source limbs: (r39 << r40) | (r14 >> r22), with r22 = 64 - r40.
C r14, the limb just loaded, becomes r39 for the next iteration.
313.Loop3:
314	ld8		r14 = [r34], -8
315	xma.hu		f11 = f7, f6, f0
316	;;
317	xma.l		f8 = f11, f12, f7	C q = q + nh
318	;;
319	getf.sig	r18 = f8
320	xma.hu		f9 = f8, f10, f0
321	shl		r20 = r39, r40
322	xma.l		f8 = f8, f10, f0
323	shr.u		r24 = r14, r22
324	;;
325	getf.sig	r16 = f9
326	getf.sig	r15 = f8
327	or		r20 = r24, r20
328	;;
329	cmp.ltu		p6, p7 = r20, r15
330	sub		r15 = r20, r15
331	sub		r16 = r38, r16
332	;;
333   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
334   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
335   (p6)	add		r16 = -1, r16
336   (p0)	cmp.ne.unc	p6, p7 = r0, r0
337	;;
338   (p8)	cmp.ltu		p6, p7 = r15, r36
339   (p8)	sub		r15 = r15, r36
340   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
341	;;
342	.pred.rel "mutex",p6,p7
343   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
344   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
345	cmp.ltu		p6, p7 = r15, r36	C speculative
346	sub		r28 = r15, r36		C speculative, just for cmp
347	;;
348   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
349   (p8)	mov		r15 = r28
350   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
351	;;
352   (p6)	setf.sig	f7 = r15
353   (p7)	sub		r15 = r15, r36
354   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
355	;;
356   (p7)	setf.sig	f7 = r15
357	st8		[r32] = r18, -8
358	mov		r39 = r14
359	mov		r38 = r15
360	br.cloop.dptk	.Loop3
361	;;
C Last integer limb on the shifted path: the same step with
C r20 = r39 << r40, since there is no lower limb to shift in.  The
C corrections are done one after another, without the speculative compare.
362.Lend3:
363	setf.sig	f10 = r36
364	setf.sig	f7 = r38
365	;;
366	xma.hu		f11 = f7, f6, f0
367	;;
368	xma.l		f8 = f11, f12, f7	C q = q + nh
369	;;
370	getf.sig	r18 = f8
371	xma.hu		f9 = f8, f10, f0
372	shl		r20 = r39, r40
373	xma.l		f8 = f8, f10, f0
374	;;
375	getf.sig	r16 = f9
376	getf.sig	r15 = f8
377	;;
378	cmp.ltu		p6, p7 = r20, r15
379	sub		r15 = r20, r15
380	sub		r16 = r38, r16
381	;;
382   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
383   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
384   (p6)	add		r16 = -1, r16
385   (p0)	cmp.ne.unc	p6, p7 = r0, r0
386	;;
387   (p8)	cmp.ltu		p6, p7 = r15, r36
388   (p8)	sub		r15 = r15, r36
389   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
390	;;
391	.pred.rel "mutex",p6,p7
392   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
393   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
394	;;
395   (p8)	sub		r15 = r15, r36
396   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
397	;;
398	cmp.ltu		p6, p7 = r15, r36
399	;;
400   (p7)	sub		r15 = r15, r36
401   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
402	;;
403	st8		[r32] = r18, -8
404	mov		r38 = r15
C Fraction limbs, reached from every path including mpn_preinv_divrem_1.
C r35 = qxn - 1 is the loop count; the loop is skipped when qxn < 1.
405.L435:
406	adds		r35 = -1, r33
407	cmp.le		p6, p7 = 1, r33
408   (p7)	br.cond.dpnt	.Lend4
409	;;
410	setf.sig	f7 = r38
411	setf.sig	f10 = r36
412	mov		ar.lc = r35
413	;;
C Same step as .Loop1 with a zero low numerator limb: nothing is loaded and
C the low word of the difference is 0 - lo(q*d).
414.Loop4:
415	xma.hu		f11 = f7, f6, f0
416	;;
417	xma.l		f8 = f11, f12, f7	C q = q + nh
418	;;
419	getf.sig	r18 = f8
420	xma.hu		f9 = f8, f10, f0
421	xma.l		f8 = f8, f10, f0
422	;;
423	getf.sig	r16 = f9
424	getf.sig	r15 = f8
425	;;
426	cmp.ltu		p6, p7 = 0, r15
427	sub		r15 = 0, r15
428	sub		r16 = r38, r16
429	;;
430   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
431   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
432   (p6)	add		r16 = -1, r16
433   (p0)	cmp.ne.unc	p6, p7 = r0, r0
434	;;
435   (p8)	cmp.ltu		p6, p7 = r15, r36
436   (p8)	sub		r15 = r15, r36
437   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
438	;;
439	.pred.rel "mutex",p6,p7
440   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
441   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
442	cmp.ltu		p6, p7 = r15, r36	C speculative
443	sub		r28 = r15, r36		C speculative, just for cmp
444	;;
445   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
446   (p8)	mov		r15 = r28
447   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
448	;;
449   (p6)	setf.sig	f7 = r15
450   (p7)	sub		r15 = r15, r36
451   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
452	;;
453   (p7)	setf.sig	f7 = r15
454	st8		[r32] = r18, -8
455	mov		r38 = r15
456	br.cloop.dptk	.Loop4
457	;;
C Shift the remainder back right by r40 (0 on the .Lpn path) and return it
C in r8.
458.Lend4:
459	shr.u		r8 = r38, r40
C Shared exit: restore ar.pfs, ar.lc and b0 from the registers in which both
C entry points saved them, then return.
460.Lret:
461	mov		ar.pfs = r42
462	mov		ar.lc = r44
463	mov		b0 = r41
464	br.ret.sptk.many b0
465EPILOGUE()
466ASM_END()
467