xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/divrem_2.asm (revision e39ef1d61eee3ccba837ee281f1e098c864487aa)
1dnl  IA-64 mpn_divrem_2 -- Divide an n-limb number by a 2-limb number.
2
3dnl  Copyright 2004, 2005 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C         cycles/limb
23C Itanium:    63
24C Itanium 2:  46
25
26
27C TODO
28C  * Further optimize the loop.  We could probably do some more trickery with
29C    arithmetic in the FPU, or perhaps use a non-zero addend of xma in more
30C    places.
31C  * Software pipeline for perhaps 5 saved cycles, around the end and start of
32C    the loop.
33C  * Schedule code outside of loop better.
34C  * Update the comments.  They are now using the same name for the same
35C    logical quantity.
36C  * Handle conditional zeroing of r31 in loop more cleanly.
37C  * Inline mpn_invert_limb and schedule its insns across the entire init code.
38C  * Ultimately, use 2-limb, or perhaps 3-limb or 4-limb inverse.
39
40define(`qp',`r32')
41define(`qxn',`r33')
42define(`np',`r34')
43define(`nn',`r35')
44define(`dp',`r36')
45
46define(`fnh',`f11')
47define(`fminus1',`f10')
48define(`fd0',`f13')
49define(`fd1',`f14')
50define(`d0',`r39')
51define(`d1',`r36')
52define(`fnl',`f32')
53define(`fdinv',`f12')
54
55define(`R1',`r38') define(`R0',`r37')
56define(`P1',`r28') define(`P0',`r27')
57
58ASM_START()
59
60C HP's assembler requires these declarations for importing mpn_invert_limb
61	.global	mpn_invert_limb
62	.type	mpn_invert_limb,@function
63
64PROLOGUE(mpn_divrem_2)
65	.prologue
66	.save ar.pfs, r42
67	.save ar.lc, r44
68	.save rp, r41
69ifdef(`HAVE_ABI_32',
70`	addp4		qp = 0, qp		C M I
71	addp4		np = 0, np		C M I
72	addp4		dp = 0, dp		C M I
73	zxt4		nn = nn			C I
74	zxt4		qxn = qxn		C I
75	;;
76')
77
78	alloc		r42 = ar.pfs, 5,8,1,0	C M2
79	ld8		d0 = [dp], 8		C M0M1	d0
80	mov		r44 = ar.lc		C I0
81	shladd		np = nn, 3, np		C M I
82	;;
83	ld8		d1 = [dp]		C M0M1	d1
84	mov		r41 = b0		C I0
85	add		r15 = -8, np		C M I
86	add		np = -16, np		C M I
87	mov		r40 = r0		C M I
88	;;
89	ld8		R1 = [r15]		C M0M1	n1
90	ld8		R0 = [r34], -8		C M0M1	n0
91	;;
92	cmp.ltu		p6, p0 = d1, R1		C M I
93	cmp.eq		p8, p0 = d1, R1		C M I
94	;;
95  (p8)	cmp.leu		p6, p0 = d0, R0
96	cmp.ltu		p8, p9 = R0, d0
97  (p6)	br.cond.dpnt	.L_high_limb_1		C FIXME: inline!
98.L8:
99
100	mov		r45 = d1
101	br.call.sptk.many b0 = mpn_invert_limb	C FIXME: inline+schedule
102	;;
103	setf.sig	fd1 = d1		C d1
104	setf.sig	fd0 = d0		C d0
105	add		r14 = r33, r35		C nn + qxn
106	;;
107	setf.sig	fdinv = r8		C dinv
108	mov		r9 = -1
109	add		r35 = -3, r14
110	;;
111	setf.sig	fminus1 = r9
112	cmp.gt		p6, p0 = r0, r35
113	shladd		qp = r35, 3, qp
114	mov		ar.lc = r35
115	mov		r31 = 0			C n0
116  (p6)	br.cond.dpnt	.Ldone
117	;;
118	ALIGN(16)
119C *** MAIN LOOP START ***
120.Loop:		C 00
121	mov		r15 = R0		C nadj = n10
122	cmp.le		p14, p15 = 0, R0	C check high bit of R0
123	cmp.le		p8, p0 = r33, r35	C dividend limbs remaining?
124	;;	C 01
125	.pred.rel "mutex", p14, p15
126  (p8)	ld8		r31 = [r34], -8		C n0
127  (p15)	add		r15 = d1, R0		C nadj = n10 + d1
128  (p15)	add		r14 = 1, R1		C nh + (nl:63)
129  (p14)	mov		r14 = R1		C nh
130	cmp.eq		p6, p0 = d1, R1		C nh == d1
131  (p6)	br.cond.spnt	.L_R1_eq_d1
132	;;	C 02
133	setf.sig	f8 = r14		C n2 + (nl:63)
134	setf.sig	f15 = r15		C nadj
135	sub		r23 = -1, R1		C r23 = ~nh
136	;;	C 03
137	setf.sig	fnh = r23
138	setf.sig	fnl = R0
139	;;	C 08
140	xma.hu		f7 = fdinv, f8, f15	C xh = HI(dinv*(nh-nmask)+nadj)
141	;;	C 12
142	xma.l		f7 = f7, fminus1, fnh	C nh + xh
143	;;	C 16
144	getf.sig	r14 = f7
145	xma.hu		f9 = f7, fd1, fnl	C xh = HI(q1*d1+nl)
146	xma.l		f33 = f7, fd1, fnl	C xh = LO(q1*d1+nl)
147	;;	C 20
148	getf.sig	r16 = f9
149	sub		r24 = d1, R1
150		C 21
151	getf.sig	r17 = f33
152	;;	C 25
153	cmp.eq		p6, p7 = r16, r24
154	;;	C 26
155	.pred.rel "mutex", p6, p7
156  (p6)	xma.l		f8 = f7, fminus1, f0	C f8 = -f7
157  (p7)	xma.l		f8 = f7,fminus1,fminus1	C f8 = -f7-1
158	;;	C 27
159	.pred.rel "mutex", p6, p7
160  (p6)	sub		r18 = 0, r14		C q = -q1
161  (p7)	sub		r18 = -1, r14		C q = -q1-1
162  (p6)	add		r14 = 0, r17		C n1 = xl
163  (p7)	add		r14 = d1, r17		C n1 = xl + d1
164	;;	C 30
165	xma.hu		f9 = fd0, f8, f0	C d0*(-f7-1) = -d0*f7-d0
166	xma.l		f35 = fd0, f8, f0
167	;;	C 34
168	getf.sig	P1 = f9		C P1
169		C 35
170	getf.sig	P0 = f35		C P0
171	;;
172.L_adj:		C 40
173	cmp.ltu		p8, p0 = r31, P0	C p8 = cy from low limb
174	cmp.ltu		p6, p0 = r14, P1	C p6 = prel cy from high limb
175	sub		R0 = r31, P0
176	sub		R1 = r14, P1
177	;;	C 41
178  (p8)	cmp.eq.or	p6, p0 = 0, R1		C p6 = final cy from high limb
179  (p8)	add		R1 = -1, R1
180	cmp.ne		p10, p0 = r0, r0	C clear p10 FIXME: use unc below!
181	cmp.ne		p13, p0 = r0, r0	C clear p13 FIXME: use unc below!
182	;;	C 42
183  (p6)	add		R0 = R0, d0
184  (p6)	add		R1 = R1, d1
185  (p6)	add		r18 = -1, r18		C q--
186	;;	C 43
187  (p6)	cmp.ltu		p10, p0 = R0, d0
188  (p6)	cmp.ltu		p0, p13 = R1, d1
189	;;	C 44
190  (p10)	cmp.ne.and	p0, p13 = -1, R1	C p13 = !cy
191  (p10)	add		R1 = 1, R1
192  (p13)	br.cond.spnt	.L_two_too_big		C jump if not cy
193	;;	C 45
194	st8		[qp] = r18, -8
195	add		r35 = -1, r35
196	mov		r31 = 0			C n0, next iteration
197	br.cloop.sptk	.Loop
198C *** MAIN LOOP END ***
199	;;
200.Ldone:
201	mov		r8 = r40
202	mov		b0 = r41
203	add		r21 = 8, r34
204	add		r22 = 16, r34
205	;;
206	st8		[r21] = R0
207	st8		[r22] = R1
208	mov		ar.pfs = r42
209	mov		ar.lc = r44
210	br.ret.sptk.many b0
211
212.L_high_limb_1:
213	.pred.rel "mutex", p8, p9
214	sub		R0 = R0, d0
215  (p8)	sub		R1 = R1, d1, 1
216  (p9)	sub		R1 = R1, d1
217	mov		r40 = 1
218	br.sptk		.L8
219	;;
220
221.L_two_too_big:
222	add		R0 = R0, d0
223	add		R1 = R1, d1
224	;;
225	add		r18 = -1, r18		C q--
226	cmp.ltu		p10, p0 = R0, d0
227	;;
228  (p10)	add		R1 = 1, R1
229	st8		[qp] = r18, -8
230	add		r35 = -1, r35
231	mov		r31 = 0			C n0, next iteration
232	br.cloop.sptk	.Loop
233	br.sptk		.Ldone
234
235.L_R1_eq_d1:
236	add		r14 = R0, d1		C r = R0 + d1
237	mov		r18 = -1		C q = -1
238	;;
239	cmp.leu		p6, p0 = R0, r14
240 (p6)	br.cond.spnt	.L20			C jump unless cy
241	;;
242	sub		P1 = r14, d0
243	add		R0 = r31, d0
244	;;
245	cmp.ltu		p8, p9 = R0, r31
246	;;
247	.pred.rel "mutex", p8, p9
248	st8		[qp] = r18, -8
249  (p8)	add		R1 = r0, P1, 1		C R1 = n1 - P1 - cy
250  (p9)	add		R1 = r0, P1		C R1 = n1 - P1
251	add		r35 = -1, r35
252	mov		r31 = 0			C n0, next iteration
253	br.cloop.sptk	.Loop
254	br.sptk		.Ldone
255	;;
256.L20:	cmp.ne		p6, p7 = 0, d0
257	;;
258	.pred.rel "mutex", p6, p7
259  (p6)	add		P1 = -1, d0
260  (p7)	mov		P1 = d0
261	sub		P0 = r0, d0
262	br.sptk		.L_adj
263EPILOGUE()
264ASM_END()
265