dnl  IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

dnl  Copyright 2010, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C               norm   frac   (cycles/limb)
C itanium 1
C itanium 2     29     29


C TODO
C  * Inline and interleave limb inversion code with loop setup code.
C  * We should use explicit bundling in much of the code, since it typically
C    cuts some cycles with the GNU assembler.


ASM_START()

C HP's assembler requires these declarations for importing mpn_invert_limb
	.global	mpn_invert_limb
	.type	mpn_invert_limb,@function

C INPUT PARAMETERS
C qp   = r32
C fn   = r33
C np   = r34
C nn   = r35
C dp   = r36

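C For reference, a sketch of the C-level contract (the prototype matches
C the internal declaration in GMP's gmp-impl.h, with the parameter names
C used in the comments above):
C
C   mp_limb_t mpn_divrem_2 (mp_ptr qp, mp_size_t fn,
C                           mp_ptr np, mp_size_t nn, mp_srcptr dp);
C
C Divide {np,nn} by the normalized 2-limb divisor {dp,2}, developing fn
C extra fraction quotient limbs.  The nn-2+fn quotient limbs go to
C {qp,nn-2+fn}, the 2-limb remainder overwrites np[0] and np[1], and the
C return value is the most significant quotient limb (0 or 1).

C f0x1 names f15, which the loop loads with the constant 1 so that a
C plain add can be issued as an xma multiply-add by 1; the code below
C refers to f15 directly.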
define(`f0x1', `f15')

PROLOGUE(mpn_divrem_2)
	.prologue
ifdef(`HAVE_ABI_32',
`	addp4		r32 = 0, r32		C M I
	addp4		r34 = 0, r34		C M I
	zxt4		r35 = r35		C I
	addp4		r36 = 0, r36		C M I
	nop.m		0
	zxt4		r33 = r33		C I
	;;
')
	.save ar.pfs, r42
	alloc	 r42 = ar.pfs, 5, 9, 1, 0
	shladd	 r34 = r35, 3, r34	// np + 8 * nn
	adds	 r14 = 8, r36		// &dp[1]
	mov	 r43 = r1		// save gp around mpn_invert_limb call
	;;
	adds	 r15 = -8, r34		// &np[nn-1]
	ld8	 r39 = [r14]		// d1 = dp[1]
	.save ar.lc, r45
	mov	 r45 = ar.lc
	adds	 r14 = -16, r34		// &np[nn-2]
	mov	 r40 = r0		// most significant quotient limb = 0
	adds	 r34 = -24, r34		// &np[nn-3]
	;;
	ld8	 r38 = [r15]		// n2 = np[nn-1]
	.save rp, r41
	mov	 r41 = b0
	.body
	ld8	 r36 = [r36]		// d0 = dp[0]
	ld8	 r37 = [r14]		// n1 = np[nn-2]
	;;
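// If the two high numerator limbs <n2,n1> are not below the divisor
// <d1,d0>, branch to .L51 to subtract the divisor once; the quotient
// then gets a most significant limb of 1.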
	cmp.gtu	 p6, p7 = r39, r38
  (p6)	br.cond.dptk .L8
	;;
	cmp.leu	 p8, p9 = r36, r37
	cmp.geu	 p6, p7 = r39, r38
	;;
  (p8)	cmp4.ne.and.orcm p6, p7 = 0, r0
  (p7)	br.cond.dptk .L51
.L8:
	add	 r14 = r33, r35		// nn + fn
	mov	 r46 = r39		// argument to mpn_invert_limb
	;;
	adds	 r35 = -3, r14		// # stored quotient limbs - 1
	;;
	cmp.gt	 p12, p0 = r0, r35
  (p12)	br.cond.dpnt L(end)
	br.call.sptk.many b0 = mpn_invert_limb
	;;
	setf.sig f11 = r8		// di (non-final)
	setf.sig f34 = r39		// d1
	setf.sig f33 = r36		// d0
	mov	 r1 = r43		// restore gp
	;;
	mov	 r17 = 1
	setf.sig f9 = r38		// n2
	xma.l	 f6 = f11, f34, f0	// t0 = LO(di * d1)
	;;
	setf.sig f10 = r37		// n1
	setf.sig f15 = r17		// 1
	xma.hu	 f8 = f11, f33, f0	// s0 = HI(di * d0)
	;;
	getf.sig r17 = f6
	getf.sig r16 = f8
	mov	 ar.lc = r35		// loop count
	;;
	sub	 r18 = r0, r39		// -d1
	add	 r14 = r17, r36
	;;
	setf.sig f14 = r18		// -d1
	cmp.leu	 p8, p9 = r17, r14
	add	 r16 = r14, r16
	;;
  (p9)	adds	 r19 = 0, r0
  (p8)	adds	 r19 = -1, r0
	cmp.gtu	 p6, p7 = r14, r16
	;;
  (p6)	adds	 r19 = 1, r19
	;;
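// Adjust di from a 1-limb inverse of d1 into an inverse good for the
// 2-limb divisor <d1,d0>: while the residue <r19,r16> stays
// non-negative, subtract d1 from it and decrement di.  The live ifelse
// arm below is a branch-free variant unrolled to four conditional
// steps; the dead arm is the equivalent (shorter but branchy) loop.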
ifelse(1,1,`
	cmp.gt	 p7, p6 = r0, r19
	;;
  (p6)	adds	 r8 = -1, r8		// di--
  (p6)	sub	 r14 = r16, r39		// t0 -= d1
  (p6)	cmp.ltu	 p6, p7 = r16, r39	// cy for: t0 - d1
	;;
  (p6)	cmp.gt	 p9, p8 = 1, r19
  (p7)	cmp.gt	 p9, p8 = 0, r19
  (p6)	adds	 r19 = -1, r19		// t1 -= cy
	mov	 r16 = r14
	;;
  (p8)	adds	 r8 = -1, r8		// di--
  (p8)	sub	 r14 = r16, r39		// t0 -= d1
  (p8)	cmp.ltu	 p8, p9 = r16, r39	// cy for: t0 - d1
	;;
  (p8)	cmp.gt	 p7, p6 = 1, r19
  (p9)	cmp.gt	 p7, p6 = 0, r19
  (p8)	adds	 r19 = -1, r19		// t1 -= cy
	mov	 r16 = r14
	;;
  (p6)	adds	 r8 = -1, r8		// di--
  (p6)	sub	 r14 = r16, r39		// t0 -= d1
  (p6)	cmp.ltu	 p6, p7 = r16, r39	// cy for: t0 - d1
	;;
  (p6)	cmp.gt	 p9, p8 = 1, r19
  (p7)	cmp.gt	 p9, p8 = 0, r19
  (p6)	adds	 r19 = -1, r19		// t1 -= cy
	mov	 r16 = r14
	;;
  (p8)	adds	 r8 = -1, r8		// di--
  (p8)	sub	 r14 = r16, r39		// t0 -= d1
  (p8)	cmp.ltu	 p8, p9 = r16, r39	// cy for: t0 - d1
	;;
  (p8)	adds	 r19 = -1, r19		// t1 -= cy
	mov	 r16 = r14
',`
	cmp.gt	 p8, p9 = r0, r19
  (p8)	br.cond.dpnt .L46
.L52:
	cmp.leu	 p6, p7 = r39, r16
	sub	 r14 = r16, r39
	adds	 r8 = -1, r8
	;;
  (p7)	adds	 r19 = -1, r19
	mov	 r16 = r14
	;;
  (p7)	cmp.gt	 p8, p9 = r0, r19
  (p9)	br.cond.dptk .L52
.L46:
')
	setf.sig f32 = r8		// di
	shladd	 r32 = r35, 3, r32	// point at top quotient limb (stored downwards)
	;;

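// Each iteration retires one quotient limb from the running numerator
// <n2,n1,n0>, using a 3/2-limb division step with the precomputed
// inverse di (cf. Moller and Granlund, "Improved division by invariant
// integers").  A rough C-level sketch of the loop body, matching the
// register comments below:
//
//	<q,q0>   = di * n2 + <n2,n1>	// candidate quotient limb
//	<n1,n0> -= (q + 1) * <d1,d0>	// remainder for q+1
//	if (n1 < q0)
//	  q++;				// q+1 was right
//	else
//	  <n1,n0> += <d1,d0>;		// one step too far; undo
//	if (<n1,n0> >= <d1,d0>)		// rare; handled off-line at L(fix)
//	  { q++; <n1,n0> -= <d1,d0>; }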
	ALIGN(16)
L(top):	nop 0
	nop 0
	cmp.gt	 p8, p9 = r33, r35	// in fraction part?
	;;
 (p8)	mov	 r37 = r0		// n0 = 0 (fraction development)
 (p9)	ld8	 r37 = [r34], -8	// n0 = next numerator limb
	xma.hu	 f8 = f9, f32, f10	//				0,29
	xma.l	 f12 = f9, f32, f10	//				0
	;;
	getf.sig r20 = f12		// q0				4
	xma.l	 f13 = f15, f8, f9	// q += n2			4
	sub	 r8 = -1, r36		// bitnot d0
	;;
	getf.sig r18 = f13		//				8
	xma.l	 f7 = f14, f13, f10	//				8
	xma.l	 f6 = f33, f13, f33	// t0 = LO(d0*q+d0)		8
	xma.hu	 f9 = f33, f13, f33	// t1 = HI(d0*q+d0)		9
	;;
	getf.sig r38 = f7		// n1				12
	getf.sig r16 = f6		//				13
	getf.sig r19 = f9		//				14
	;;
	sub	 r38 = r38, r39		// n1 -= d1			17
	;;
	cmp.ne	 p9, p0 = r0, r0	// clear p9
	cmp.leu	 p10, p11 = r16, r37	// cy for: n0 - t0		18
	;;
	sub	 r37 = r37, r16		// n0 -= t0			19
  (p11)	sub	 r38 = r38, r19, 1	// n1 -= t1 - cy		19
  (p10)	sub	 r38 = r38, r19		// n1 -= t1			19
	;;
	cmp.gtu	 p6, p7 = r20, r38	// n1 >= q0			20
	;;
  (p7)	cmp.ltu	 p9, p0 = r8, r37	//				21
  (p6)	add	 r18 = 1, r18		//
  (p7)	add	 r37 = r37, r36		//				21
  (p7)	add	 r38 = r38, r39		//				21
	;;
	setf.sig f10 = r37		// n1				22
  (p9)	add	 r38 = 1, r38		//				22
	;;
	setf.sig f9 = r38		// n2				23
	cmp.gtu	 p6, p7 = r39, r38	//				23
  (p7)	br.cond.spnt L(fix)
L(bck):	st8	 [r32] = r18, -8
	adds	 r35 = -1, r35
	br.cloop.sptk.few L(top)
	;;

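// Store the 2-limb remainder at np[0] (low, r37) and np[1] (high, r38),
// restore the caller's state, and return the most significant quotient
// limb (0 or 1) in r8.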
L(end):	add	r14 = 8, r34
	add	r15 = 16, r34
	mov	 b0 = r41
	;;
	st8	[r14] = r37
	st8	[r15] = r38
	mov	 ar.pfs = r42
	mov	 r8 = r40
	mov	 ar.lc = r45
	br.ret.sptk.many b0
	;;
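// Initial reduction: <n2,n1> >= <d1,d0>, so subtract the divisor once
// and record a most significant quotient limb of 1 in r40.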
.L51:
	.pred.rel "mutex", p8, p9
	sub	 r37 = r37, r36
  (p9)	sub	 r38 = r38, r39, 1
  (p8)	sub	 r38 = r38, r39
	adds	 r40 = 1, r0
	br .L8
	;;

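// Rare fixup for the loop: the new remainder <n2,n1> may still be >=
// the divisor <d1,d0>; if so, subtract the divisor once more and
// increment the quotient limb, then rejoin the loop at L(bck).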
L(fix):	cmp.geu	 p6, p7 = r39, r38
	cmp.leu	 p8, p9 = r36, r37
	;;
  (p8)	cmp4.ne.and.orcm p6, p7 = 0, r0
  (p6)	br.cond.dptk L(bck)
	sub	 r37 = r37, r36
  (p9)	sub	 r38 = r38, r39, 1
  (p8)	sub	 r38 = r38, r39
	adds	 r18 = 1, r18
	;;
	setf.sig f9 = r38		// n2
	setf.sig f10 = r37		// n1
	br	 L(bck)

EPILOGUE()
ASM_END()