xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/dive_1.asm (revision 63aea4bd5b445e491ff0389fe27ec78b3099dba3)
1dnl  IA-64 mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
4
5dnl  Copyright 2003, 2004, 2005, 2010 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C            cycles/limb
25C Itanium:      16
26C Itanium 2:     8
27
28C INPUT PARAMETERS
29define(`rp', `r32')
30define(`up', `r33')
31define(`n',  `r34')
32define(`divisor', `r35')
33
34define(`lshift', `r24')
35define(`rshift', `r25')
36
37C This code is a bit messy, and not as similar to mode1o.asm as desired.
38
39C The critical path during initialization is for computing the inverse of the
40C divisor.  Since odd divisors are probably common, we conditionally execute
41C the initial count_traling_zeros code and the downshift.
42
43C Possible improvement: Merge more of the feed-in code into the inverse
44C computation.
45
46ASM_START()
47	.text
48	.align	32
49.Ltab:
50data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
51data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
52data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
53data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
54data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
55data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
56data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
57data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
58data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
59data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
60data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
61data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
62data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
63data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
64data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
65data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
66
67
68PROLOGUE(mpn_divexact_1)
69	.prologue
70	.save		ar.lc, r2
71	.body
72
73 {.mmi;	add		r8 = -1, divisor	C M0
74	nop		0			C M1
75	tbit.z		p8, p9 = divisor, 0	C I0
76}
77ifdef(`HAVE_ABI_32',
78`	addp4		rp = 0, rp		C M2  rp extend
79	addp4		up = 0, up		C M3  up extend
80	sxt4		n = n')			C I1  size extend
81	;;
82.Lhere:
83 {.mmi;	ld8		r20 = [up], 8		C M0  up[0]
84  (p8)	andcm		r8 = r8, divisor	C M1
85	mov		r15 = ip		C I0  .Lhere
86	;;
87}{.mii
88	.pred.rel "mutex", p8, p9
89  (p9)	mov		rshift = 0		C M0
90  (p8)	popcnt		rshift = r8		C I0 r8 = cnt_lo_zeros(divisor)
91	cmp.eq		p6, p10 = 1, n		C I1
92	;;
93}{.mii;	add		r9 = .Ltab-.Lhere, r15	C M0
94  (p8)	shr.u		divisor = divisor, rshift C I0
95	nop		0			C I1
96	;;
97}{.mmi;	add		n = -4, n		C M0  size-1
98  (p10)	ld8		r21 = [up], 8		C M1  up[1]
99	mov		r14 = 2			C M1  2
100}{.mfi;	setf.sig	f6 = divisor		C M2  divisor
101	mov		f9 = f0			C M3  carry		FIXME
102	zxt1		r3 = divisor		C I1  divisor low byte
103	;;
104}{.mmi;	add		r3 = r9, r3		C M0  table offset ip and index
105	sub		r16 = 0, divisor	C M1  -divisor
106	mov		r2 = ar.lc		C I0
107}{.mmi;	sub		lshift = 64, rshift	C M2
108	setf.sig	f13 = r14		C M3  2 in significand
109	mov		r17 = -1		C I1  -1
110	;;
111}{.mmi;	ld1		r3 = [r3]		C M0  inverse, 8 bits
112	nop		0			C M1
113	mov		ar.lc = n		C I0  size-1 loop count
114}{.mmi;	setf.sig	f12 = r16		C M2  -divisor
115	setf.sig	f8 = r17		C M3  -1
116	cmp.eq		p7, p0 = -2, n		C I1
117	;;
118}{.mmi;	setf.sig	f7 = r3			C M2  inverse, 8 bits
119	cmp.eq		p8, p0 = -1, n		C M0
120	shr.u		r23 = r20, rshift	C I0
121	;;
122}
123
124	C f6	divisor
125	C f7	inverse, being calculated
126	C f8	-1, will be -inverse
127	C f9	carry
128	C f12	-divisor
129	C f13	2
130	C f14	scratch
131
132	xmpy.l		f14 = f13, f7		C Newton 2*i
133	xmpy.l		f7 = f7, f7		C Newton i*i
134	;;
135	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 16 bits
136	;;
137	setf.sig	f10 = r23		C speculative, used iff n = 1
138	xmpy.l		f14 = f13, f7		C Newton 2*i
139	shl		r22 = r21, lshift	C speculative, used iff n > 1
140	xmpy.l		f7 = f7, f7		C Newton i*i
141	;;
142	or		r31 = r22, r23		C speculative, used iff n > 1
143	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 32 bits
144	shr.u		r23 = r21, rshift	C speculative, used iff n > 1
145	;;
146	setf.sig	f11 = r31		C speculative, used iff n > 1
147	xmpy.l		f14 = f13, f7		C Newton 2*i
148	xmpy.l		f7 = f7, f7		C Newton i*i
149	;;
150	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 64 bits
151
152  (p7)	br.cond.dptk	.Ln2
153  (p10)	br.cond.dptk	.grt3
154	;;
155
156.Ln1:	xmpy.l		f12 = f10, f7		C q = ulimb * inverse
157	br		.Lx1
158
159.Ln2:
160	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
161	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
162	setf.sig	f11 = r23
163	br		.Lx2
164
165.grt3:
166	ld8		r21 = [up], 8		C up[2]
167	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
168	;;
169	shl		r22 = r21, lshift
170	;;
171	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
172	;;
173	or		r31 = r22, r23
174	shr.u		r23 = r21, rshift
175	;;
176	setf.sig	f11 = r31
177  (p8)	br.cond.dptk	.Lx3			C branch for n = 3
178	;;
179	ld8		r21 = [up], 8
180	br		.Lent
181
182.Ltop:	ld8		r21 = [up], 8
183	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
184	nop.b		0
185	;;
186.Lent:	add		r16 = 160, up
187	shl		r22 = r21, lshift
188	nop.b		0
189	;;
190	stf8		[rp] = f12, 8
191	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
192	nop.b		0
193	nop.m		0
194	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
195	nop.b		0
196	;;
197	or		r31 = r22, r23
198	shr.u		r23 = r21, rshift
199	nop.b		0
200	;;
201	lfetch		[r16]
202	setf.sig	f11 = r31
203	br.cloop.sptk.few.clr .Ltop
204
205
206	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
207	;;
208.Lx3:	stf8		[rp] = f12, 8
209	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
210	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
211	;;
212	setf.sig	f11 = r23
213	;;
214	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
215	;;
216.Lx2:	stf8		[rp] = f12, 8
217	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
218	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
219	;;
220	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
221	;;
222.Lx1:	stf8		[rp] = f12, 8
223	mov		ar.lc = r2		C I0
224	br.ret.sptk.many b0
225EPILOGUE()
226