dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/mode1o.asm (revision 92e958de60c71aa0f2452bd7074cbb006fe6546b)
dnl  Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.

dnl  Contributed to the GNU project by Kevin Ryde.

dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C            cycles/limb
C Itanium:      15
C Itanium 2:     8


dnl  Usage: ABI32(`code')
dnl
dnl  Emit the given code only under HAVE_ABI_32.
dnl
define(ABI32,
m4_assert_onearg()
`ifdef(`HAVE_ABI_32',`$1')')

C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t carry);
C
C The modexact algorithm is usually conceived as a dependent chain
C
C	l = src[i] - c
C	q = low(l * inverse)
C	c = high(q*divisor) + (src[i]<c)
C
C but we can work the src[i]-c into an xma by calculating si=src[i]*inverse
C separately (off the dependent chain) and using
C
C	q = low(c * inverse + si)
C	c = high(q*divisor + c)
C
C This means the dependent chain is simply xma.l followed by xma.hu, for a
C total 8 cycles/limb on itanium-2.
C
C The reason xma.hu works for the new c is that the low of q*divisor is
C src[i]-c (being the whole purpose of the q generated, and it can be
C verified algebraically).  If there was an underflow from src[i]-c, then
C there will be an overflow from (src-c)+c, thereby adding 1 to the new c
C the same as the borrow bit (src[i]<c) gives in the first style shown.
C
C Incidentally, fcmp is not an option for treating src[i]-c, since it
C apparently traps to the kernel for unnormalized operands like those used
C and generated by ldf8 and xma.  On one GNU/Linux system it took about 1200
C cycles.
C
C
C First Limb:
C
C The first limb uses q = (src[0]-c) * inverse shown in the first style.
C This lets us get the first q as soon as the inverse is ready, without
C going through si=s*inverse.  Basically at the start we have c and can use
C it while waiting for the inverse, whereas for the second and subsequent
C limbs it's the other way around, ie. we have the inverse and are waiting
C for c.
C
C At .Lentry the first two instructions in the loop have been done already.
C The load of f11=src[1] at the start (predicated on size>=2), and the
C calculation of q by the initial different scheme.
C
C
C Entry Sequence:
C
C In the entry sequence, the critical path is the calculation of the
C inverse, so this is begun first and optimized.  Apart from that, ar.lc is
C established nice and early so the br.cloop's should predict perfectly.
C And the load for the low limbs src[0] and src[1] can be initiated long
C ahead of where they're needed.
C
C
C Inverse Calculation:
C
C The initial 8-bit inverse is calculated using a table lookup.  If it hits
C L1 (which is likely if we're called several times) then it should take a
C total 4 cycles, otherwise hopefully L2 for 9 cycles.  This is considered
C the best approach, on balance.  It could be done bitwise, but that would
C probably be about 14 cycles (2 per bit beyond the first couple).  Or it
C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
C but that would be about 11 cycles.
C
C The table is not the same as binvert_limb_table, instead it's 256 bytes,
C designed to be indexed by the low byte of the divisor.  The divisor is
C always odd, so the relevant data is every second byte in the table.  The
C padding lets us use zxt1 instead of extr.u, the latter would cost an extra
C cycle because it must go down I0, and we're using the first I0 slot to get
C ip.  The extra 128 bytes of padding should be insignificant compared to
C typical ia64 code bloat.
C
C Having the table in .text allows us to use IP-relative addressing,
C avoiding a fetch from ltoff.  .rodata is apparently not suitable for use
C IP-relative, it gets a linker relocation overflow on GNU/Linux.
C
C
C Load Scheduling:
C
C In the main loop, the data loads are scheduled for an L2 hit, which means
C 6 cycles for the data ready to use.  In fact we end up 7 cycles ahead.  In
C any case that scheduling is achieved simply by doing the load (and xmpy.l
C for "si") in the immediately preceding iteration.
C
C The main loop requires size >= 2, and we handle size==1 by an initial
C br.cloop to enter the loop only if size>1.  Since ar.lc is established
C early, this should predict perfectly.
C
C
C Not done:
C
C Consideration was given to using a plain "(src[0]-c) % divisor" for
C size==1, but cycle counting suggests about 50 for the sort of approach
C taken by gcc __umodsi3, versus about 47 for the modexact.  (Both assuming
C L1 hits for their respective fetching.)
C
C Consideration was given to a test for high<divisor and replacing the last
C loop iteration with instead c-=src[size-1] followed by c+=d if underflow.
C Branching on high<divisor wouldn't be good since a mispredict would cost
C more than the loop iteration saved, and the condition is of course data
C dependent.  So the theory would be to shorten the loop count if
C high<divisor, and predicate extra operations at the end.  That would mean
C a gain of 6 when high<divisor, or a cost of 2 if not.
C
C Whether such a tradeoff is a win on average depends on assumptions about
C how many bits in the high and the divisor.  If both are uniformly
C distributed then high<divisor about 50% of the time.  But smallish
C divisors (less chance of high<divisor) might be more likely from
C applications (mpz_divisible_ui, mpz_gcd_ui, etc).  Though biggish divisors
C would be normal internally from say mpn/generic/perfsqr.c.  On balance,
C for the moment, it's felt the gain is not really enough to be worth the
C trouble.
C
C
C Enhancement:
C
C Process two source limbs per iteration using a two-limb inverse and a
C sequence like
C
C	ql  = low (c * il + sil)	quotient low limb
C	qlc = high(c * il + sil)
C	qh1 = low (c * ih + sih)	quotient high, partial
C
C	cl = high (ql * d + c)		carry out of low
C	qh = low (qlc * 1 + qh1)	quotient high limb
C
C	new c = high (qh * d + cl)	carry out of high
C
C This would be 13 cycles/iteration, giving 6.5 cycles/limb.  The two limb
C s*inverse as sih:sil = sh:sl * ih:il would be calculated off the dependent
C chain with 4 multiplies.  The bigger inverse would take extra time to
C calculate, but a one limb iteration to handle an odd size could be done as
C soon as 64-bits of inverse were ready.
C
C Perhaps this could even extend to a 3 limb inverse, which might promise 17
C or 18 cycles for 3 limbs, giving 5.66 or 6.0 cycles/limb.
C

ASM_START()
	.explicit

	.text
	.align	32
C 8-bit modular inverse table, indexed directly by the (odd) divisor's low
C byte: entry 2k+1 is the inverse of 2k+1 mod 256.  The zero padding bytes
C at even indices let the lookup use zxt1 instead of extr.u (see comments
C above); they are never read for an odd divisor.
.Ltable:
data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF

PROLOGUE(mpn_modexact_1c_odd)

	C r32	src
	C r33	size
	C r34	divisor
	C r35	carry

	.prologue
.Lhere:
{ .mmi;	add	r33 = -1, r33		C M0  size-1
	mov	r14 = 2			C M1  2
	mov	r15 = ip		C I0  .Lhere
}{.mmi;	setf.sig f6 = r34		C M2  divisor
	setf.sig f9 = r35		C M3  carry
	zxt1	r3 = r34		C I1  divisor low byte
}	;;

{ .mmi;	add	r3 = .Ltable-.Lhere, r3	C M0  table offset ip and index
	sub	r16 = 0, r34		C M1  -divisor
	.save	ar.lc, r2
	mov	r2 = ar.lc		C I0
}{.mmi;	.body
	setf.sig f13 = r14		C M2  2 in significand
	mov	r17 = -1		C M3  -1
ABI32(`	zxt4	r33 = r33')		C I1  size extend
}	;;

{ .mmi;	add	r3 = r3, r15		C M0  table entry address
ABI32(` addp4	r32 = 0, r32')		C M1  src extend
	mov	ar.lc = r33		C I0  size-1 loop count
}{.mmi;	setf.sig f12 = r16		C M2  -divisor
	setf.sig f8 = r17		C M3  -1
}	;;

{ .mmi;	ld1	r3 = [r3]		C M0  inverse, 8 bits
	ldf8	f10 = [r32], 8		C M1  src[0]
	cmp.ne	p6,p0 = 0, r33		C I0  test size!=1
}	;;

	C Wait for table load.
	C Hope for an L1 hit of 1 cycles to ALU, but could be more.
	setf.sig f7 = r3		C M2  inverse, 8 bits
(p6)	ldf8	f11 = [r32], 8		C M1  src[1], if size!=1
	;;

	C 5 cycles

	C f6	divisor
	C f7	inverse, being calculated
	C f8	-1, will be -inverse
	C f9	carry
	C f10	src[0]
	C f11	src[1]
	C f12	-divisor
	C f13	2
	C f14	scratch

	C Newton iterations: each i' = i*(2-d*i) doubles the number of
	C correct low bits of the inverse (8 -> 16 -> 32 -> 64).
	xmpy.l	f14 = f13, f7		C 2*i
	xmpy.l	f7 = f7, f7		C i*i
	;;
	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 16 bits
	;;

	xmpy.l	f14 = f13, f7		C 2*i
	xmpy.l	f7 = f7, f7		C i*i
	;;
	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 32 bits
	;;

	xmpy.l	f14 = f13, f7		C 2*i
	xmpy.l	f7 = f7, f7		C i*i
	;;

	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 64 bits
	xma.l	f10 = f9, f8, f10	C sc = c * -1 + src[0]
	;;
ASSERT(p6, `
	xmpy.l	f15 = f6, f7 ;;	C divisor*inverse
	getf.sig r31 = f15 ;;
	cmp.eq	p6,p0 = 1, r31	C should == 1
')

	xmpy.l	f10 = f10, f7		C q = sc * inverse
	xmpy.l	f8 = f7, f8		C -inverse = inverse * -1
	br.cloop.sptk.few.clr .Lentry	C main loop, if size > 1
	;;

	C size==1, finish up now
	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
	mov	ar.lc = r2		C I0
	;;
	getf.sig r8 = f9		C M2  return c
	br.ret.sptk.many b0



.Ltop:
	C r2	saved ar.lc
	C f6	divisor
	C f7	inverse
	C f8	-inverse
	C f9	carry
	C f10	src[i] * inverse
	C f11	scratch src[i+1]

	add	r16 = 160, r32
	ldf8	f11 = [r32], 8		C src[i+1]
	;;
	C 2 cycles

	lfetch	[r16]
	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
	;;
	C 3 cycles

.Lentry:
	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
	xmpy.l	f10 = f11, f7		C si = src[i] * inverse
	br.cloop.sptk.few.clr .Ltop
	;;


	C Last limb: fold in the final q and return the carry.
	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
	mov	ar.lc = r2		C I0
	;;
	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
	;;
	getf.sig r8 = f9		C M2  return c
	br.ret.sptk.many b0

EPILOGUE()
