xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/ia64/mode1o.asm (revision ae87de8892f277bece3527c15b186ebcfa188227)
1dnl  Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.
2
3dnl  Contributed to the GNU project by Kevin Ryde.
4
5dnl  Copyright 2003-2005 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35
36C            cycles/limb
37C Itanium:      15
38C Itanium 2:     8
39
40
41dnl  Usage: ABI32(`code')
42dnl
43dnl  Emit the given code only under HAVE_ABI_32.
44dnl
45define(ABI32,
46m4_assert_onearg()
47`ifdef(`HAVE_ABI_32',`$1')')
48
49
50C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
51C                                mp_limb_t divisor, mp_limb_t carry);
52C
53C The modexact algorithm is usually conceived as a dependent chain
54C
55C	l = src[i] - c
56C	q = low(l * inverse)
57C	c = high(q*divisor) + (src[i]<c)
58C
59C but we can work the src[i]-c into an xma by calculating si=src[i]*inverse
60C separately (off the dependent chain) and using
61C
62C	q = low(c * inverse + si)
63C	c = high(q*divisor + c)
64C
65C This means the dependent chain is simply xma.l followed by xma.hu, for a
66C total 8 cycles/limb on itanium-2.
67C
68C The reason xma.hu works for the new c is that the low of q*divisor is
69C src[i]-c (being the whole purpose of the q generated, and it can be
70C verified algebraically).  If there was an underflow from src[i]-c, then
71C there will be an overflow from (src-c)+c, thereby adding 1 to the new c
72C the same as the borrow bit (src[i]<c) gives in the first style shown.
73C
74C Incidentally, fcmp is not an option for treating src[i]-c, since it
75C apparently traps to the kernel for unnormalized operands like those used
76C and generated by ldf8 and xma.  On one GNU/Linux system it took about 1200
77C cycles.
78C
79C
80C First Limb:
81C
82C The first limb uses q = (src[0]-c) * inverse shown in the first style.
83C This lets us get the first q as soon as the inverse is ready, without
84C going through si=s*inverse.  Basically at the start we have c and can use
85C it while waiting for the inverse, whereas for the second and subsequent
86C limbs it's the other way around, ie. we have the inverse and are waiting
87C for c.
88C
89C At .Lentry the first two instructions in the loop have been done already.
90C The load of f11=src[1] at the start (predicated on size>=2), and the
91C calculation of q by the initial different scheme.
92C
93C
94C Entry Sequence:
95C
96C In the entry sequence, the critical path is the calculation of the
97C inverse, so this is begun first and optimized.  Apart from that, ar.lc is
98C established nice and early so the br.cloop's should predict perfectly.
99C And the load for the low limbs src[0] and src[1] can be initiated long
100C ahead of where they're needed.
101C
102C
103C Inverse Calculation:
104C
105C The initial 8-bit inverse is calculated using a table lookup.  If it hits
106C L1 (which is likely if we're called several times) then it should take a
107C total 4 cycles, otherwise hopefully L2 for 9 cycles.  This is considered
108C the best approach, on balance.  It could be done bitwise, but that would
109C probably be about 14 cycles (2 per bit beyond the first couple).  Or it
110C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
111C but that would be about 11 cycles.
112C
113C The table is not the same as binvert_limb_table, instead it's 256 bytes,
114C designed to be indexed by the low byte of the divisor.  The divisor is
115C always odd, so the relevant data is every second byte in the table.  The
116C padding lets us use zxt1 instead of extr.u, the latter would cost an extra
117C cycle because it must go down I0, and we're using the first I0 slot to get
118C ip.  The extra 128 bytes of padding should be insignificant compared to
119C typical ia64 code bloat.
120C
121C Having the table in .text allows us to use IP-relative addressing,
122C avoiding a fetch from ltoff.  .rodata is apparently not suitable for use
123C IP-relative, it gets a linker relocation overflow on GNU/Linux.
124C
125C
126C Load Scheduling:
127C
128C In the main loop, the data loads are scheduled for an L2 hit, which means
129C 6 cycles for the data ready to use.  In fact we end up 7 cycles ahead.  In
130C any case that scheduling is achieved simply by doing the load (and xmpy.l
131C for "si") in the immediately preceding iteration.
132C
133C The main loop requires size >= 2, and we handle size==1 by an initial
134C br.cloop to enter the loop only if size>1.  Since ar.lc is established
135C early, this should predict perfectly.
136C
137C
138C Not done:
139C
140C Consideration was given to using a plain "(src[0]-c) % divisor" for
141C size==1, but cycle counting suggests about 50 for the sort of approach
142C taken by gcc __umodsi3, versus about 47 for the modexact.  (Both assuming
143C L1 hits for their respective fetching.)
144C
145C Consideration was given to a test for high<divisor and replacing the last
146C loop iteration with instead c-=src[size-1] followed by c+=d if underflow.
147C Branching on high<divisor wouldn't be good since a mispredict would cost
148C more than the loop iteration saved, and the condition is of course data
149C dependent.  So the theory would be to shorten the loop count if
150C high<divisor, and predicate extra operations at the end.  That would mean
151C a gain of 6 when high<divisor, or a cost of 2 if not.
152C
153C Whether such a tradeoff is a win on average depends on assumptions about
154C how many bits in the high and the divisor.  If both are uniformly
155C distributed then high<divisor about 50% of the time.  But smallish
156C divisors (less chance of high<divisor) might be more likely from
157C applications (mpz_divisible_ui, mpz_gcd_ui, etc).  Though biggish divisors
158C would be normal internally from say mpn/generic/perfsqr.c.  On balance,
159C for the moment, it's felt the gain is not really enough to be worth the
160C trouble.
161C
162C
163C Enhancement:
164C
165C Process two source limbs per iteration using a two-limb inverse and a
166C sequence like
167C
168C	ql  = low (c * il + sil)	quotient low limb
169C	qlc = high(c * il + sil)
170C	qh1 = low (c * ih + sih)	quotient high, partial
171C
172C	cl = high (ql * d + c)		carry out of low
173C	qh = low (qlc * 1 + qh1)	quotient high limb
174C
175C	new c = high (qh * d + cl)	carry out of high
176C
177C This would be 13 cycles/iteration, giving 6.5 cycles/limb.  The two limb
178C s*inverse as sih:sil = sh:sl * ih:il would be calculated off the dependent
179C chain with 4 multiplies.  The bigger inverse would take extra time to
180C calculate, but a one limb iteration to handle an odd size could be done as
181C soon as 64-bits of inverse were ready.
182C
183C Perhaps this could even extend to a 3 limb inverse, which might promise 17
184C or 18 cycles for 3 limbs, giving 5.66 or 6.0 cycles/limb.
185C
186
187ASM_START()
188	.explicit
189
190	.text
191	.align	32
192.Ltable:
193data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
194data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
195data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
196data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
197data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
198data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
199data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
200data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
201data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
202data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
203data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
204data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
205data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
206data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
207data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
208data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
209
210
211PROLOGUE(mpn_modexact_1c_odd)
212
213	C r32	src
214	C r33	size
215	C r34	divisor
216	C r35	carry
217
218	.prologue
219.Lhere:
220{ .mmi;	add	r33 = -1, r33		C M0  size-1
221	mov	r14 = 2			C M1  2
222	mov	r15 = ip		C I0  .Lhere
223}{.mmi;	setf.sig f6 = r34		C M2  divisor
224	setf.sig f9 = r35		C M3  carry
225	zxt1	r3 = r34		C I1  divisor low byte
226}	;;
227
228{ .mmi;	add	r3 = .Ltable-.Lhere, r3	C M0  table offset ip and index
229	sub	r16 = 0, r34		C M1  -divisor
230	.save	ar.lc, r2
231	mov	r2 = ar.lc		C I0
232}{.mmi;	.body
233	setf.sig f13 = r14		C M2  2 in significand
234	mov	r17 = -1		C M3  -1
235ABI32(`	zxt4	r33 = r33')		C I1  size extend
236}	;;
237
238{ .mmi;	add	r3 = r3, r15		C M0  table entry address
239ABI32(` addp4	r32 = 0, r32')		C M1  src extend
240	mov	ar.lc = r33		C I0  size-1 loop count
241}{.mmi;	setf.sig f12 = r16		C M2  -divisor
242	setf.sig f8 = r17		C M3  -1
243}	;;
244
245{ .mmi;	ld1	r3 = [r3]		C M0  inverse, 8 bits
246	ldf8	f10 = [r32], 8		C M1  src[0]
247	cmp.ne	p6,p0 = 0, r33		C I0  test size!=1
248}	;;
249
250	C Wait for table load.
251	C Hope for an L1 hit of 1 cycles to ALU, but could be more.
252	setf.sig f7 = r3		C M2  inverse, 8 bits
253(p6)	ldf8	f11 = [r32], 8		C M1  src[1], if size!=1
254	;;
255
256	C 5 cycles
257
258	C f6	divisor
259	C f7	inverse, being calculated
260	C f8	-1, will be -inverse
261	C f9	carry
262	C f10	src[0]
263	C f11	src[1]
264	C f12	-divisor
265	C f13	2
266	C f14	scratch
267
268	xmpy.l	f14 = f13, f7		C 2*i
269	xmpy.l	f7 = f7, f7		C i*i
270	;;
271	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 16 bits
272	;;
273
274	xmpy.l	f14 = f13, f7		C 2*i
275	xmpy.l	f7 = f7, f7		C i*i
276	;;
277	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 32 bits
278	;;
279
280	xmpy.l	f14 = f13, f7		C 2*i
281	xmpy.l	f7 = f7, f7		C i*i
282	;;
283
284	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 64 bits
285	xma.l	f10 = f9, f8, f10	C sc = c * -1 + src[0]
286	;;
287ASSERT(p6, `
288	xmpy.l	f15 = f6, f7 ;;	C divisor*inverse
289	getf.sig r31 = f15 ;;
290	cmp.eq	p6,p0 = 1, r31	C should == 1
291')
292
293	xmpy.l	f10 = f10, f7		C q = sc * inverse
294	xmpy.l	f8 = f7, f8		C -inverse = inverse * -1
295	br.cloop.sptk.few.clr .Lentry	C main loop, if size > 1
296	;;
297
298	C size==1, finish up now
299	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
300	mov	ar.lc = r2		C I0
301	;;
302	getf.sig r8 = f9		C M2  return c
303	br.ret.sptk.many b0
304
305
306
307.Ltop:
308	C r2	saved ar.lc
309	C f6	divisor
310	C f7	inverse
311	C f8	-inverse
312	C f9	carry
313	C f10	src[i] * inverse
314	C f11	scratch src[i+1]
315
316	add	r16 = 160, r32
317	ldf8	f11 = [r32], 8		C src[i+1]
318	;;
319	C 2 cycles
320
321	lfetch	[r16]
322	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
323	;;
324	C 3 cycles
325
326.Lentry:
327	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
328	xmpy.l	f10 = f11, f7		C si = src[i] * inverse
329	br.cloop.sptk.few.clr .Ltop
330	;;
331
332
333
334	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
335	mov	ar.lc = r2		C I0
336	;;
337	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
338	;;
339	getf.sig r8 = f9		C M2  return c
340	br.ret.sptk.many b0
341
342EPILOGUE()
343