dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder

dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C EV4:    47
C EV5:    30
C EV6:    15


C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
C                                mp_limb_t c)
C
C This code follows the "alternate" code in mpn/generic/mode1o.c,
C eliminating cbit+climb from the dependent chain.  This leaves,
C
C        ev4   ev5   ev6
C         1     3     1    subq   y = x - h
C        23    13     7    mulq   q = y * inverse
C        23    14     7    umulh  h = high (q * d)
C        --    --    --
C        47    30    15
C
C In each case, the load latency, loop control, and extra carry bit handling
C hide under the multiply latencies.  Those latencies are long enough that
C we don't need to worry about alignment or pairing to squeeze out
C performance.
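C
C As a rough C sketch of the per-limb recurrence (illustrative only, not
C lifted from mpn/generic/mode1o.c; umul_hi stands for any 64x64->high-64
C bit multiply):
C
C        cbit = 0;  climb = c;
C        for (i = 0; i < size; i++)
C          {
C            x = src[i] - cbit;    cbit  = (src[i] < cbit);
C            y = x - climb;        cbit += (x < climb);
C            q = y * inv;                     /* low 64 bits of y*inv */
C            climb = umul_hi (q, d);          /* high 64 bits of q*d  */
C          }
C        return climb + cbit;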
C
C For the first limb, some of the loop code is broken out and scheduled back
C since it can be done earlier.
C
C   - The first ldq src[0] is near the start of the routine, for maximum
C     time from memory.
C
C   - The subq y=x-climb can be done without waiting for the inverse.
C
C   - The mulq y*inverse is replicated after the final subq for the inverse,
C     instead of branching to the mulq in the main loop.  On ev4 a branch
C     there would cost cycles, but we can hide them under the mulq latency.
C
C For the last limb, high<divisor is tested and if that's true a subtract
C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
C data-dependent branch, but we're waiting for umulh so any penalty should
C hide there.  The multiplies saved would be worth the cost anyway.
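C
C In C terms that final step is roughly (again only a sketch, not taken
C from mpn/generic/mode1o.c):
C
C        c = climb + cbit;
C        r = c - src[size-1];
C        if (c < src[size-1])      /* underflow, add back divisor */
C          r += d;
C        return r;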
C
C Enhancements:
C
C For size==1, a plain division (done bitwise say) might be faster than
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
C ev5.  A call to gcc __remqu might be a possibility.

ASM_START()
PROLOGUE(mpn_modexact_1c_odd,gp)

	C r16	src
	C r17	size
	C r18	d
	C r19	c

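	C The 64-bit inverse of d (d odd) is built below by an 8-bit table
	C lookup and three Newton steps, each step doubling the number of
	C correct low bits; this is the same scheme as binvert_limb in GMP's
	C gmp-impl.h.  A rough C sketch (illustrative only):
	C
	C	inv = binvert_limb_table[(d >> 1) & 0x7F];   /*  8 bits */
	C	inv = 2*inv - inv*inv*d;                     /* 16 bits */
	C	inv = 2*inv - inv*inv*d;                     /* 32 bits */
	C	inv = 2*inv - inv*inv*d;                     /* 64 bits */
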
	LEA(r0, binvert_limb_table)
	srl	r18, 1, r20		C d >> 1

	and	r20, 127, r20		C idx = d>>1 & 0x7F

	addq	r0, r20, r21		C table + idx

ifelse(bwx_available_p,1,
`	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
',`
	ldq_u	r20, 0(r21)		C table[idx] qword
	extbl	r20, r21, r20		C table[idx], inverse 8 bits
')

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	ldq	r2, 0(r16)		C x = s = src[0]
	lda	r17, -1(r17)		C size--
	clr	r0			C initial cbit=0

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits

	mulq	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mulq	r7, r18, r7		C i*i*d
	subq	r2, r19, r3		C y = x - climb

	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits

ASSERT(r7, C should have d*inv==1 mod 2^64
`	mulq	r18, r20, r7
	cmpeq	r7, 1, r7')

	mulq	r3, r20, r4		C first q = y * inv

	beq	r17, L(one)		C if size==1
	br	L(entry)


L(top):
	C r0	cbit
	C r16	src, incrementing
	C r17	size, decrementing
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[i]
	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(entry):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2
	lda	r16, 8(r16)		C src++
	lda	r17, -1(r17)		C size--

	umulh	r4, r18, r19		C climb = high (q * d)
	bne	r17, L(top)		C while 2 or more limbs left



	C r0	cbit
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[size-1] high limb

	cmpult	r1, r18, r2		C test high<divisor
	bne	r2, L(skip)		C skip if so

	C can't skip a division, repeat loop code

	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(one):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2

	umulh	r4, r18, r19		C climb = high (q * d)

	addq	r19, r0, r0		C return climb + cbit
	ret	r31, (r26), 1


	ALIGN(8)
L(skip):
	C with high<divisor, the final step can be just (cbit+climb)-s and
	C an addback of d if that underflows

	addq	r19, r0, r19		C c = climb + cbit

	subq	r19, r1, r2		C c - s
	cmpult	r19, r1, r3		C c < s

	addq	r2, r18, r0		C return c-s + divisor

	cmoveq	r3, r2, r0		C return c-s if no underflow
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()