xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/alpha/ev6/mod_1_4.asm (revision b1bb3099bf4d47bbe8c7be5b78240a535263771f)
1dnl Alpha mpn_mod_1s_4p
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C TODO:
25C  * Optimise.  2.75 c/l should be possible.
26C  * Write a proper mpn_mod_1s_4p_cps.  The code below was compiler generated.
27C  * Optimise feed-in code, starting the sw pipeline in switch code.
28C  * Shorten software pipeline.  The mul instructions are scheduled too far
29C    from their users.  Fixing this will allow us to use fewer registers.
30C  * If we cannot reduce register usage, perhaps write a small-n basecase.
31C  * Does this work for PIC?
32
33C      cycles/limb
34C EV4:     ?
35C EV5:    23
36C EV6:     3
37
C Register aliases used by mpn_mod_1s_4p below.
C   ap, n           incoming limb pointer and limb count (ap is advanced to
C                   the end of the vector before the limbs are read)
C   pl, ph          low and high limb of the partial sum being accumulated
C   rl, rh          low and high limb of the running two-limb value
C   B1modb..B5modb  constants loaded from offsets 16..48 of the table at r19
38define(`ap',     `r16')
39define(`n',      `r17')
40define(`pl',     `r24')
41define(`ph',     `r25')
42define(`rl',     `r6')
43define(`rh',     `r7')
44define(`B1modb', `r1')
45define(`B2modb', `r2')
46define(`B3modb', `r3')
47define(`B4modb', `r4')
48define(`B5modb', `r5')
49
50ASM_START()
C mpn_mod_1s_4p -- fold the n-limb vector at ap down to a single limb with
C respect to the divisor in r18, using precomputed constants.
C
C Inputs as read by the code below:
C   r16 (ap)  limb vector
C   r17 (n)   number of limbs
C   r18       divisor b
C             NOTE(review): presumably already shifted left by cnt -- confirm
C             against the caller
C   r19       table of constants: 0 = bi, 8 = cnt, 16..48 = B1modb..B5modb
C             (same layout as mpn_mod_1s_4p_cps in this file writes)
C Output:
C   r0        final value shifted right by cnt
C
C r9-r13 are saved in a 64-byte stack frame and restored before returning.
C r2 (B2modb) and r24/r25 (pl/ph) are reused as scratch in the final phase.
51PROLOGUE(mpn_mod_1s_4p)
C Set up the frame and save r9-r13, interleaved with loading the five
C BKmodb constants from the table.
52	lda	r30, -64(r30)
53	stq	r9, 8(r30)
54	ldq	B1modb, 16(r19)
55	stq	r10, 16(r30)
56	ldq	B2modb, 24(r19)
57	stq	r11, 24(r30)
58	ldq	B3modb, 32(r19)
59	stq	r12, 32(r30)
60	ldq	B4modb, 40(r19)
61	stq	r13, 40(r30)
62	ldq	B5modb, 48(r19)
63	s8addq	n, ap, ap		C point ap at vector end
64
C Dispatch on n mod 4.  The feed-in code consumes 4, 1, 2 or 3 limbs from
C the end of the vector so that the rest can be handled four at a time.
C n is reduced by 4 here; r6 (rl) serves as scratch for (n mod 4) - 2.
65	and	n, 3, r0
66	lda	n, -4(n)
67	beq	r0, L(b0)
68	lda	r6, -2(r0)
69	blt	r6, L(b1)
70	beq	r6, L(b2)
71
C n mod 4 = 3: with r20, r21, r22 the last three limbs in address order,
C rh:rl = r20 + r21*B1modb + r22*B2modb.  Each cmpult recovers the carry
C out of the preceding addq.
72L(b3):	ldq	r21, -16(ap)
73	ldq	r22, -8(ap)
74	ldq	r20, -24(ap)
75	mulq	r21, B1modb, r8
76	umulh	r21, B1modb, r12
77	mulq	r22, B2modb, r9
78	umulh	r22, B2modb, r13
79	addq	r8, r20, pl
80	cmpult	pl, r8, r0
81	addq	r0, r12, ph
82	addq	r9, pl, rl
83	cmpult	rl, r9, r0
84	addq	r13, ph, ph
85	addq	r0, ph, rh
86	lda	ap, -56(ap)
87	br	L(com)
88
C n mod 4 = 0: with r20..r23 the last four limbs in address order,
C rh:rl = r20 + r21*B1modb + r22*B2modb + r23*B3modb.
89L(b0):	ldq	r21, -24(ap)
90	ldq	r22, -16(ap)
91	ldq	r23, -8(ap)
92	ldq	r20, -32(ap)
93	mulq	r21, B1modb, r8
94	umulh	r21, B1modb, r12
95	mulq	r22, B2modb, r9
96	umulh	r22, B2modb, r13
97	mulq	r23, B3modb, r10
98	umulh	r23, B3modb, r27
99	addq	r8, r20, pl
100	cmpult	pl, r8, r0
101	addq	r0, r12, ph
102	addq	r9, pl, pl
103	cmpult	pl, r9, r0
104	addq	r13, ph, ph
105	addq	r0, ph, ph
106	addq	r10, pl, rl
107	cmpult	rl, r10, r0
108	addq	r27, ph, ph
109	addq	r0, ph, rh
110	lda	ap, -64(ap)
111	br	L(com)
112
C n mod 4 = 1: rh = 0 and rl = the last limb.
113L(b1):	bis	r31, r31, rh
114	ldq	rl, -8(ap)
115	lda	ap, -40(ap)
116	br	L(com)
117
C n mod 4 = 2: rh = the last limb and rl = the limb before it.
118L(b2):	ldq	rh, -8(ap)
119	ldq	rl, -16(ap)
120	lda	ap, -48(ap)
121
C Common entry.  If no limbs remain go straight to the final fold.
C Otherwise load the next four limbs, step ap and n, and start the products
C r21*B1modb, r22*B2modb, r23*B3modb and rl*B4modb.  If these were the last
C four limbs skip the loop and finish them in the wind-down code.
122L(com):	ble	n, L(ed3)
123	ldq	r21, 8(ap)
124	ldq	r22, 16(ap)
125	ldq	r23, 24(ap)
126	ldq	r20, 0(ap)
127	lda	n, -4(n)
128	lda	ap, -32(ap)
129	mulq	r21, B1modb, r8
130	umulh	r21, B1modb, r12
131	mulq	r22, B2modb, r9
132	umulh	r22, B2modb, r13
133	mulq	r23, B3modb, r10
134	umulh	r23, B3modb, r27
135	mulq	rl, B4modb, r11
136	umulh	rl, B4modb, r28
137	ble	n, L(ed2)
138
C Software-pipelined main loop, four limbs per pass.  Each pass completes
C   rh:rl = r20 + r21*B1modb + r22*B2modb + r23*B3modb + rl*B4modb + rh*B5modb
C for the limbs loaded on the previous pass, whose products are already in
C r8:r12, r9:r13, r10:r27 and r11:r28 (low:high).  In the same pass it
C loads the next four limbs and starts their products.  The load of r20
C uses offset 32 because ap has already been stepped down by 32.
139	ALIGN(16)
140L(top):	ldq	r21, 8(ap)
141	mulq	rh, B5modb, rl
142	addq	r8, r20, pl
143	ldq	r22, 16(ap)
144	cmpult	pl, r8, r0
145	umulh	rh, B5modb, rh
146	ldq	r23, 24(ap)
147	addq	r0, r12, ph
148	addq	r9, pl, pl
149	mulq	r21, B1modb, r8
150	cmpult	pl, r9, r0
151	addq	r13, ph, ph
152	umulh	r21, B1modb, r12
153	lda	ap, -32(ap)
154	addq	r0, ph, ph
155	addq	r10, pl, pl
156	mulq	r22, B2modb, r9
157	cmpult	pl, r10, r0
158	addq	r27, ph, ph
159	addq	r11, pl, pl
160	umulh	r22, B2modb, r13
161	addq	r0, ph, ph
162	cmpult	pl, r11, r0
163	addq	r28, ph, ph
164	mulq	r23, B3modb, r10
165	ldq	r20, 32(ap)
166	addq	pl, rl, rl
167	umulh	r23, B3modb, r27
168	addq	r0, ph, ph
169	cmpult	rl, pl, r0
170	mulq	rl, B4modb, r11
171	addq	ph, rh, rh
172	umulh	rl, B4modb, r28
173	addq	r0, rh, rh
174	lda	n, -4(n)
175	bgt	n, L(top)
176
C Wind-down: the same accumulation as one loop pass for the products that
C are already in flight, with no further loads or new limb products.
177L(ed2):	mulq	rh, B5modb, rl
178	addq	r8, r20, pl
179	umulh	rh, B5modb, rh
180	cmpult	pl, r8, r0
181	addq	r0, r12, ph
182	addq	r9, pl, pl
183	cmpult	pl, r9, r0
184	addq	r13, ph, ph
185	addq	r0, ph, ph
186	addq	r10, pl, pl
187	cmpult	pl, r10, r0
188	addq	r27, ph, ph
189	addq	r11, pl, pl
190	addq	r0, ph, ph
191	cmpult	pl, r11, r0
192	addq	r28, ph, ph
193	addq	pl, rl, rl
194	addq	r0, ph, ph
195	cmpult	rl, pl, r0
196	addq	ph, rh, rh
197	addq	r0, rh, rh
198
C Final fold of the high limb: rh:rl = rl + rh*B1modb.
199L(ed3):	mulq	rh, B1modb, r8
200	umulh	rh, B1modb, rh
201	addq	r8, rl, rl
202	cmpult	rl, r8, r0
203	addq	r0, rh, rh
204
C Shift the two-limb value rh:rl left by cnt bits.  r25 = 0 - cnt is the
C right-shift count that brings the top bits of rl into rh.  r2 is used as
C scratch, so B2modb is no longer available from here on.
205	ldq	r24, 8(r19)		C cnt
206	sll	rh, r24, rh
207	subq	r31, r24, r25
208	srl	rl, r25, r2
209	sll	rl, r24, rl
210	or	r2, rh, rh
211
C Reduction step using bi: form qh:ql = rh*bi + (rh+1):rl, subtract qh*b
C from rl, then apply two conditional corrections: add b back if rl > ql,
C and subtract b if rl >= b.  Note that rh (r7) is overwritten with rh+1.
212	ldq	r23, 0(r19)		C bi
213	mulq	rh, r23, r8
214	umulh	rh, r23, r9
215	addq	rh, 1, r7
216	addq	r8, rl, r8		C ql
217	cmpult	r8, rl, r0
218	addq	r9, r7, r9
219	addq	r0, r9, r9		C qh
220	mulq	r9, r18, r21		C qh * b
221	subq	rl, r21, rl
222	cmpult	r8, rl, r0		C rl > ql
223	negq	r0, r0
224	and	r0, r18, r0
225	addq	rl, r0, rl
226	cmpule	r18, rl, r0		C rl >= b
227	negq	r0, r0
228	and	r0, r18, r0
229	subq	rl, r0, rl
230
C Result: undo the left shift by cnt and leave the value in r0.
231	srl	rl, r24, r0
232
C Restore the saved registers, release the frame and return.
233	ldq	r9, 8(r30)
234	ldq	r10, 16(r30)
235	ldq	r11, 24(r30)
236	ldq	r12, 32(r30)
237	ldq	r13, 40(r30)
238	lda	r30, 64(r30)
239	ret	r31, (r26), 1
240EPILOGUE()
241
C mpn_mod_1s_4p_cps -- fill the seven-entry constant table at r16 for the
C divisor in r17.  The TODO list at the top of this file notes that this
C code was compiler generated.
C
C Table layout written below (byte offsets):
C    0        value returned by mpn_invert_limb for the shifted divisor
C             (read back as bi by mpn_mod_1s_4p)
C    8        shift count cnt
C   16..48    five constants, each stored shifted right by cnt
C             (read back as B1modb..B5modb by mpn_mod_1s_4p)
C
C Uses a 32-byte frame to save r26, r9, r10 and r11.  Across the call,
C r11 holds the table pointer, r10 holds cnt and r9 the shifted divisor.
242PROLOGUE(mpn_mod_1s_4p_cps,gp)
243	lda	r30, -32(r30)
244	stq	r26, 0(r30)
245	stq	r9, 8(r30)
246	stq	r10, 16(r30)
247	stq	r11, 24(r30)
248	mov	r16, r11
C Compute the shift count in r10 with two lookups in __clz_tab.  The first
C lookup is indexed by a byte mask derived from r17 with cmpbge, the second
C by r17 shifted right by an amount derived from the first lookup.
C NOTE(review): this looks like a table-driven count of leading zero bits
C of the divisor -- confirm against the definition of __clz_tab.
249	LEA(	r4, __clz_tab)
250	lda	r10, 65(r31)
251	cmpbge	r31, r17, r1
252	srl	r1, 1, r1
253	xor	r1, 127, r1
254	addq	r1, r4, r1
255	ldq_u	r2, 0(r1)
256	extbl	r2, r1, r2
257	s8subq	r2, 7, r2
258	srl	r17, r2, r3
259	subq	r10, r2, r10
260	addq	r3, r4, r3
261	ldq_u	r1, 0(r3)
262	extbl	r1, r3, r1
263	subq	r10, r1, r10
C r9 = divisor shifted left by cnt; pass it to mpn_invert_limb, which
C returns its result in r0.
264	sll	r17, r10, r9
265	mov	r9, r16
266	jsr	r26, mpn_invert_limb
C NOTE(review): this ldah and the lda on r29 further down write r29 from
C r26 with zero displacements.  They look like the remains of a gp reload
C after the call -- confirm, see the PIC question in the TODO list.
267	ldah	r29, 0(r26)
C First constant: r2 = ((r0 >> (0 - cnt)) | (1 << cnt)) * (0 - r9).
C The return address is reloaded into r26, and r0 and cnt are stored in
C table slots 0 and 8 along the way.
268	subq	r31, r10, r2
269	lda	r1, 1(r31)
270	sll	r1, r10, r1
271	subq	r31, r9, r3
272	srl	r0, r2, r2
273	ldq	r26, 0(r30)
274	bis	r2, r1, r2
275	lda	r29, 0(r29)
276	stq	r0, 0(r11)
277	stq	r10, 8(r11)
278	mulq	r2, r3, r2
C Store the first constant shifted right by cnt in slot 16.  Then derive
C the next one from it: with hi:lo = r2 * r0,
C   r1 = (~hi - r2) * r9, and r1 += r9 if r1 > lo.
C The same step is repeated three more times below with the roles of r1,
C r2 and r3 rotated, each result being stored shifted right by cnt.
279	srl	r2, r10, r3
280	umulh	r2, r0, r1
281	stq	r3, 16(r11)
282	mulq	r2, r0, r3
283	ornot	r31, r1, r1
284	subq	r1, r2, r1
285	mulq	r1, r9, r1
286	addq	r1, r9, r2
287	cmpule	r1, r3, r3
288	cmoveq	r3, r2, r1
C Slot 24, then the same step applied to r1 giving r2.
289	srl	r1, r10, r3
290	umulh	r1, r0, r2
291	stq	r3, 24(r11)
292	mulq	r1, r0, r3
293	ornot	r31, r2, r2
294	subq	r2, r1, r2
295	mulq	r2, r9, r2
296	addq	r2, r9, r1
297	cmpule	r2, r3, r3
298	cmoveq	r3, r1, r2
C Slot 32, then the same step applied to r2 giving r3.
299	srl	r2, r10, r1
300	umulh	r2, r0, r3
301	stq	r1, 32(r11)
302	mulq	r2, r0, r1
303	ornot	r31, r3, r3
304	subq	r3, r2, r3
305	mulq	r3, r9, r3
306	addq	r3, r9, r2
307	cmpule	r3, r1, r1
308	cmoveq	r1, r2, r3
C Slot 40, then the last step applied to r3 giving r1.  This step
C overwrites r0 and r9, which are not needed afterwards.
309	srl	r3, r10, r2
310	umulh	r3, r0, r1
311	stq	r2, 40(r11)
312	mulq	r3, r0, r0
313	ornot	r31, r1, r1
314	subq	r1, r3, r1
315	mulq	r1, r9, r1
316	addq	r1, r9, r9
317	cmpule	r1, r0, r0
318	cmoveq	r0, r9, r1
C Store the last constant in slot 48, interleaved with restoring r9-r11.
C r10 (cnt) and r11 (table pointer) are each restored only after their
C last use.
319	ldq	r9, 8(r30)
320	srl	r1, r10, r1
321	ldq	r10, 16(r30)
322	stq	r1, 48(r11)
323	ldq	r11, 24(r30)
324	lda	r30, 32(r30)
325	ret	r31, (r26), 1
326EPILOGUE()
327