xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/alpha/ev6/mul_1.asm (revision d909946ca08dceb44d7d0f22ec9488679695d976)
1dnl  Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
2dnl  result in a second limb vector.
3
4dnl  Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C INPUT PARAMETERS
24C res_ptr	r16
25C s1_ptr	r17
26C size		r18
27C s2_limb	r19
28
29C This code runs at 2.25 cycles/limb on EV6.
30
31C This code was written in close cooperation with ev6 pipeline expert
32C Steve Root.  Any errors are tege's fault, though.
33
34C Code structure:
35
36C  code for n < 8
37C  code for n >= 8	code for (n mod 8)
38C			code for (n div 8)	feed-in code
39C						8-way unrolled loop
40C						wind-down code
41
42C Some notes about unrolled loop:
43C
44C   r1-r8     multiplies and workup
45C   r21-r28   multiplies and workup
46C   r9-r12    loads
47C   r0       -1
48C   r20,r29,r13-r15  scramble
49C
50C   We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
51C   put-the-carry-into-hi.  The idea is that these branches are very rarely
52C   taken, and since a non-taken branch consumes no resources, that is better
53C   than an addq.
54C
55C   Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
56C   add NEXT cycle #09 which feeds a store in NEXT cycle #02
57
58C The code could use some further work:
59C   1. Speed up really small multiplies.  The default alpha/mul_1.asm code is
60C      faster than this for size < 3.
61C   2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
62C      that is too costly.
63C   3. Consider using 4-way unrolling, even if that runs slower.
64C   4. Reduce register usage.  In particular, try to avoid using r29.
65
66ASM_START()
C mpn_mul_1 entry.  Multiplies the size (r18) limbs at s1_ptr (r17) by
C s2_limb (r19), stores one result limb per source limb at res_ptr (r16) and
C leaves the final carry limb in r0.  Sizes below 8 are handled by the plain
C one-limb-per-iteration code that follows; sizes of 8 or more go to $Large.
C The first limb is loaded and multiplied before size is tested, so size = 0
C is not handled.
67PROLOGUE(mpn_mul_1)
68	cmpult	r18,	8,	r1	C r1 = 1 if size < 8 (unsigned)
69	beq	r1,	$Large		C size >= 8: take the unrolled path
70$Lsmall:
71	ldq	r2,0(r17)	C r2 = s1_limb
72	lda	r18,-1(r18)	C size--
73	mulq	r2,r19,r3	C r3 = prod_low
74	bic	r31,r31,r4	C clear cy_limb
75	umulh	r2,r19,r0	C r0 = prod_high
76	beq	r18,$Le1a	C jump if size was == 1
77	ldq	r2,8(r17)	C r2 = s1_limb
78	lda	r18,-1(r18)	C size--
79	stq	r3,0(r16)	C store result limb 0
80	beq	r18,$Le2a	C jump if size was == 2
81	ALIGN(8)
C Loop invariant at $Lopa: r2 = limb to multiply next, r4 + r0 = carry limb
C to add to its low product, 8(r16) = where that result limb is stored.
82$Lopa:	mulq	r2,r19,r3	C r3 = prod_low
83	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
84	lda	r18,-1(r18)	C size--
85	umulh	r2,r19,r4	C r4 = cy_limb
86	ldq	r2,16(r17)	C r2 = s1_limb
87	lda	r17,8(r17)	C s1_ptr++
88	addq	r3,r0,r3	C r3 = cy_limb + prod_low
89	stq	r3,8(r16)
90	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
91	lda	r16,8(r16)	C res_ptr++
92	bne	r18,$Lopa
93
C Last limb: same step as the loop body, without loading a further limb.
94$Le2a:	mulq	r2,r19,r3	C r3 = prod_low
95	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
96	umulh	r2,r19,r4	C r4 = cy_limb
97	addq	r3,r0,r3	C r3 = cy_limb + prod_low
98	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
99	stq	r3,8(r16)
100	addq	r4,r0,r0	C cy_limb = prod_high + cy
101	ret	r31,(r26),1
C size == 1: r0 already holds prod_high, only the low limb is left to store.
102$Le1a:	stq	r3,0(r16)
103	ret	r31,(r26),1
104
C Path for size >= 8.  Allocates a 224-byte stack frame and saves r26,
C r9-r15 and r29 at offsets 0-64 of it; all of these registers are written
C by the unrolled code and reloaded before the final ret.  size is then split
C into r20 = size mod 8 and r18 = size div 8.  r21 is the carry limb handed
C on to the unrolled code; it starts as zero.
105$Large:
106	lda	r30,	-224(r30)
107	stq	r26,	0(r30)
108	stq	r9,	8(r30)
109	stq	r10,	16(r30)
110	stq	r11,	24(r30)
111	stq	r12,	32(r30)
112	stq	r13,	40(r30)
113	stq	r14,	48(r30)
114	stq	r15,	56(r30)
115	stq	r29,	64(r30)
116
117	and	r18,	7,	r20	C count for the first loop, 0-7
118	srl	r18,	3,	r18	C count for unrolled loop
119	bis	r31,	r31,	r21	C r21 = 0, no carry limb yet
120	beq	r20,	$L_8_or_more	C skip first loop
121
C Simple loop over the first size mod 8 limbs (r20 counts them down), using
C the same scheme as the size < 8 code but keeping the high product in r21.
C Despite the label name, this is reached whenever size mod 8 is nonzero.
C On exit r16 and r17 point past the limbs handled here and r21 holds the
C carry limb to add into the next result limb.
122$L_9_or_more:
123	ldq	r2,0(r17)	C r2 = s1_limb
124	lda	r17,8(r17)	C s1_ptr++
125	lda	r20,-1(r20)	C size--
126	mulq	r2,r19,r3	C r3 = prod_low
127	umulh	r2,r19,r21	C r21 = prod_high
128	beq	r20,$Le1b	C jump if size was == 1
C r0 is added in as the carry flag at $Lopb and $Le2b, so it must be zero here.
129	bis	r31, r31, r0	C FIXME: shouldn't need this
130	ldq	r2,0(r17)	C r2 = s1_limb
131	lda	r17,8(r17)	C s1_ptr++
132	lda	r20,-1(r20)	C size--
133	stq	r3,0(r16)
134	lda	r16,8(r16)	C res_ptr++
135	beq	r20,$Le2b	C jump if size was == 2
136	ALIGN(8)
137$Lopb:	mulq	r2,r19,r3	C r3 = prod_low
138	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
139	lda	r20,-1(r20)	C size--
140	umulh	r2,r19,r21	C r21 = prod_high
141	ldq	r2,0(r17)	C r2 = s1_limb
142	lda	r17,8(r17)	C s1_ptr++
143	addq	r3,r0,r3	C r3 = cy_limb + prod_low
144	stq	r3,0(r16)
145	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
146	lda	r16,8(r16)	C res_ptr++
147	bne	r20,$Lopb
148
C Last of the size mod 8 limbs; leaves prod_high + carry in r21.
149$Le2b:	mulq	r2,r19,r3	C r3 = prod_low
150	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
151	umulh	r2,r19,r21	C r21 = prod_high
152	addq	r3,r0,r3	C r3 = cy_limb + prod_low
153	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
154	stq	r3,0(r16)
155	lda	r16,8(r16)	C res_ptr++
156	addq	r21,r0,r21	C cy_limb = prod_high + cy
157	br	r31,	$L_8_or_more
C size mod 8 == 1: r21 already holds prod_high; store the low limb and fall
C through into the unrolled code.
158$Le1b:	stq	r3,0(r16)
159	lda	r16,8(r16)	C res_ptr++
160
C Feed-in for the 8-way unrolled code.  On entry r18 = number of 8-limb
C groups left and r21 = carry limb from the limbs already done (zero if
C none).  Loads the 8 limbs of the first group into r9-r12 (each register is
C used twice) and starts their multiplies.  If r18 is still positive after
C the decrement, more groups follow and control goes to $L_16_or_more;
C otherwise the remaining products are computed here and control joins the
C wind-down code at $ret0c.
161$L_8_or_more:
162	lda	r0,	-1(r31)		C put -1 in r0, for tricky loop control
163	lda	r17,	-32(r17)	C L1 bookkeeping
164	lda	r18,	-1(r18)		C decrement count
165
166	ldq	r9,	32(r17)		C L1
167	ldq	r10,	40(r17)		C L1
168	mulq	r9,	r19,	r22	C U1 #07
169	ldq	r11,	48(r17)		C L1
170	umulh	r9,	r19,	r23	C U1 #08
171	ldq	r12,	56(r17)		C L1
172	mulq	r10,	r19,	r24	C U1 #09
173	ldq	r9,	64(r17)		C L1
174
175	lda	r17,	64(r17)		C L1 bookkeeping
176
177	umulh	r10,	r19,	r25	C U1 #11
178	mulq	r11,	r19,	r26	C U1 #12
179	umulh	r11,	r19,	r27	C U1 #13
180	mulq	r12,	r19,	r28	C U1 #14
181	ldq	r10,	8(r17)		C L1
182	umulh	r12,	r19,	r1	C U1 #15
183	ldq	r11,	16(r17)		C L1
184	mulq	r9,	r19,	r2	C U1 #16
185	ldq	r12,	24(r17)		C L1
186	umulh	r9,	r19,	r3	C U1 #17
187	addq	r21,	r22,	r13	C L1 carry-in limb + low product
188	mulq	r10,	r19,	r4	C U1 #18
189	addq	r23,	r24,	r22	C L0 sum 2 mul's
190	cmpult	r13,	r21,	r14	C L1 carry from sum
191	bgt	r18,	$L_16_or_more
192
C Exactly one group: finish its multiplies without loading anything more.
C r31 (zero) is added to r13 because there is no carry from an earlier
C group, and the wind-down is entered after its fix0c test.
193	cmpult	r22,	r24,	r24	C U0 carry from sum
194	umulh	r10,	r19,	r5	C U1 #02
195	addq	r25,	r26,	r23	C U0 sum 2 mul's
196	mulq	r11,	r19,	r6	C U1 #03
197	cmpult	r23,	r26,	r25	C U0 carry from sum
198	umulh	r11,	r19,	r7	C U1 #04
199	addq	r27,	r28,	r28	C U0 sum 2 mul's
200	mulq	r12,	r19,	r8	C U1 #05
201	cmpult	r28,	r27,	r15	C L0 carry from sum
202	lda	r16,	32(r16)		C L1 bookkeeping
203	addq	r13,	r31,	r13	C U0 start carry cascade
204	umulh	r12,	r19,	r21	C U1 #06
205	br	r31,	$ret0c
206
C First pass of the unrolled code, entered from the feed-in when at least
C two 8-limb groups remain.  It follows the $Loop body but has no result
C limbs from an earlier group to store in its first half, and adds r31
C (zero) instead of a carry from an earlier group, so the fix0w test is
C commented out.  The group count r18 is decremented with subq here.
C The fix4/fix5/fix6 stubs branched to below return to ret4/ret5/ret6 inside
C $Loop, where the instructions from that point on match the ones here;
C nothing visible in this file branches to ret4w, ret5w or ret6w.
207$L_16_or_more:
208C ---------------------------------------------------------------
209	subq	r18,1,r18
210	cmpult	r22,	r24,	r24	C U0 carry from sum
211	ldq	r9,	32(r17)		C L1
212
213	umulh	r10,	r19,	r5	C U1 #02
214	addq	r25,	r26,	r23	C U0 sum 2 mul's
215	mulq	r11,	r19,	r6	C U1 #03
216	cmpult	r23,	r26,	r25	C U0 carry from sum
217	umulh	r11,	r19,	r7	C U1 #04
218	addq	r27,	r28,	r28	C U0 sum 2 mul's
219	mulq	r12,	r19,	r8	C U1 #05
220	cmpult	r28,	r27,	r15	C L0 carry from sum
221	lda	r16,	32(r16)		C L1 bookkeeping
222	addq	r13,	r31,	r13	C U0 start carry cascade
223
224	umulh	r12,	r19,	r21	C U1 #06
225C	beq	r13,	$fix0w		C U0
226$ret0w:	addq	r22,	r14,	r26	C L0
227	ldq	r10,	40(r17)		C L1
228
229	mulq	r9,	r19,	r22	C U1 #07
230	beq	r26,	$fix1w		C U0
231$ret1w:	addq	r23,	r24,	r27	C L0
232	ldq	r11,	48(r17)		C L1
233
234	umulh	r9,	r19,	r23	C U1 #08
235	beq	r27,	$fix2w		C U0
236$ret2w:	addq	r28,	r25,	r28	C L0
237	ldq	r12,	56(r17)		C L1
238
239	mulq	r10,	r19,	r24	C U1 #09
240	beq	r28,	$fix3w		C U0
241$ret3w:	addq	r1,	r2,	r20	C L0 sum 2 mul's
242	ldq	r9,	64(r17)		C L1
243
244	addq	r3,	r4,	r2	C L0 #10 2 mul's
245	lda	r17,	64(r17)		C L1 bookkeeping
246	cmpult	r20,	r1,	r29	C U0 carry from sum
247
C Result limbs 0-3 of this group go to the 32 bytes below the advanced r16.
248	umulh	r10,	r19,	r25	C U1 #11
249	cmpult	r2,	r4,	r4	C U0 carry from sum
250	stq	r13,	-32(r16)	C L0
251	stq	r26,	-24(r16)	C L1
252
253	mulq	r11,	r19,	r26	C U1 #12
254	addq	r5,	r6,	r14	C U0 sum 2 mul's
255	stq	r27,	-16(r16)	C L0
256	stq	r28,	-8(r16)		C L1
257
258	umulh	r11,	r19,	r27	C U1 #13
259	cmpult	r14,	r6,	r3	C U0 carry from sum
260C could do cross-jumping here:
261C	bra	$L_middle_of_unrolled_loop
262	mulq	r12,	r19,	r28	C U1 #14
263	addq	r7,	r3,	r5	C L0 eat carry
264	addq	r20,	r15,	r20	C U0 carry cascade
265	ldq	r10,	8(r17)		C L1
266
267	umulh	r12,	r19,	r1	C U1 #15
268	beq	r20,	$fix4		C U0
269$ret4w:	addq	r2,	r29,	r6	C L0
270	ldq	r11,	16(r17)		C L1
271
272	mulq	r9,	r19,	r2	C U1 #16
273	beq	r6,	$fix5		C U0
274$ret5w:	addq	r14,	r4,	r7	C L0
275	ldq	r12,	24(r17)		C L1
276
277	umulh	r9,	r19,	r3	C U1 #17
278	beq	r7,	$fix6		C U0
279$ret6w:	addq	r5,	r8,	r8	C L0 sum 2
280	addq	r21,	r22,	r13	C L1 sum 2 mul's
281
282	mulq	r10,	r19,	r4	C U1 #18
283	addq	r23,	r24,	r22	C L0 sum 2 mul's
284	cmpult	r13,	r21,	r14	C L1 carry from sum
C No further group to load: go to the wind-down, else fall into $Loop.
285	ble	r18,	$Lend		C U0
286C ---------------------------------------------------------------
C 8-way unrolled main loop.  Each iteration stores result limbs 4-7 of the
C previous group (r20, r6, r7, r8 at 0-24(r16)), advances r16 by 64, stores
C result limbs 0-3 of the current group (r13, r26, r27, r28 at -32 to
C -8(r16)), and loads and multiplies the 8 limbs of the next group.
C r14, r24, r25, r15, r29 and r4 carry the 0/1 carries between neighbouring
C result limbs; r21 is the high product of a group's last limb and is added
C into the first limb of the following group.
C Each "beq sum, fixN" tests a sum that just had a 0/1 carry added to it:
C only a zero result can mean that this addition itself overflowed, so the
C rarely taken fix-up stubs merge that carry into the next stage.
C NOTE(review): the U0/U1/L0/L1 tags look like the intended EV6 issue pipe
C and #nn like the cycle numbers mentioned in the notes at the top of the
C file -- confirm against the EV6 documentation.
287	ALIGN(16)
288$Loop:
C r0 is -1 (set at the feed-in), so the high half of r0 * r18 is r18 - 1
C for r18 >= 1: the count is decremented with a multiply instead of an add.
289	umulh	r0,	r18,	r18	C U1 #01 decrement r18!
290	cmpult	r8,	r5,	r29	C L0 carry from last bunch
291	cmpult	r22,	r24,	r24	C U0 carry from sum
292	ldq	r9,	32(r17)		C L1
293
294	umulh	r10,	r19,	r5	C U1 #02
295	addq	r25,	r26,	r23	C U0 sum 2 mul's
296	stq	r20,	0(r16)		C L0
297	stq	r6,	8(r16)		C L1
298
299	mulq	r11,	r19,	r6	C U1 #03
300	cmpult	r23,	r26,	r25	C U0 carry from sum
301	stq	r7,	16(r16)		C L0
302	stq	r8,	24(r16)		C L1
303
304	umulh	r11,	r19,	r7	C U1 #04
305	bis	r31,	r31,	r31	C L0 st slosh
306	bis	r31,	r31,	r31	C L1 st slosh
307	addq	r27,	r28,	r28	C U0 sum 2 mul's
308
309	mulq	r12,	r19,	r8	C U1 #05
310	cmpult	r28,	r27,	r15	C L0 carry from sum
311	lda	r16,	64(r16)		C L1 bookkeeping
312	addq	r13,	r29,	r13	C U0 start carry cascade
313
314	umulh	r12,	r19,	r21	C U1 #06
315	beq	r13,	$fix0		C U0
316$ret0:	addq	r22,	r14,	r26	C L0
317	ldq	r10,	40(r17)		C L1
318
319	mulq	r9,	r19,	r22	C U1 #07
320	beq	r26,	$fix1		C U0
321$ret1:	addq	r23,	r24,	r27	C L0
322	ldq	r11,	48(r17)		C L1
323
324	umulh	r9,	r19,	r23	C U1 #08
325	beq	r27,	$fix2		C U0
326$ret2:	addq	r28,	r25,	r28	C L0
327	ldq	r12,	56(r17)		C L1
328
329	mulq	r10,	r19,	r24	C U1 #09
330	beq	r28,	$fix3		C U0
331$ret3:	addq	r1,	r2,	r20	C L0 sum 2 mul's
332	ldq	r9,	64(r17)		C L1
333
334	addq	r3,	r4,	r2	C L0 #10 2 mul's
335	bis	r31,	r31,	r31	C U1 mul hole
336	lda	r17,	64(r17)		C L1 bookkeeping
337	cmpult	r20,	r1,	r29	C U0 carry from sum
338
339	umulh	r10,	r19,	r25	C U1 #11
340	cmpult	r2,	r4,	r4	C U0 carry from sum
341	stq	r13,	-32(r16)	C L0
342	stq	r26,	-24(r16)	C L1
343
344	mulq	r11,	r19,	r26	C U1 #12
345	addq	r5,	r6,	r14	C U0 sum 2 mul's
346	stq	r27,	-16(r16)	C L0
347	stq	r28,	-8(r16)		C L1
348
349	umulh	r11,	r19,	r27	C U1 #13
350	bis	r31,	r31,	r31	C L0 st slosh
351	bis	r31,	r31,	r31	C L1 st slosh
352	cmpult	r14,	r6,	r3	C U0 carry from sum
353$L_middle_of_unrolled_loop:
354	mulq	r12,	r19,	r28	C U1 #14
C The carry out of r14 is added straight into the high product r7 rather
C than being passed through a fix-up branch.
355	addq	r7,	r3,	r5	C L0 eat carry
356	addq	r20,	r15,	r20	C U0 carry cascade
357	ldq	r10,	8(r17)		C L1
358
359	umulh	r12,	r19,	r1	C U1 #15
360	beq	r20,	$fix4		C U0
361$ret4:	addq	r2,	r29,	r6	C L0
362	ldq	r11,	16(r17)		C L1
363
364	mulq	r9,	r19,	r2	C U1 #16
365	beq	r6,	$fix5		C U0
366$ret5:	addq	r14,	r4,	r7	C L0
367	ldq	r12,	24(r17)		C L1
368
369	umulh	r9,	r19,	r3	C U1 #17
370	beq	r7,	$fix6		C U0
371$ret6:	addq	r5,	r8,	r8	C L0 sum 2
372	addq	r21,	r22,	r13	C L1 sum 2 mul's
373
374	mulq	r10,	r19,	r4	C U1 #18
375	addq	r23,	r24,	r22	C L0 sum 2 mul's
376	cmpult	r13,	r21,	r14	C L1 carry from sum
377	bgt	r18,	$Loop		C U0
378C ---------------------------------------------------------------
C Wind-down: the same carry handling as the loop body, but with no further
C loads.  It stores limbs 4-7 of the previous group, finishes the multiplies
C of the last group and stores its 8 result limbs, forms the return value in
C r0, restores the saved registers and releases the stack frame.
C The single-group path of the feed-in enters at ret0c.
379$Lend:
380	cmpult	r8,	r5,	r29	C L0 carry from last bunch
381	cmpult	r22,	r24,	r24	C U0 carry from sum
382
383	umulh	r10,	r19,	r5	C U1 #02
384	addq	r25,	r26,	r23	C U0 sum 2 mul's
385	stq	r20,	0(r16)		C L0
386	stq	r6,	8(r16)		C L1
387
388	mulq	r11,	r19,	r6	C U1 #03
389	cmpult	r23,	r26,	r25	C U0 carry from sum
390	stq	r7,	16(r16)		C L0
391	stq	r8,	24(r16)		C L1
392
393	umulh	r11,	r19,	r7	C U1 #04
394	addq	r27,	r28,	r28	C U0 sum 2 mul's
395
396	mulq	r12,	r19,	r8	C U1 #05
397	cmpult	r28,	r27,	r15	C L0 carry from sum
398	lda	r16,	64(r16)		C L1 bookkeeping
399	addq	r13,	r29,	r13	C U0 start carry cascade
400
401	umulh	r12,	r19,	r21	C U1 #06
402	beq	r13,	$fix0c		C U0
403$ret0c:	addq	r22,	r14,	r26	C L0
404	beq	r26,	$fix1c		C U0
405$ret1c:	addq	r23,	r24,	r27	C L0
406	beq	r27,	$fix2c		C U0
407$ret2c:	addq	r28,	r25,	r28	C L0
408	beq	r28,	$fix3c		C U0
409$ret3c:	addq	r1,	r2,	r20	C L0 sum 2 mul's
410	addq	r3,	r4,	r2	C L0 #10 2 mul's
411	lda	r17,	64(r17)		C L1 bookkeeping
412	cmpult	r20,	r1,	r29	C U0 carry from sum
413	cmpult	r2,	r4,	r4	C U0 carry from sum
414	stq	r13,	-32(r16)	C L0
415	stq	r26,	-24(r16)	C L1
416	addq	r5,	r6,	r14	C U0 sum 2 mul's
417	stq	r27,	-16(r16)	C L0
418	stq	r28,	-8(r16)		C L1
419	cmpult	r14,	r6,	r3	C U0 carry from sum
420	addq	r7,	r3,	r5	C L0 eat carry
421	addq	r20,	r15,	r20	C U0 carry cascade
422	beq	r20,	$fix4c		C U0
423$ret4c:	addq	r2,	r29,	r6	C L0
424	beq	r6,	$fix5c		C U0
425$ret5c:	addq	r14,	r4,	r7	C L0
426	beq	r7,	$fix6c		C U0
427$ret6c:	addq	r5,	r8,	r8	C L0 sum 2
428	cmpult	r8,	r5,	r29	C L0 carry from last bunch
429	stq	r20,	0(r16)		C L0
430	stq	r6,	8(r16)		C L1
431	stq	r7,	16(r16)		C L0
432	stq	r8,	24(r16)		C L1
C Return value: carry out of the last result limb plus the high product of
C the last source limb.
433	addq	r29,	r21,	r0
434
C Reload the registers saved at $Large and pop the 224-byte frame.
435	ldq	r26,	0(r30)
436	ldq	r9,	8(r30)
437	ldq	r10,	16(r30)
438	ldq	r11,	24(r30)
439	ldq	r12,	32(r30)
440	ldq	r13,	40(r30)
441	ldq	r14,	48(r30)
442	ldq	r15,	56(r30)
443	ldq	r29,	64(r30)
444	lda	r30,	224(r30)
445	ret	r31,	(r26),	1
446
C Out-of-line carry fix-ups.  Each stub is reached by a beq that found a
C result limb equal to zero right after a 0/1 carry was added to it.  In
C that case the carry that was added is also the carry out of the addition
C (either both are 0, or the limb was all ones and wrapped), so the stub ORs
C it into the carry flag of the next stage and branches back to the
C instruction after the beq.  fix6 and fix6c instead add the carry into r5,
C the value that is about to be summed into the group's last result limb.
C The unsuffixed stubs return into $Loop, the "w" stubs into the first
C unrolled pass and the "c" stubs into the wind-down code.
447C $fix0w:	bis	r14,	r29,	r14	C join carries
448C	br	r31,	$ret0w
449$fix1w:	bis	r24,	r14,	r24	C join carries
450	br	r31,	$ret1w
451$fix2w:	bis	r25,	r24,	r25	C join carries
452	br	r31,	$ret2w
453$fix3w:	bis	r15,	r25,	r15	C join carries
454	br	r31,	$ret3w
455$fix0:	bis	r14,	r29,	r14	C join carries
456	br	r31,	$ret0
457$fix1:	bis	r24,	r14,	r24	C join carries
458	br	r31,	$ret1
459$fix2:	bis	r25,	r24,	r25	C join carries
460	br	r31,	$ret2
461$fix3:	bis	r15,	r25,	r15	C join carries
462	br	r31,	$ret3
463$fix4:	bis	r29,	r15,	r29	C join carries
464	br	r31,	$ret4
465$fix5:	bis	r4,	r29,	r4	C join carries
466	br	r31,	$ret5
467$fix6:	addq	r5,	r4,	r5	C can't carry twice!
468	br	r31,	$ret6
469$fix0c:	bis	r14,	r29,	r14	C join carries
470	br	r31,	$ret0c
471$fix1c:	bis	r24,	r14,	r24	C join carries
472	br	r31,	$ret1c
473$fix2c:	bis	r25,	r24,	r25	C join carries
474	br	r31,	$ret2c
475$fix3c:	bis	r15,	r25,	r15	C join carries
476	br	r31,	$ret3c
477$fix4c:	bis	r29,	r15,	r29	C join carries
478	br	r31,	$ret4c
479$fix5c:	bis	r4,	r29,	r4	C join carries
480	br	r31,	$ret5c
481$fix6c:	addq	r5,	r4,	r5	C can't carry twice!
482	br	r31,	$ret6c
483
484EPILOGUE(mpn_mul_1)
485ASM_END()
486