xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/alpha/ev6/add_n.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
2dnl  store sum in a third limb vector.
3
4dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C      cycles/limb
35C EV4:     ?
36C EV5:     5.4
37C EV6:     2.125
38
39C  INPUT PARAMETERS
40C  rp	r16
41C  up	r17
42C  vp	r18
43C  n	r19
44C  cy	r20   (for mpn_add_nc)
45
46C TODO
47C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
48C   Use multi-pronged feed-in.
49C   Perform additional micro-tuning
50
51C  This code was written in cooperation with ev6 pipeline expert Steve Root.
52
53C  Pair loads and stores where possible
54C  Store pairs oct-aligned where possible (didn't need it here)
55C  Stores are delayed every third cycle
56C  Loads and stores are delayed by fills
57C  U stays still, put code there where possible (note alternation of U1 and U0)
58C  L moves because of loads and stores
59C  Note dampers in L to limit damage
60
61C  This odd-looking optimization expects that we have random bits in our
62C  data, so that a pure zero result is unlikely.  So we penalize the unlikely
63C  case to help the common case.
64
C  Register aliases.  u0/u1 receive limbs loaded through r17 (up) and v0/v1
C  receive limbs loaded through r18 (vp); the two pairs alternate so that one
C  pair can be loading while the other is being added.
65define(`u0', `r0')  define(`u1', `r3')
66define(`v0', `r1')  define(`v1', `r4')
67
C  Carry registers.  cy0 is r20, the register in which mpn_add_nc receives its
C  carry-in.  The code below also keeps carries in r22 and r23, which have no
C  alias yet (see the TODO near the top of the file).
68define(`cy0', `r20')  define(`cy1', `r21')
69
C  Names the two entry points defined in this file.  The macro itself comes
C  from the included m4 definitions, which are not visible here.
70MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
71
72ASM_START()
C  mpn_add_nc -- mpn_add_n with a caller-supplied carry-in.
C
C  Takes the same r16-r19 arguments as mpn_add_n plus the carry-in in r20
C  (cy0).  It branches straight to $entry inside mpn_add_n, which skips only
C  the instruction that clears cy0, so everything after that point is shared.
73PROLOGUE(mpn_add_nc)
74	br	r31,	$entry
75EPILOGUE()
C  mpn_add_n -- add the n-limb vectors at up and vp, store the n-limb sum at
C  rp, and return the carry out of the most significant limb.
C
C  In:    r16 = rp, r17 = up, r18 = vp, r19 = n
C         r20 = carry-in (cy0); cleared here, preset when entered at $entry
C         from mpn_add_nc.
C  Out:   r0 = carry out.
C  Uses:  r0-r5, r7, r8 and r20-r23 as scratch; r16-r19 are advanced or
C         consumed.  Returns through r26.
C
C  Layout:
C    n < 5   -> $Lsmall, a loop handling one limb per iteration.
C    n >= 5  -> feed-in (loads 5 limb pairs and starts 3 sums), then the 8-way
C               unrolled $Loop while at least 8 more limbs remain, then $Lend,
C               which finishes the 5 limbs still held in registers.  Any
C               limbs left after that (fewer than 8) go through the one-limb
C               loop.
C
C  Carry handling in the unrolled code: each limb is sum = u + v + cin.  The
C  carry out of u + v is obtained with cmpult.  Adding cin (0 or 1) afterwards
C  can wrap only when the result is exactly zero, so instead of a second
C  compare the result is tested with beq.  When it is zero, one of the
C  out-of-line $fix stubs at the end of the function ORs cin into the
C  carry-out register of that limb and branches back.
76PROLOGUE(mpn_add_n)
77	bis	r31,	r31,	cy0	C clear carry in
78$entry:	cmpult	r19,	5,	r22	C L1 r22 = (n < 5)
79	ldq	u1,	0(r17)		C L0 get next ones
80	ldq	v1,	0(r18)		C L1
81	bne	r22,	$Lsmall
82
C  Feed-in.  On reaching $Loop or $Lend the state is: r5 and r8 are finished
C  sums for limbs 0 and 1 (not yet stored), r7 is the raw sum of limb 2 with
C  its carry-in still pending in r22 and its own carry-out in r23, and the
C  operand pairs for limbs 3 and 4 are loaded in u0/v0 and u1/v1.
83	ldq	u0,	8(r17)		C L0 get next ones
84	ldq	v0,	8(r18)		C L1
85	addq	u1,	v1,	r5	C U0 add two data
86
87	cmpult	r5,	v1,	r23	C U0 did it carry
88	ldq	u1,	16(r17)		C L0 get next ones
89	ldq	v1,	16(r18)		C L1
90
91	addq	u0,	v0,	r8	C U1 add two data
92	addq	r5,	cy0,	r5	C U0 carry in
93
94	cmpult	r8,	v0,	r22	C U1 did it carry
95	beq	r5,	$fix5f		C U0 fix exact zero
96$ret5f:	ldq	u0,	24(r17)		C L0 get next ones
97	ldq	v0,	24(r18)		C L1
98
99	addq	r8,	r23,	r8	C U1 carry from last
100	addq	u1,	v1,	r7	C U0 add two data
101
102	beq	r8,	$fix6f		C U1 fix exact zero
103$ret6f:	cmpult	r7,	v1,	r23	C U0 did it carry
104	ldq	u1,	32(r17)		C L0 get next ones
105	ldq	v1,	32(r18)		C L1
106
107	lda	r17,	40(r17)		C L0 move pointer
108	lda	r18,	40(r18)		C L1 move pointer
109
C  Bias rp down by one limb so that the stores at 8(r16), 16(r16), ... below
C  land on rp[0], rp[1], ...  The counter is reduced by the 5 limbs already
C  loaded plus the 8 that one pass of $Loop loads; a negative result means
C  there are not enough limbs for a full pass, so the loop is skipped.
110	lda	r16,	-8(r16)
111	lda	r19,	-13(r19)	C L1 move counter
112	blt	r19,	$Lend		C U1 loop control
113
114
115C Main loop.  8-way unrolled.
C  Each pass loads 8 new limb pairs and stores 8 results.  The carry-out
C  registers rotate through r23, cy1, cy0, r22 (twice per pass), and each
C  beq/$fix pair is the exact-zero test described in the function header.
116	ALIGN(16)
117$Loop:	addq	u0,	v0,	r2	C U1 add two data
118	addq	r7,	r22,	r7	C U0 add in carry
119	stq	r5,	8(r16)		C L0 put an answer
120	stq	r8,	16(r16)		C L1 pair
121
122	cmpult	r2,	v0,	cy1	C U1 did it carry
123	beq	r7,	$fix7		C U0 fix exact 0
124$ret7:	ldq	u0,	0(r17)		C L0 get next ones
125	ldq	v0,	0(r18)		C L1
126
127	bis	r31,	r31,	r31	C L  damp out
128	addq	r2,	r23,	r2	C U1 carry from last
129	bis	r31,	r31,	r31	C L  moves in L !
130	addq	u1,	v1,	r5	C U0 add two data
131
132	beq	r2,	$fix0		C U1 fix exact zero
133$ret0:	cmpult	r5,	v1,	cy0	C U0 did it carry
134	ldq	u1,	8(r17)		C L0 get next ones
135	ldq	v1,	8(r18)		C L1
136
137	addq	u0,	v0,	r8	C U1 add two data
138	addq	r5,	cy1,	r5	C U0 carry from last
139	stq	r7,	24(r16)		C L0 store pair
140	stq	r2,	32(r16)		C L1
141
142	cmpult	r8,	v0,	r22	C U1 did it carry
143	beq	r5,	$fix1		C U0 fix exact zero
144$ret1:	ldq	u0,	16(r17)		C L0 get next ones
145	ldq	v0,	16(r18)		C L1
146
C  rp is advanced by the full 64 bytes here, in the middle of the pass, so the
C  four remaining stores of this pass use offsets -24, -16, -8 and 0.
147	lda	r16,	64(r16)		C L0 move pointer
148	addq	r8,	cy0,	r8	C U1 carry from last
149	lda	r19,	-8(r19)		C L1 move counter
150	addq	u1,	v1,	r7	C U0 add two data
151
152	beq	r8,	$fix2		C U1 fix exact zero
153$ret2:	cmpult	r7,	v1,	r23	C U0 did it carry
154	ldq	u1,	24(r17)		C L0 get next ones
155	ldq	v1,	24(r18)		C L1
156
157	addq	u0,	v0,	r2	C U1 add two data
158	addq	r7,	r22,	r7	C U0 add in carry
159	stq	r5,	-24(r16)	C L0 put an answer
160	stq	r8,	-16(r16)	C L1 pair
161
162	cmpult	r2,	v0,	cy1	C U1 did it carry
163	beq	r7,	$fix3		C U0 fix exact 0
164$ret3:	ldq	u0,	32(r17)		C L0 get next ones
165	ldq	v0,	32(r18)		C L1
166
167	bis	r31,	r31,	r31	C L  damp out
168	addq	r2,	r23,	r2	C U1 carry from last
169	bis	r31,	r31,	r31	C L  moves in L !
170	addq	u1,	v1,	r5	C U0 add two data
171
172	beq	r2,	$fix4		C U1 fix exact zero
173$ret4:	cmpult	r5,	v1,	cy0	C U0 did it carry
174	ldq	u1,	40(r17)		C L0 get next ones
175	ldq	v1,	40(r18)		C L1
176
177	addq	u0,	v0,	r8	C U1 add two data
178	addq	r5,	cy1,	r5	C U0 carry from last
179	stq	r7,	-8(r16)		C L0 store pair
180	stq	r2,	0(r16)		C L1
181
182	cmpult	r8,	v0,	r22	C U1 did it carry
183	beq	r5,	$fix5		C U0 fix exact zero
184$ret5:	ldq	u0,	48(r17)		C L0 get next ones
185	ldq	v0,	48(r18)		C L1
186
C  Loads into r31 discard their result; these two touch both sources 256
C  bytes ahead of the current read position.
187	ldl	r31, 256(r17)		C L0 prefetch
188	addq	r8,	cy0,	r8	C U1 carry from last
189	ldl	r31, 256(r18)		C L1 prefetch
190	addq	u1,	v1,	r7	C U0 add two data
191
192	beq	r8,	$fix6		C U1 fix exact zero
193$ret6:	cmpult	r7,	v1,	r23	C U0 did it carry
194	ldq	u1,	56(r17)		C L0 get next ones
195	ldq	v1,	56(r18)		C L1
196
197	lda	r17,	64(r17)		C L0 move pointer
198	bis	r31,	r31,	r31	C U
199	lda	r18,	64(r18)		C L1 move pointer
200	bge	r19,	$Loop		C U1 loop control
201C ==== main loop end
202
C  Wind-down.  No further loads: finish the five limbs whose sums or operands
C  are already in registers and store them at 8(r16) .. 40(r16).  The carry
C  out of the last of them ends up in cy0.
203$Lend:	addq	u0,	v0,	r2	C U1 add two data
204	addq	r7,	r22,	r7	C U0 add in carry
205	stq	r5,	8(r16)		C L0 put an answer
206	stq	r8,	16(r16)		C L1 pair
207	cmpult	r2,	v0,	cy1	C U1 did it carry
208	beq	r7,	$fix7c		C U0 fix exact 0
209$ret7c:	addq	r2,	r23,	r2	C U1 carry from last
210	addq	u1,	v1,	r5	C U0 add two data
211	beq	r2,	$fix0c		C U1 fix exact zero
212$ret0c:	cmpult	r5,	v1,	cy0	C U0 did it carry
213	addq	r5,	cy1,	r5	C U0 carry from last
214	stq	r7,	24(r16)		C L0 store pair
215	stq	r2,	32(r16)		C L1
216	beq	r5,	$fix1c		C U0 fix exact zero
217$ret1c:	stq	r5,	40(r16)		C L0 put an answer
218	lda	r16,	48(r16)		C L0 move pointer
219
C  r19 is in -8..-1 here, so adding 8 gives the number of limbs not yet
C  processed (0..7).  If any remain, load the next pair and fall into the
C  one-limb loop with the running carry in cy0.
220	lda	r19,	8(r19)
221	beq	r19,	$Lret
222
223	ldq	u1,	0(r17)
224	ldq	v1,	0(r18)
C  One-limb-per-iteration path.  On entry r19 = limbs left (at least 1),
C  u1/v1 hold the first pair and cy0 the carry-in.  $Loop0 handles all but
C  the last limb, loading the following pair each time; $Lend0 handles the
C  last limb without loading anything further and returns the carry in r0.
225$Lsmall:
226	lda	r19,	-1(r19)
227	beq	r19,	$Lend0
228
229	ALIGN(8)
230$Loop0:	addq	u1,	v1,	r2	C main add
231	cmpult	r2,	v1,	r8	C compute cy from last add
232	ldq	u1,	8(r17)
233	ldq	v1,	8(r18)
234	addq	r2,	cy0,	r5	C carry add
235	lda	r17,	8(r17)
236	lda	r18,	8(r18)
237	stq	r5,	0(r16)
238	cmpult	r5,	r2,	cy0	C compute cy from last add
239	lda	r19,	-1(r19)		C decr loop cnt
240	bis	r8,	cy0,	cy0	C combine cy from the two adds
241	lda	r16,	8(r16)
242	bne	r19,	$Loop0
243$Lend0:	addq	u1,	v1,	r2	C main add
244	addq	r2,	cy0,	r5	C carry add
245	cmpult	r2,	v1,	r8	C compute cy from last add
246	cmpult	r5,	r2,	cy0	C compute cy from last add
247	stq	r5,	0(r16)
248	bis	r8,	cy0,	r0	C combine cy from the two adds
249	ret	r31,(r26),1
250
C  Return taken when the unrolled code consumed every limb.
251	ALIGN(8)
252$Lret:	lda	r0,	0(cy0)		C copy carry into return register
253	ret	r31,(r26),1
254
C  Out-of-line fixups for the exact-zero case.  Each stub is reached when a
C  sum plus its carry-in came out as zero; it ORs that carry-in into the
C  carry-out register of the same limb and branches back to the matching
C  $ret label.  The f suffix marks the feed-in stubs and the c suffix the
C  wind-down stubs.
255$fix5f:	bis	r23,	cy0,	r23	C bring forward carry
256	br	r31,	$ret5f
257$fix6f:	bis	r22,	r23,	r22	C bring forward carry
258	br	r31,	$ret6f
259$fix0:	bis	cy1,	r23,	cy1	C bring forward carry
260	br	r31,	$ret0
261$fix1:	bis	cy0,	cy1,	cy0	C bring forward carry
262	br	r31,	$ret1
263$fix2:	bis	r22,	cy0,	r22	C bring forward carry
264	br	r31,	$ret2
265$fix3:	bis	r23,	r22,	r23	C bring forward carry
266	br	r31,	$ret3
267$fix4:	bis	cy1,	r23,	cy1	C bring forward carry
268	br	r31,	$ret4
269$fix5:	bis	cy1,	cy0,	cy0	C bring forward carry
270	br	r31,	$ret5
271$fix6:	bis	r22,	cy0,	r22	C bring forward carry
272	br	r31,	$ret6
273$fix7:	bis	r23,	r22,	r23	C bring forward carry
274	br	r31,	$ret7
275$fix0c:	bis	cy1,	r23,	cy1	C bring forward carry
276	br	r31,	$ret0c
277$fix1c:	bis	cy0,	cy1,	cy0	C bring forward carry
278	br	r31,	$ret1c
279$fix7c:	bis	r23,	r22,	r23	C bring forward carry
280	br	r31,	$ret7c
281
282EPILOGUE()
283ASM_END()
284