xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/alpha/ev6/sub_n.asm (revision 901e7e84758515fbf39dfc064cb0b45ab146d8b0)
1dnl  Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
2dnl  and store difference in a third limb vector.
3
4dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C      cycles/limb
35C EV4:     ?
36C EV5:     5.4
37C EV6:     2.125
38
39C  INPUT PARAMETERS
40C  rp	r16
41C  up	r17
42C  vp	r18
43C  n	r19
44C  cy	r20   (for mpn_sub_nc)
45
46C TODO
47C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
48C   Use multi-pronged feed-in.
49C   Perform additional micro-tuning
50
51C  This code was written in cooperation with ev6 pipeline expert Steve Root.
52
53C  Pair loads and stores where possible
54C  Store pairs oct-aligned where possible (didn't need it here)
55C  Stores are delayed every third cycle
56C  Loads and stores are delayed by fills
57C  U stays still, put code there where possible (note alternation of U1 and U0)
58C  L moves because of loads and stores
59C  Note dampers in L to limit damage
60
61C  This odd-looking optimization expects that we have random bits in our
62C  data, so that a pure zero result is unlikely.  So we penalize the unlikely
63C  case to help the common case.
64
C Register aliases for the operand limbs.  Two limb pairs are kept in flight:
C u0/v0 hold one up/vp limb pair while u1/v1 hold the following one.
65define(`u0', `r0')  define(`u1', `r3')
66define(`v0', `r1')  define(`v1', `r4')
67
C Borrow registers.  cy0 is r20, the register listed above as the cy input
C parameter.  The unrolled code also keeps borrows in r22 and r23 (see TODO).
68define(`cy0', `r20')  define(`cy1', `r21')
69
C The two entry points named here are both defined below.  (The exact effect
C of MULFUNC_PROLOGUE is defined in the m4 support files, not in this file.)
70MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
71
72ASM_START()
C mpn_sub_nc: subtract with a borrow-in supplied by the caller in r20 (cy0).
C It branches unconditionally to $entry inside mpn_sub_n, so the only thing
C it skips is the instruction that zeroes cy0.
73PROLOGUE(mpn_sub_nc)
74	br	r31,	$entry
75EPILOGUE()
C mpn_sub_n: subtract the n-limb vector at vp (r18) from the n-limb vector at
C up (r17), store the n-limb difference at rp (r16) and return the borrow out
C of the top limb in r0.  n (r19) must be greater than zero.
C
C Every limb is handled the same way: d = u - v and b = (u < v) are computed
C without waiting for the incoming borrow, and d - borrow_in is what gets
C stored.  The incoming borrow can only change the outgoing borrow when d is
C exactly zero, so only that case branches, to an out-of-line $fix stub that
C ORs the incoming borrow into b and jumps back.
76PROLOGUE(mpn_sub_n)
77	bis	r31,	r31,	cy0	C clear carry in
C Common entry for mpn_sub_n and mpn_sub_nc.  Operands shorter than 5 limbs
C go to the simple loop at $Lsmall, which expects limb 0 already in u1/v1.
78$entry:	cmpult	r19,	5,	r22	C L1 move counter
79	ldq	u1,	0(r17)		C L0 get next ones
80	ldq	v1,	0(r18)		C L1
81	bne	r22,	$Lsmall
82
C Feed-in for the unrolled code (n >= 5).  When it is done, five limbs are
C in flight and the state expected at both $Loop and $Lend is:
C   r24, r25   finished results for limbs 0 and 1, not yet stored
C   r7         raw difference of limb 2, r22 its borrow in, r23 its raw borrow
C   u0, v0     limb 3 loaded;  u1, v1  limb 4 loaded
C   r17, r18   advanced by 40 bytes (5 limbs);  r16 = rp - 8
C   r19        n - 13, negative when there is no full 8-limb pass to run
83	ldq	u0,	8(r17)		C L0 get next ones
84	ldq	v0,	8(r18)		C L1
85	subq	u1,	v1,	r5	C U0 sub two data
86
87	cmpult	u1,	v1,	r23	C U0 did it borrow
88	ldq	u1,	16(r17)		C L0 get next ones
89	ldq	v1,	16(r18)		C L1
90
91	subq	u0,	v0,	r8	C U1 sub two data
92	subq	r5,	cy0,	r24	C U0 borrow in
93
94	cmpult	u0,	v0,	r22	C U1 did it borrow
95	beq	r5,	$fix5f		C U0 fix exact zero
96$ret5f:	ldq	u0,	24(r17)		C L0 get next ones
97	ldq	v0,	24(r18)		C L1
98
99	subq	r8,	r23,	r25	C U1 borrow from last
100	subq	u1,	v1,	r7	C U0 sub two data
101
102	beq	r8,	$fix6f		C U1 fix exact zero
103$ret6f:	cmpult	u1,	v1,	r23	C U0 did it borrow
104	ldq	u1,	32(r17)		C L0 get next ones
105	ldq	v1,	32(r18)		C L1
106
107	lda	r17,	40(r17)		C L0 move pointer
108	lda	r18,	40(r18)		C L1 move pointer
109
110	lda	r16,	-8(r16)
111	lda	r19,	-13(r19)	C L1 move counter
112	blt	r19,	$Lend		C U1 loop control
113
114
C Each pass stores 8 result limbs and loads 8 new limb pairs.  r16, r17 and
C r18 each advance by 64 bytes and r19 drops by 8.  The running borrow
C rotates through r22 -> r23 -> cy1 -> cy0 and back to r22, twice per pass.
C The stores after r16 has been advanced use negative offsets, so the eight
C stores of a pass cover eight consecutive limbs.
115C Main loop.  8-way unrolled.
116	ALIGN(16)
117$Loop:	subq	u0,	v0,	r2	C U1 sub two data
118	stq	r24,	8(r16)		C L0 put an answer
119	subq	r7,	r22,	r24	C U0 borrow from last
120	stq	r25,	16(r16)		C L1 pair
121
122	cmpult	u0,	v0,	cy1	C U1 did it borrow
123	beq	r7,	$fix7		C U0 fix exact 0
124$ret7:	ldq	u0,	0(r17)		C L0 get next ones
125	ldq	v0,	0(r18)		C L1
126
127	bis	r31,	r31,	r31	C L  damp out
128	subq	r2,	r23,	r25	C U1 borrow from last
129	bis	r31,	r31,	r31	C L  moves in L !
130	subq	u1,	v1,	r5	C U0 sub two data
131
132	beq	r2,	$fix0		C U1 fix exact zero
133$ret0:	cmpult	u1,	v1,	cy0	C U0 did it borrow
134	ldq	u1,	8(r17)		C L0 get next ones
135	ldq	v1,	8(r18)		C L1
136
137	subq	u0,	v0,	r8	C U1 sub two data
138	stq	r24,	24(r16)		C L0 store pair
139	subq	r5,	cy1,	r24	C U0 borrow from last
140	stq	r25,	32(r16)		C L1
141
142	cmpult	u0,	v0,	r22	C U1 did it borrow
143	beq	r5,	$fix1		C U0 fix exact zero
144$ret1:	ldq	u0,	16(r17)		C L0 get next ones
145	ldq	v0,	16(r18)		C L1
146
147	lda	r16,	64(r16)		C L0 move pointer
148	subq	r8,	cy0,	r25	C U1 borrow from last
149	lda	r19,	-8(r19)		C L1 move counter
150	subq	u1,	v1,	r7	C U0 sub two data
151
152	beq	r8,	$fix2		C U1 fix exact zero
153$ret2:	cmpult	u1,	v1,	r23	C U0 did it borrow
154	ldq	u1,	24(r17)		C L0 get next ones
155	ldq	v1,	24(r18)		C L1
156
157	subq	u0,	v0,	r2	C U1 sub two data
158	stq	r24,	-24(r16)	C L0 put an answer
159	subq	r7,	r22,	r24	C U0 borrow from last
160	stq	r25,	-16(r16)	C L1 pair
161
162	cmpult	u0,	v0,	cy1	C U1 did it borrow
163	beq	r7,	$fix3		C U0 fix exact 0
164$ret3:	ldq	u0,	32(r17)		C L0 get next ones
165	ldq	v0,	32(r18)		C L1
166
167	bis	r31,	r31,	r31	C L  damp out
168	subq	r2,	r23,	r25	C U1 borrow from last
169	bis	r31,	r31,	r31	C L  moves in L !
170	subq	u1,	v1,	r5	C U0 sub two data
171
172	beq	r2,	$fix4		C U1 fix exact zero
173$ret4:	cmpult	u1,	v1,	cy0	C U0 did it borrow
174	ldq	u1,	40(r17)		C L0 get next ones
175	ldq	v1,	40(r18)		C L1
176
177	subq	u0,	v0,	r8	C U1 sub two data
178	stq	r24,	-8(r16)		C L0 store pair
179	subq	r5,	cy1,	r24	C U0 borrow from last
180	stq	r25,	0(r16)		C L1
181
182	cmpult	u0,	v0,	r22	C U1 did it borrow
183	beq	r5,	$fix5		C U0 fix exact zero
184$ret5:	ldq	u0,	48(r17)		C L0 get next ones
185	ldq	v0,	48(r18)		C L1
186
187	ldl	r31, 256(r17)		C L0 prefetch
188	subq	r8,	cy0,	r25	C U1 borrow from last
189	ldl	r31, 256(r18)		C L1 prefetch
190	subq	u1,	v1,	r7	C U0 sub two data
191
192	beq	r8,	$fix6		C U1 fix exact zero
193$ret6:	cmpult	u1,	v1,	r23	C U0 did it borrow
194	ldq	u1,	56(r17)		C L0 get next ones
195	ldq	v1,	56(r18)		C L1
196
197	lda	r17,	64(r17)		C L0 move pointer
198	bis	r31,	r31,	r31	C U
199	lda	r18,	64(r18)		C L1 move pointer
200	bge	r19,	$Loop		C U1 loop control
201C ==== main loop end
202
C Drain: finish the five limbs still in flight without loading any more.
C Five results are stored at 8(r16) through 40(r16) and the borrow out of
C the last of them is left in cy0.  r16 then moves forward 48 bytes, which
C is the five limbs plus the -8 bias applied in the feed-in.  r19 is in the
C range -8 to -1 here, so r19 + 8 is the number of limbs left (0 to 7).
203$Lend:	subq	u0,	v0,	r2	C U1 sub two data
204	stq	r24,	8(r16)		C L0 put an answer
205	subq	r7,	r22,	r24	C U0 borrow from last
206	stq	r25,	16(r16)		C L1 pair
207	cmpult	u0,	v0,	cy1	C U1 did it borrow
208	beq	r7,	$fix7c		C U0 fix exact 0
209$ret7c:	subq	r2,	r23,	r25	C U1 borrow from last
210	subq	u1,	v1,	r5	C U0 sub two data
211	beq	r2,	$fix0c		C U1 fix exact zero
212$ret0c:	cmpult	u1,	v1,	cy0	C U0 did it borrow
213	stq	r24,	24(r16)		C L0 store pair
214	subq	r5,	cy1,	r24	C U0 borrow from last
215	stq	r25,	32(r16)		C L1
216	beq	r5,	$fix1c		C U0 fix exact zero
217$ret1c:	stq	r24,	40(r16)		C L0 put an answer
218	lda	r16,	48(r16)		C L0 move pointer
219
220	lda	r19,	8(r19)
221	beq	r19,	$Lret
222
C Leftover limbs: load the first of them and fall into the simple loop with
C the running borrow in cy0.
223	ldq	u1,	0(r17)
224	ldq	v1,	0(r18)
C Simple code, one limb per pass.  On entry u1/v1 hold the first limb to
C process, r19 is the number of limbs to do (at least 1) and cy0 is the
C borrow in.  The loop body loads the next limb pair on every pass, so the
C last limb is peeled off into $Lend0, which does no load; nothing past the
C end of the operands is ever read.
225$Lsmall:
226	lda	r19,	-1(r19)
227	beq	r19,	$Lend0
228
229	ALIGN(8)
230$Loop0:	subq	u1,	v1,	r2	C main sub
231	cmpult	u1,	v1,	r8	C compute bw from last sub
232	ldq	u1,	8(r17)
233	ldq	v1,	8(r18)
234	subq	r2,	cy0,	r5	C borrow sub
235	lda	r17,	8(r17)
236	lda	r18,	8(r18)
237	stq	r5,	0(r16)
238	cmpult	r2,	cy0,	cy0	C compute bw from last sub
239	lda	r19,	-1(r19)		C decr loop cnt
240	bis	r8,	cy0,	cy0	C combine bw from the two subs
241	lda	r16,	8(r16)
242	bne	r19,	$Loop0
C Last limb: same computation as in $Loop0, but the combined borrow goes
C straight to the return register r0.
243$Lend0:	subq	u1,	v1,	r2	C main sub
244	subq	r2,	cy0,	r5	C borrow sub
245	cmpult	u1,	v1,	r8	C compute bw from last sub
246	cmpult	r2,	cy0,	cy0	C compute bw from last sub
247	stq	r5,	0(r16)
248	bis	r8,	cy0,	r0	C combine bw from the two subs
249	ret	r31,(r26),1
250
C Reached when no limbs remain after the drain at $Lend: the borrow out of
C the top limb is already in cy0.
251	ALIGN(8)
252$Lret:	lda	r0,	0(cy0)		C copy borrow into return register
253	ret	r31,(r26),1
254
C Out-of-line fix-ups, one per unrolled limb position (f = feed-in, c = drain).
C Each is entered only when the raw difference u - v of a limb was exactly
C zero.  In that case the compare result for the limb is zero and its borrow
C out equals its borrow in, so the stub ORs the incoming borrow into the
C outgoing borrow register and branches back to the matching $ret label.
255$fix5f:	bis	r23,	cy0,	r23	C bring forward borrow
256	br	r31,	$ret5f
257$fix6f:	bis	r22,	r23,	r22	C bring forward borrow
258	br	r31,	$ret6f
259$fix0:	bis	cy1,	r23,	cy1	C bring forward borrow
260	br	r31,	$ret0
261$fix1:	bis	cy0,	cy1,	cy0	C bring forward borrow
262	br	r31,	$ret1
263$fix2:	bis	r22,	cy0,	r22	C bring forward borrow
264	br	r31,	$ret2
265$fix3:	bis	r23,	r22,	r23	C bring forward borrow
266	br	r31,	$ret3
267$fix4:	bis	cy1,	r23,	cy1	C bring forward borrow
268	br	r31,	$ret4
269$fix5:	bis	cy1,	cy0,	cy0	C bring forward borrow
270	br	r31,	$ret5
271$fix6:	bis	r22,	cy0,	r22	C bring forward borrow
272	br	r31,	$ret6
273$fix7:	bis	r23,	r22,	r23	C bring forward borrow
274	br	r31,	$ret7
275$fix0c:	bis	cy1,	r23,	cy1	C bring forward borrow
276	br	r31,	$ret0c
277$fix1c:	bis	cy0,	cy1,	cy0	C bring forward borrow
278	br	r31,	$ret1c
279$fix7c:	bis	r23,	r22,	r23	C bring forward borrow
280	br	r31,	$ret7c
281
282EPILOGUE()
283ASM_END()
284