dnl  SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
dnl  store sum in a third limb vector.

dnl  Copyright 2001, 2002, 2003, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     4
C UltraSPARC 3:	      4.5

C Compute carry-out from the most significant bits of u, v, and r, where
C r = u + v + carry_in, using logic operations.
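
C For reference, a C sketch of what this routine computes (illustrative
C only, not part of the build; assumes 64-bit limbs, with types as in
C gmp.h):
C
C	mp_limb_t
C	mpn_add_nc (mp_limb_t *rp, const mp_limb_t *up,
C		    const mp_limb_t *vp, mp_size_t n, mp_limb_t cy)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    {
C	      mp_limb_t r = up[i] + vp[i] + cy;	/* cy is 0 or 1 */
C	      cy = ((up[i] & vp[i]) | ((up[i] | vp[i]) & ~r)) >> 63;
C	      rp[i] = r;
C	    }
C	  return cy;
C	}
C
C The carry line is the andn/and/or/srlx chain used below: a carry comes
C out of bit 63 iff both top bits are set, or at least one is set and the
C top bit of the sum is clear.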

C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4-insn
C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
C Therefore, it seems futile to try to optimize this any further...
33
34C INPUT PARAMETERS
35define(`rp',`%i0')
36define(`up',`%i1')
37define(`vp',`%i2')
38define(`n',`%i3')
39
40define(`u0',`%l0')
41define(`u1',`%l2')
42define(`u2',`%l4')
43define(`u3',`%l6')
44define(`v0',`%l1')
45define(`v1',`%l3')
46define(`v2',`%l5')
47define(`v3',`%l7')
48
49define(`cy',`%i4')
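
C cy is the carry-in operand of mpn_add_nc; as the fifth argument it
C arrives in %i4 under the SPARC calling convention.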

define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
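
C These FP quasi-nops pad out the dispatch groups without consuming
C integer-unit issue slots, presumably to keep the intended instruction
C grouping stable (an inferred rationale; the original comments only
C call them quasi nops).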

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)
PROLOGUE(mpn_add_nc)
	save	%sp,-160,%sp

	fitod	%f0,%f0		C make sure f0 contains small, quiet number
	subcc	n,4,%g0
	bl,pn	%xcc,.Loop0
	nop
	b,a	L(com)
EPILOGUE()
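
C mpn_add_n is the cy = 0 entry point: it zeroes cy in the branch delay
C slot and falls through into the code shared with mpn_add_nc at L(com).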
PROLOGUE(mpn_add_n)
	save	%sp,-160,%sp

	fitod	%f0,%f0		C make sure f0 contains small, quiet number
	subcc	n,4,%g0
	bl,pn	%xcc,.Loop0
	mov	0,cy
L(com):
	ldx	[up+0],u0
	ldx	[vp+0],v0
	add	up,32,up
	ldx	[up-24],u1
	ldx	[vp+8],v1
	add	vp,32,vp
	ldx	[up-16],u2
	ldx	[vp-16],v2
	ldx	[up-8],u3
	ldx	[vp-8],v3
	subcc	n,8,n
	add	u0,v0,%g1	C main add
	add	%g1,cy,%g4	C carry add
	or	u0,v0,%g2
	bl,pn	%xcc,.Lend4567
	fanop
	b,a	.Loop

	.align	16
C START MAIN LOOP
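C The loop is unrolled four limbs deep.  Each "C --" below separates a
C four-insn dispatch group, i.e. one issue cycle on the 4-way UltraSPARC
C 1 and 2: 16 groups for 4 limbs gives the 4 cycles/limb quoted above.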
.Loop:	andn	%g2,%g4,%g2
	and	u0,v0,%g3
	ldx	[up+0],u0
	fanop
C --
	or	%g3,%g2,%g2
	ldx	[vp+0],v0
	add	up,32,up
	fanop
C --
	srlx	%g2,63,cy
	add	u1,v1,%g1
	stx	%g4,[rp+0]
	fanop
C --
	add	%g1,cy,%g4
	or	u1,v1,%g2
	fmnop
	fanop
C --
	andn	%g2,%g4,%g2
	and	u1,v1,%g3
	ldx	[up-24],u1
	fanop
C --
	or	%g3,%g2,%g2
	ldx	[vp+8],v1
	add	vp,32,vp
	fanop
C --
	srlx	%g2,63,cy
	add	u2,v2,%g1
	stx	%g4,[rp+8]
	fanop
C --
	add	%g1,cy,%g4
	or	u2,v2,%g2
	fmnop
	fanop
C --
	andn	%g2,%g4,%g2
	and	u2,v2,%g3
	ldx	[up-16],u2
	fanop
C --
	or	%g3,%g2,%g2
	ldx	[vp-16],v2
	add	rp,32,rp
	fanop
C --
	srlx	%g2,63,cy
	add	u3,v3,%g1
	stx	%g4,[rp-16]
	fanop
C --
	add	%g1,cy,%g4
	or	u3,v3,%g2
	fmnop
	fanop
C --
	andn	%g2,%g4,%g2
	and	u3,v3,%g3
	ldx	[up-8],u3
	fanop
C --
	or	%g3,%g2,%g2
	subcc	n,4,n
	ldx	[vp-8],v3
	fanop
C --
	srlx	%g2,63,cy
	add	u0,v0,%g1
	stx	%g4,[rp-8]
	fanop
C --
	add	%g1,cy,%g4
	or	u0,v0,%g2
	bge,pt	%xcc,.Loop
	fanop
C END MAIN LOOP
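C Wind-down: 4 to 7 limbs remain.  Finish the four limbs already in
C flight, then let .Loop0 mop up any 1 to 3 leftovers.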
.Lend4567:
	andn	%g2,%g4,%g2
	and	u0,v0,%g3
	or	%g3,%g2,%g2
	srlx	%g2,63,cy
	add	u1,v1,%g1
	stx	%g4,[rp+0]
	add	%g1,cy,%g4
	or	u1,v1,%g2
	andn	%g2,%g4,%g2
	and	u1,v1,%g3
	or	%g3,%g2,%g2
	srlx	%g2,63,cy
	add	u2,v2,%g1
	stx	%g4,[rp+8]
	add	%g1,cy,%g4
	or	u2,v2,%g2
	andn	%g2,%g4,%g2
	and	u2,v2,%g3
	or	%g3,%g2,%g2
	add	rp,32,rp
	srlx	%g2,63,cy
	add	u3,v3,%g1
	stx	%g4,[rp-16]
	add	%g1,cy,%g4
	or	u3,v3,%g2
	andn	%g2,%g4,%g2
	and	u3,v3,%g3
	or	%g3,%g2,%g2
	srlx	%g2,63,cy
	stx	%g4,[rp-8]

	addcc	n,4,n
	bz,pn	%xcc,.Lret
	fanop
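
C Scalar loop: one limb per iteration.  Handles n < 4 on entry as well
C as the 1 to 3 limbs left over after the wind-down above.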
.Loop0:	ldx	[up],u0
	add	up,8,up
	ldx	[vp],v0
	add	vp,8,vp
	add	rp,8,rp
	subcc	n,1,n
	add	u0,v0,%g1
	or	u0,v0,%g2
	add	%g1,cy,%g4
	and	u0,v0,%g3
	andn	%g2,%g4,%g2
	stx	%g4,[rp-8]
	or	%g3,%g2,%g2
	bnz,pt	%xcc,.Loop0
	srlx	%g2,63,cy
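
C Return the final carry in %i0, the function's result register.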
.Lret:	mov	cy,%i0
	ret
	restore
EPILOGUE()