xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc64/ultrasparc1234/sub_n.asm (revision 32d1c65c71fbdb65a012e8392a62a757dd6853e9)
1dnl  SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
2dnl  store difference in a third limb vector.
3
4dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C		   cycles/limb
35C UltraSPARC 1&2:     4
36C UltraSPARC 3:	      4.5
37
38C Compute carry-out from the most significant bits of u,v, and r, where
39C r=u-v-carry_in, using logic operations.
40
41C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
42C recurrency, and on the UltraSPARC 1 and 2 the IE units are 100% saturated.
43C Therefore, it seems futile to try to optimize this any further...
44
45C INPUT PARAMETERS
C rp is the base for the stx stores (result), up and vp are the bases for the
C ldx loads (the two operands), and n is the count that the loops decrement.
46define(`rp',`%i0')
47define(`up',`%i1')
48define(`vp',`%i2')
49define(`n',`%i3')
50
C Local registers for the four limb pairs kept in flight by the unrolled code:
C uK is loaded from up and vK from vp.
51define(`u0',`%l0')
52define(`u1',`%l2')
53define(`u2',`%l4')
54define(`u3',`%l6')
55define(`v0',`%l1')
56define(`v1',`%l3')
57define(`v2',`%l5')
58define(`v3',`%l7')
59
C Borrow passed from one limb to the next.  mpn_sub_n sets it to 0 on entry,
C mpn_sub_nc uses the value it finds here; the final value is copied to %i0
C just before returning.
60define(`cy',`%i4')
61
C Filler instructions used in the main loop.  Both only read %f0, which the
C entry code initialises with fitod, and write %f2 or %f4.
62define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
63define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
64
65ASM_START()
C %g2 and %g3 are used below as temporaries for the borrow computation.
C REGISTER is not defined in this file (presumably it comes in via config.m4);
C it appears to declare these globals to the assembler as scratch registers.
C NOTE(review): confirm against the macro definition.
66	REGISTER(%g2,#scratch)
67	REGISTER(%g3,#scratch)
C mpn_sub_nc -- entry point that subtracts with an incoming borrow.
C
C Unlike mpn_sub_n, this entry never writes cy (%i4) before the first limb is
C processed, so whatever the caller left there is used as the borrow into the
C least significant limb (presumably the fifth argument -- confirm against the
C C prototype).  All real work is done by code shared with mpn_sub_n: either
C .Loop0 or the unrolled path starting at L(com).
68PROLOGUE(mpn_sub_nc)
69	save	%sp,-160,%sp
70
71	fitod	%f0,%f0		C make sure f0 contains small, quiet number
C Set the condition codes from n-4; the difference itself is discarded in %g0.
72	subcc	n,4,%g0
C Fewer than 4 limbs (signed compare): use the one-limb-per-pass loop.
73	bl,pn	%xcc,.Loop0
C Delay slot.  Nothing useful to do here, since cy must be kept as supplied.
74	nop
C At least 4 limbs: join mpn_sub_n after its cy initialisation.  The branch is
C annulled (,a), so no delay-slot instruction follows it.
75	b,a	L(com)
76EPILOGUE()
77
C mpn_sub_n -- store up[] - vp[] (n limbs each) at rp[] and return the borrow.
C
C The borrow out of the most significant limb is returned in %i0, which the
C restore in the ret delay slot hands back to the caller.  It is produced by
C srlx ...,63 and is therefore 0 or 1.
C
C n must be > 0 (see the file header).  .Loop0 tests n only after decrementing
C it, so n = 0 would not stop on the first pass.
C
C For each limb, with r = u - v - cy, the borrow out is the top bit of
C	(r | (~u & v)) & ~(u & ~v)
C which is built as  orn u,v  /  orn r,...  /  andn u,v  /  andn  and then
C moved down to bit 0 with srlx by 63.
C
C Layout:
C   L(com)	load the first four limb pairs (n >= 4 on this path)
C   .Loop	four limbs per pass, loading the next four while storing results
C   .Lend4567	finish the four limb pairs already loaded, without further loads
C   .Loop0	one limb per pass, for the last 0..3 limbs or when n < 4
78PROLOGUE(mpn_sub_n)
79	save	%sp,-160,%sp
80
81	fitod	%f0,%f0		C make sure f0 contains small, quiet number
C Set the condition codes from n-4; the difference itself is discarded in %g0.
82	subcc	n,4,%g0
C Fewer than 4 limbs (signed compare): use the one-limb-per-pass loop.
83	bl,pn	%xcc,.Loop0
C Delay slot of a non-annulled branch, so it runs on both paths: no borrow
C into the first limb.  mpn_sub_nc enters at L(com) and so skips this.
84	mov	0,cy
85L(com):
C Load four limb pairs.  up and vp are each advanced by 32 part-way through,
C which is why the later loads use negative offsets.
86	ldx	[up+0],u0
87	ldx	[vp+0],v0
88	add	up,32,up
89	ldx	[up-24],u1
90	ldx	[vp+8],v1
91	add	vp,32,vp
92	ldx	[up-16],u2
93	ldx	[vp-16],v2
94	ldx	[up-8],u3
95	ldx	[vp-8],v3
C n -= 8: the result is non-negative only if, beyond the four limbs just
C loaded, at least four more remain for .Loop to load.
96	subcc	n,8,n
C Start limb 0 so that .Loop and .Lend4567 both begin with %g5 = r and
C %g2 = u0 | ~v0 already computed.
97	sub	u0,v0,%g1	C main sub
98	sub	%g1,cy,%g5	C carry sub
99	orn	u0,v0,%g2
C Between 4 and 7 limbs in total: skip the main loop.
100	bl,pn	%xcc,.Lend4567
101	fanop
C Annulled jump over whatever padding .align inserts before .Loop.
102	b,a	.Loop
103
104	.align	16
C Each group delimited by "C --" holds four instructions; fanop and fmnop are
C the filler instructions defined near the top of the file.
C On entry to every pass: %g5 = u0 - v0 - cy (not yet stored) and
C %g2 = u0 | ~v0, with u0/v0 still holding the operands of that limb.
105C START MAIN LOOP
106.Loop:	orn	%g5,%g2,%g2
107	andn	u0,v0,%g3
108	ldx	[up+0],u0
109	fanop
110C --
111	andn	%g2,%g3,%g2
112	ldx	[vp+0],v0
113	add	up,32,up
114	fanop
115C --
116	srlx	%g2,63,cy
117	sub	u1,v1,%g1
118	stx	%g5,[rp+0]
119	fanop
120C --
121	sub	%g1,cy,%g5
122	orn	u1,v1,%g2
123	fmnop
124	fanop
125C --
126	orn	%g5,%g2,%g2
127	andn	u1,v1,%g3
128	ldx	[up-24],u1
129	fanop
130C --
131	andn	%g2,%g3,%g2
132	ldx	[vp+8],v1
133	add	vp,32,vp
134	fanop
135C --
136	srlx	%g2,63,cy
137	sub	u2,v2,%g1
138	stx	%g5,[rp+8]
139	fanop
140C --
141	sub	%g1,cy,%g5
142	orn	u2,v2,%g2
143	fmnop
144	fanop
145C --
146	orn	%g5,%g2,%g2
147	andn	u2,v2,%g3
148	ldx	[up-16],u2
149	fanop
150C --
151	andn	%g2,%g3,%g2
152	ldx	[vp-16],v2
C rp is advanced here, so the remaining two stores of this pass use -16 and -8.
153	add	rp,32,rp
154	fanop
155C --
156	srlx	%g2,63,cy
157	sub	u3,v3,%g1
158	stx	%g5,[rp-16]
159	fanop
160C --
161	sub	%g1,cy,%g5
162	orn	u3,v3,%g2
163	fmnop
164	fanop
165C --
166	orn	%g5,%g2,%g2
167	andn	u3,v3,%g3
168	ldx	[up-8],u3
169	fanop
170C --
171	andn	%g2,%g3,%g2
C Count down by the four limbs loaded during this pass; the flags set here are
C the ones tested by the bge at the bottom (nothing in between writes them).
172	subcc	n,4,n
173	ldx	[vp-8],v3
174	fanop
175C --
176	srlx	%g2,63,cy
C Limb 0 of the next group of four, using the u0/v0 loaded at the top of this pass.
177	sub	u0,v0,%g1
178	stx	%g5,[rp-8]
179	fanop
180C --
181	sub	%g1,cy,%g5
182	orn	u0,v0,%g2
183	bge,pt	%xcc,.Loop
184	fanop
185C END MAIN LOOP
C Same per-limb sequence as .Loop for the four limb pairs already in
C registers, but without the loads and filler instructions.
186.Lend4567:
187	orn	%g5,%g2,%g2
188	andn	u0,v0,%g3
189	andn	%g2,%g3,%g2
190	srlx	%g2,63,cy
191	sub	u1,v1,%g1
192	stx	%g5,[rp+0]
193	sub	%g1,cy,%g5
194	orn	u1,v1,%g2
195	orn	%g5,%g2,%g2
196	andn	u1,v1,%g3
197	andn	%g2,%g3,%g2
198	srlx	%g2,63,cy
199	sub	u2,v2,%g1
200	stx	%g5,[rp+8]
201	sub	%g1,cy,%g5
202	orn	u2,v2,%g2
203	orn	%g5,%g2,%g2
204	andn	u2,v2,%g3
205	andn	%g2,%g3,%g2
206	add	rp,32,rp
207	srlx	%g2,63,cy
208	sub	u3,v3,%g1
209	stx	%g5,[rp-16]
210	sub	%g1,cy,%g5
211	orn	u3,v3,%g2
212	orn	%g5,%g2,%g2
213	andn	u3,v3,%g3
214	andn	%g2,%g3,%g2
215	srlx	%g2,63,cy
216	stx	%g5,[rp-8]
217
C n is -4..-1 here; adding 4 gives the number of limbs still to do (0..3).
218	addcc	n,4,n
219	bz,pn	%xcc,.Lret
220	fanop
221
C One limb per pass.  Also entered directly from both prologues when n < 4.
222.Loop0:	ldx	[up],u0
223	add	up,8,up
224	ldx	[vp],v0
225	add	vp,8,vp
226	add	rp,8,rp
227	subcc	n,1,n
228	sub	u0,v0,%g1
229	orn	u0,v0,%g2
230	sub	%g1,cy,%g5
231	andn	u0,v0,%g3
232	orn	%g5,%g2,%g2
C rp was already advanced above, hence the -8.
233	stx	%g5,[rp-8]
234	andn	%g2,%g3,%g2
235	bnz,pt	%xcc,.Loop0
C Delay slot of a non-annulled branch: cy is updated on every pass, including
C the last one.
236	srlx	%g2,63,cy
237
C Return the final borrow; restore executes in the delay slot of ret.
238.Lret:	mov	cy,%i0
239	ret
240	restore
241EPILOGUE(mpn_sub_n)
242