xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc32/v9/submul_1.asm (revision c7c727fae85036860d5bb848f2730ff419e2b060)
1dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2dnl  subtract the result from a second limb vector.
3
4dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C Algorithm: We use two floating-point multiplies per limb product, with the
24C invariant v operand split into two 16-bit pieces, and the u operand split
25C into 32-bit pieces.  We convert the two 48-bit products to 64-bit integers
26C and transfer them through a stack scratch area to the integer unit.
27
28C		   cycles/limb
29C UltraSPARC 1&2:     6.5
30C UltraSPARC 3:	      ?
31
32C Possible optimizations:
33C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
34C      memory bandwidth limited, this could save 1.5 cycles/limb.
35C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
36C      it is very straightforward to unroll, using an exit branch midway.
37C      Unrolling would allow deeper scheduling which could improve speed for
38C      the L2 cache case.
39C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
40C      aren't sufficiently apart-scheduled with just two temp areas.
41C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
42C      could save many operations.
43
44C INPUT PARAMETERS
45C rp	o0
46C up	o1
47C n	o2
48C v	o3
49
50define(`FSIZE',224)
C FSIZE is the byte count subtracted from %sp on entry and added back in the
C delay slot of the final retl.
51
52ASM_START()
C mpn_submul_1 (rp, up, n, v): subtract {up,n} * v from {rp,n} in place and
C return the borrow out of the most significant limb (see the end of the
C routine for how it is formed).
C
C No register window is allocated (there is no save), so the arguments are
C read from %o0 (rp), %o1 (up), %o2 (n) and %o3 (v).  up[0] is loaded before n
C is tested, so n >= 1 is presumably required -- confirm against callers.
C
C Register use:
C   %f6, %f8	v & 0xffff and v >> 16 converted to double (loop invariant)
C   %f10:%f11	zero : up[i], read as one 64-bit integer by fxtod
C   %f2		up[i] as a double
C   %f16, %f4	products up[i] * (v >> 16) and up[i] * (v & 0xffff)
C   %f14, %f12	the same products converted back to 64-bit integers
C   %o5		32-byte aligned scratch pointer; xor with 16 flips between the
C		two 16-byte halves used to hand products to the integer unit
C   %g2, %g1	p16 and p0, the two partial products reloaded as integers
C   %g5		rp[i]
C   %g4		rp[i] - (p0 + (p16 << 16) + cy); its low 32 bits are stored
C   %g3		cy, the borrow carried into the next limb
C
C The code is software pipelined.  The blocks .L_two_or_more through
C .L_five_or_more fill the pipeline; when the limbs run out early they branch
C to the matching wind-down entry .L1 through .L5, with rp biased beforehand
C so that the fixed store offsets used there address the right limbs.
53PROLOGUE(mpn_submul_1)
54	add	%sp, -FSIZE, %sp	C lower %sp by FSIZE bytes
55	sethi	%hi(0xffff), %g1
56	srl	%o3, 16, %g2		C g2 = v >> 16
57	or	%g1, %lo(0xffff), %g1	C g1 = 0xffff
58	and	%o3, %g1, %g1		C g1 = v & 0xffff
59	stx	%g1, [%sp+104]		C move both halves of v through memory
60	stx	%g2, [%sp+112]
61	ldd	[%sp+104], %f6		C ... into FP registers
62	ldd	[%sp+112], %f8
63	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
64	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
65	ld	[%sp+104], %f10		C zero f10: high word of the value stored at [%sp+104]
66
67	mov	0, %g3			C cy = 0
68
69define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
70
71	add	%sp, 160, %o5		C point in scratch area
72	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
73
74	subcc	%o2, 1, %o2		C n--, sets the flags tested by bne below
75	ld	[%o1], %f11		C read up[i]
76	add	%o1, 4, %o1		C up++
77	bne,pt	%icc, .L_two_or_more
78	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
79
C n == 1: form both products, move them to the integer unit, finish at .L1.
80	fmuld	%f2, %f8, %f16
81	fmuld	%f2, %f6, %f4
82	fdtox	%f16, %f14
83	fdtox	%f4, %f12
84	std	%f14, [%o5+16]
85	std	%f12, [%o5+24]
86	ldx	[%o5+16], %g2		C p16
87	ldx	[%o5+24], %g1		C p0
88	lduw	[%o0], %g5		C read rp[i]
89	b	.L1
90	add	%o0, -16, %o0		C (delay slot) bias rp: .L1 stores at [%o0+16]
91
92	.align	16
C n >= 2: start the products for the limb already converted, load the next.
93.L_two_or_more:
94	subcc	%o2, 1, %o2
95	ld	[%o1], %f11		C read up[i]
96	fmuld	%f2, %f8, %f16
97	fmuld	%f2, %f6, %f4
98	add	%o1, 4, %o1		C up++
99	bne,pt	%icc, .L_three_or_more
100	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
101
C n == 2: finish the products of both limbs, then wind down from .L2.
102	fdtox	%f16, %f14
103	fdtox	%f4, %f12
104	std	%f14, [%o5+16]
105	fmuld	%f2, %f8, %f16
106	std	%f12, [%o5+24]
107	fmuld	%f2, %f6, %f4
108	fdtox	%f16, %f14
109	fdtox	%f4, %f12
110	std	%f14, [%o5+0]
111	std	%f12, [%o5+8]
112	lduw	[%o0], %g5		C read rp[i]
113	ldx	[%o5+16], %g2		C p16
114	ldx	[%o5+24], %g1		C p0
115	b	.L2
116	add	%o0, -12, %o0		C (delay slot) bias rp: .L2 stores at [%o0+12]
117
118	.align	16
C n >= 3: park the first pair of products at [%o5+16], start the next pair.
119.L_three_or_more:
120	subcc	%o2, 1, %o2
121	ld	[%o1], %f11		C read up[i]
122	fdtox	%f16, %f14
123	fdtox	%f4, %f12
124	std	%f14, [%o5+16]
125	fmuld	%f2, %f8, %f16
126	std	%f12, [%o5+24]
127	fmuld	%f2, %f6, %f4
128	add	%o1, 4, %o1		C up++
129	bne,pt	%icc, .L_four_or_more
130	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
131
C n == 3: finish the remaining products, then wind down from .L3.
132	fdtox	%f16, %f14
133	fdtox	%f4, %f12
134	std	%f14, [%o5+0]
135	fmuld	%f2, %f8, %f16
136	std	%f12, [%o5+8]
137	fmuld	%f2, %f6, %f4
138	fdtox	%f16, %f14
139	ldx	[%o5+16], %g2		C p16
140	fdtox	%f4, %f12
141	ldx	[%o5+24], %g1		C p0
142	std	%f14, [%o5+16]
143	std	%f12, [%o5+24]
144	lduw	[%o0], %g5		C read rp[i]
145	b	.L3
146	add	%o0, -8, %o0		C (delay slot) bias rp: .L3 stores at [%o0+8]
147
148	.align	16
C n >= 4: park the second pair of products at [%o5+0], start the next pair.
149.L_four_or_more:
150	subcc	%o2, 1, %o2
151	ld	[%o1], %f11		C read up[i]
152	fdtox	%f16, %f14
153	fdtox	%f4, %f12
154	std	%f14, [%o5+0]
155	fmuld	%f2, %f8, %f16
156	std	%f12, [%o5+8]
157	fmuld	%f2, %f6, %f4
158	add	%o1, 4, %o1		C up++
159	bne,pt	%icc, .L_five_or_more
160	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
161
C n == 4: reload the oldest products before their slot is reused, start the
C last pair of multiplies, then wind down from .L4.
162	fdtox	%f16, %f14
163	ldx	[%o5+16], %g2		C p16
164	fdtox	%f4, %f12
165	ldx	[%o5+24], %g1		C p0
166	std	%f14, [%o5+16]
167	fmuld	%f2, %f8, %f16
168	std	%f12, [%o5+24]
169	fmuld	%f2, %f6, %f4
170	add	%o1, 4, %o1		C up++
171	lduw	[%o0], %g5		C read rp[i]
172	b	.L4
173	add	%o0, -4, %o0		C (delay slot) bias rp: .L4 stores at [%o0+4]
174
175	.align	16
C n >= 5: the pipeline is now full; enter the main loop if limbs remain,
C otherwise go straight to the first wind-down stage.
176.L_five_or_more:
177	subcc	%o2, 1, %o2
178	ld	[%o1], %f11		C read up[i]
179	fdtox	%f16, %f14
180	ldx	[%o5+16], %g2		C p16
181	fdtox	%f4, %f12
182	ldx	[%o5+24], %g1		C p0
183	std	%f14, [%o5+16]
184	fmuld	%f2, %f8, %f16
185	std	%f12, [%o5+24]
186	fmuld	%f2, %f6, %f4
187	add	%o1, 4, %o1		C up++
188	lduw	[%o0], %g5		C read rp[i]
189	bne,pt	%icc, .Loop
190	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
191	b,a	.L5			C annulled: the following instruction is not executed
192
193C BEGIN MAIN LOOP
C Each iteration retires one limb (computes and stores rp[i] - p - cy) while
C converting, multiplying and loading for later limbs.
194	.align 16
195C -- 0
196.Loop:	sub	%g0, %g3, %g3		C negate the high word kept from the last difference
197	subcc	%o2, 1, %o2
198	ld	[%o1], %f11		C read up[i]
199	fdtox	%f16, %f14
200C -- 1
201	sllx	%g2, 16, %g4		C (p16 << 16)
202	add	%o0, 4, %o0		C rp++
203	ldx	[%o5+0], %g2		C p16
204	fdtox	%f4, %f12
205C -- 2
206	srl	%g3, 0, %g3		C zero most significant 32 bits
207	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
208	ldx	[%o5+8], %g1		C p0
209	fanop
210C -- 3
211	nop
212	add	%g3, %g4, %g4		C p += cy
213	std	%f14, [%o5+0]
214	fmuld	%f2, %f8, %f16
215C -- 4
216	nop
217	sub	%g5, %g4, %g4		C p = rp[i] - p
218	std	%f12, [%o5+8]
219	fmuld	%f2, %f6, %f4
220C -- 5
221	xor	%o5, 16, %o5		C alternate scratch variables
222	add	%o1, 4, %o1		C up++
223	stw	%g4, [%o0-4]		C store the low 32 bits of the difference
224	fanop
225C -- 6
226	srlx	%g4, 32, %g3		C new cy
227	lduw	[%o0], %g5		C read rp[i]
228	bne,pt	%icc, .Loop
229	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
230C END MAIN LOOP
231
C Wind-down: .L5 through .L1 drain the pipeline.  Each stage retires one limb
C and does progressively less look-ahead work.  The feed-in code enters
C .L4 ... .L1 at the label, after the negation of %g3, with cy still 0.
232.L5:	sub	%g0, %g3, %g3
233	fdtox	%f16, %f14
234	sllx	%g2, 16, %g4		C (p16 << 16)
235	ldx	[%o5+0], %g2		C p16
236	fdtox	%f4, %f12
237	srl	%g3, 0, %g3		C zero most significant 32 bits
238	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
239	ldx	[%o5+8], %g1		C p0
240	add	%g4, %g3, %g4		C p += cy
241	std	%f14, [%o5+0]
242	fmuld	%f2, %f8, %f16
243	sub	%g5, %g4, %g4		C p = rp[i] - p
244	std	%f12, [%o5+8]
245	fmuld	%f2, %f6, %f4
246	xor	%o5, 16, %o5
247	stw	%g4, [%o0+0]
248	srlx	%g4, 32, %g3		C new cy
249	lduw	[%o0+4], %g5		C read rp[i]
250
251	sub	%g0, %g3, %g3
252.L4:	fdtox	%f16, %f14
253	sllx	%g2, 16, %g4		C (p16 << 16)
254	ldx	[%o5+0], %g2		C p16
255	fdtox	%f4, %f12
256	srl	%g3, 0, %g3		C zero most significant 32 bits
257	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
258	ldx	[%o5+8], %g1		C p0
259	add	%g3, %g4, %g4		C p += cy
260	std	%f14, [%o5+0]
261	sub	%g5, %g4, %g4		C p = rp[i] - p
262	std	%f12, [%o5+8]
263	xor	%o5, 16, %o5
264	stw	%g4, [%o0+4]
265	srlx	%g4, 32, %g3		C new cy
266	lduw	[%o0+8], %g5		C read rp[i]
267
268	sub	%g0, %g3, %g3
269.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
270	ldx	[%o5+0], %g2		C p16
271	srl	%g3, 0, %g3		C zero most significant 32 bits
272	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
273	ldx	[%o5+8], %g1		C p0
274	add	%g3, %g4, %g4		C p += cy
275	sub	%g5, %g4, %g4		C p = rp[i] - p
276	xor	%o5, 16, %o5
277	stw	%g4, [%o0+8]
278	srlx	%g4, 32, %g3		C new cy
279	lduw	[%o0+12], %g5		C read rp[i]
280
281	sub	%g0, %g3, %g3
282.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
283	ldx	[%o5+0], %g2		C p16
284	srl	%g3, 0, %g3		C zero most significant 32 bits
285	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
286	ldx	[%o5+8], %g1		C p0
287	add	%g3, %g4, %g4		C p += cy
288	sub	%g5, %g4, %g4		C p = rp[i] - p
289	stw	%g4, [%o0+12]
290	srlx	%g4, 32, %g3		C new cy
291	lduw	[%o0+16], %g5		C read rp[i]
292
293	sub	%g0, %g3, %g3
294.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
295	srl	%g3, 0, %g3		C zero most significant 32 bits
296	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
297	add	%g3, %g4, %g4		C p += cy
298	sub	%g5, %g4, %g4		C p = rp[i] - p
299	stw	%g4, [%o0+16]
300	srlx	%g4, 32, %g3		C new cy
301
302	sub	%g0, %g3, %o0		C return value: negated high word of the last difference
303	retl
304	sub	%sp, -FSIZE, %sp	C (delay slot) restore %sp
305EPILOGUE(mpn_submul_1)
306