dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc32/v9/addmul_1.asm (revision 413d532bcc3f62d122e56d92e13ac64825a40baf)
dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl  the result to a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.

C		   cycles/limb
C UltraSPARC 1&2:     6.5
C UltraSPARC 3:	      ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midways.
C      Unrolling would allow deeper scheduling which could improve speed for L2
C      cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3

C Size of the stack area used for fp<->int transfers and scratch.
define(`FSIZE',224)

ASM_START()

C mp_limb_t mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v)
C Multiply {up,n} by v, add the product to {rp,n}, and return the carry limb.
C
C Register roles while running:
C   %o0 rp, %o1 up, %o2 n (counted down), %o3 v
C   %f6 = (double) low 16 bits of v,  %f8 = (double) high 16 bits of v
C   %g2/%g1 = p16/p0 halves of a limb product,  %g3 = carry,  %g5 = rp[i]
C   %o5 points into the scratch area; bit 4 is toggled to ping-pong between
C   two 16-byte temp areas so stores and loads can overlap.
C The fp pipes compute v0*u and v16*u as doubles; results are moved to the
C integer unit through the stack scratch area (std/ldx).

PROLOGUE(mpn_addmul_1)
	add	%sp, -FSIZE, %sp	C allocate stack scratch area
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C high 16 bits of v
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C low 16 bits of v
	stx	%g1, [%sp+104]		C move v pieces to the fp unit
	stx	%g2, [%sp+112]		C via memory
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

C Software-pipeline fill: peel 1..4 initial iterations, branching to the
C matching drain label (.L1-.L4) for short operands, or fall into .Loop.
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]

	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	lduw	[%o0], %g5		C read rp[i]
	b	.L1
	add	%o0, -16, %o0		C (delay slot) bias rp for .L1 offsets

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2		C (delay slot)

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	lduw	[%o0], %g5		C read rp[i]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0		C (delay slot) bias rp for .L2 offsets

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2		C (delay slot)

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	lduw	[%o0], %g5		C read rp[i]
	b	.L3
	add	%o0, -8, %o0		C (delay slot) bias rp for .L3 offsets

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2		C (delay slot)

	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	b	.L4
	add	%o0, -4, %o0		C (delay slot) bias rp for .L4 offsets

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2		C (delay slot)
	b,a	.L5

C BEGIN MAIN LOOP
C Each "-- k" group is one issue cycle: up to two integer ops, one memory op,
C and one fp op, hand-scheduled for the UltraSPARC pipelines.
	.align 16
C -- 0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C -- 1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C -- 2
	nop
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C -- 3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C -- 4
	nop
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C -- 5
	xor	%o5, 16, %o5		C alternate scratch variables
	add	%o1, 4, %o1		C up++
	stw	%g4, [%o0-4]
	fanop
C -- 6
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2		C (delay slot)
C END MAIN LOOP

C Software-pipeline drain: .L5 down to .L1 retire the limbs still in flight.
.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]

.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]

.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+16], %g5		C read rp[i]

.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	mov	%g3, %o0		C return carry limb
	retl
	sub	%sp, -FSIZE, %sp	C (delay slot) deallocate stack frame
EPILOGUE(mpn_addmul_1)
296