xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc32/v9/addmul_1.asm (revision 796c32c94f6e154afc9de0f63da35c91bb739b45)
dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl  the result to a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.

C		   cycles/limb
C UltraSPARC 1&2:     6.5
C UltraSPARC 3:	      ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midway.
C      Unrolling would allow deeper scheduling which could improve speed for L2
C      cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_addmul_1)
C mp_limb_t mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v)
C Computes rp[i] += up[i] * v for i in [0, n) and returns the carry limb.
C The two 16-bit halves of v are converted to doubles once; each 48-bit
C partial product is formed with fmuld, converted back with fdtox, and moved
C to the integer unit through a 32-byte aligned stack scratch area whose two
C 16-byte halves are used alternately (the xor %o5, 16 below) so that the
C std/ldx pairs of consecutive iterations do not collide.
	add	%sp, -FSIZE, %sp	C allocate temporary frame
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C g2 = v >> 16
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C g1 = v & 0xffff
	stx	%g1, [%sp+104]		C move both v halves to the FPU
	stx	%g2, [%sp+112]		C ... through memory
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2		C f2 = (double) up[i]  (delay slot)

C n == 1: compute the single pair of products and finish at .L1.
	fmuld	%f2, %f8, %f16		C u * high(v)
	fmuld	%f2, %f6, %f4		C u * low(v)
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	lduw	[%o0], %g5		C read rp[i]
	b	.L1
	add	%o0, -16, %o0		C bias rp for the .L1 store offset

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16		C u * high(v)
	fmuld	%f2, %f6, %f4		C u * low(v)
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2		C f2 = (double) up[i]  (delay slot)

C n == 2: drain the two in-flight product pairs via .L2 then .L1.
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	lduw	[%o0], %g5		C read rp[i]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0		C bias rp for the .L2 store offset

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2		C f2 = (double) up[i]  (delay slot)

C n == 3: fall through the tail chain starting at .L3.
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	lduw	[%o0], %g5		C read rp[i]
	b	.L3
	add	%o0, -8, %o0		C bias rp for the .L3 store offset

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2		C f2 = (double) up[i]  (delay slot)

C n == 4: tail chain starting at .L4.
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	b	.L4
	add	%o0, -4, %o0		C bias rp for the .L4 store offset

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2		C f2 = (double) up[i]  (delay slot)
	b,a	.L5

C BEGIN MAIN LOOP
C Steady state, one limb per 7-cycle group; the C -- n markers label the
C intended issue cycle.  Integer pipe recombines p0 + (p16 << 16) + cy +
C rp[i] while the FPU works on the next product pair.
	.align 16
C -- 0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C -- 1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C -- 2
	nop
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C -- 3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C -- 4
	nop
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C -- 5
	xor	%o5, 16, %o5		C alternate scratch variables
	add	%o1, 4, %o1		C up++
	stw	%g4, [%o0-4]
	fanop
C -- 6
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2		C f2 = (double) up[i]  (delay slot)
C END MAIN LOOP

C Tail: .L5 through .L1 retire the up-to-five product pairs still in flight.
.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]

.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]

.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+16], %g5		C read rp[i]

.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	mov	%g3, %o0		C return the carry limb
	retl
	sub	%sp, -FSIZE, %sp	C deallocate frame (delay slot)
EPILOGUE(mpn_addmul_1)

307