xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc32/v9/submul_1.asm (revision 87d689fb734c654d2486f87f7be32f1b53ecdbec)
dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
dnl  subtract the result from a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.

C		   cycles/limb
C UltraSPARC 1&2:     6.5
C UltraSPARC 3:	      ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midways.
C      Unrolling would allow deeper scheduling which could improve speed for L2
C      cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.

C INPUT PARAMETERS
C rp	o0
C up	o1
C n	o2
C v	o3
C (No register window is allocated; the arguments arrive in %o0..%o3 and are
C used there directly.)

C Subtraction scheme: the carry in %g3 is kept NEGATED.  Each iteration
C negates it back (sub %g0,%g3,%g3), masks it to 32 bits, adds it into the
C limb product p, and then computes rp[i] - p, whose high 32 bits become the
C new (again negated) carry.  The final carry is negated once more before
C being returned in %o0.

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_submul_1)
	add	%sp, -FSIZE, %sp
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C g2 = v >> 16
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C g1 = v & 0xffff
	stx	%g1, [%sp+104]		C bounce both 16-bit pieces of v
	stx	%g2, [%sp+112]		C through memory into the FPU
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
	ld	[%sp+104], %f10		C zero f10 (high word of stored g1 is 0)

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

C Wind up the software pipeline: one entry point per remaining depth, each
C peeling one more stage before falling into the shared tail code.
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2		C f2 = (double) up[i] (delay slot)

C n == 1
	fmuld	%f2, %f8, %f16		C u * vhi
	fmuld	%f2, %f6, %f4		C u * vlo
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	lduw	[%o0], %g5		C read rp[i]
	b	.L1
	add	%o0, -16, %o0		C bias rp so tail offsets line up

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

C n == 2
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	lduw	[%o0], %g5		C read rp[i]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0		C bias rp so tail offsets line up

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

C n == 3
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	lduw	[%o0], %g5		C read rp[i]
	b	.L3
	add	%o0, -8, %o0		C bias rp so tail offsets line up

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

C n == 4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	b	.L4
	add	%o0, -4, %o0		C bias rp so tail offsets line up

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5

C BEGIN MAIN LOOP
C Scheduled four instructions per cycle group (comments -- 0 .. -- 6); the
C two 16-byte halves of the scratch area alternate via xor %o5,16.
	.align 16
C -- 0
.Loop:	sub	%g0, %g3, %g3		C un-negate carry
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C -- 1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C -- 2
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C -- 3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C -- 4
	nop
	sub	%g5, %g4, %g4		C p = rp[i] - p
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C -- 5
	xor	%o5, 16, %o5		C alternate scratch variables
	add	%o1, 4, %o1		C up++
	stw	%g4, [%o0-4]
	fanop
C -- 6
	srlx	%g4, 32, %g3		C new cy (negated borrow)
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP

C Wind down the pipeline: drain the four in-flight limb products.
.L5:	sub	%g0, %g3, %g3		C un-negate carry
	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	sub	%g5, %g4, %g4		C p = rp[i] - p
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]

	sub	%g0, %g3, %g3		C un-negate carry
.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	sub	%g5, %g4, %g4		C p = rp[i] - p
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]

	sub	%g0, %g3, %g3		C un-negate carry
.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C p = rp[i] - p
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]

	sub	%g0, %g3, %g3		C un-negate carry
.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C p = rp[i] - p
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+16], %g5		C read rp[i]

	sub	%g0, %g3, %g3		C un-negate carry
.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C p = rp[i] - p
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	sub	%g0, %g3, %o0		C return borrow (negate carry once more)
	retl
	sub	%sp, -FSIZE, %sp	C restore stack pointer (delay slot)
EPILOGUE(mpn_submul_1)
317