xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc32/v9/mul_1.asm (revision aef5eb5f59cdfe8314f1b5f78ac04eb144e44010)
1dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2dnl  the result in a second limb vector.
3
4dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C Algorithm: We use two floating-point multiplies per limb product, with the
35C invariant v operand split into two 16-bit pieces, and each 32-bit u limb
36C used whole.  We convert the two 48-bit products back to integers and
37C transfer them to the integer unit through memory.
38
39C		   cycles/limb
40C UltraSPARC 1&2:     6.5
41C UltraSPARC 3:	      ?
42
43C Possible optimizations:
44C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
45C      memory bandwidth limited, this could save 1.5 cycles/limb.
46C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
47C      it is very straightforward to unroll, using an exit branch midway.
48C      Unrolling would allow deeper scheduling which could improve speed for
49C      the L2 cache case.
50C   3. For mpn_mul_1: Use more alternating temp areas.  The std and ldx
51C      instructions are not scheduled far enough apart with just two temp areas.
52C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
53C      could save many operations.
54
55C INPUT PARAMETERS
56C rp	o0
57C up	o1
58C n	o2
59C v	o3
60
C Size in bytes of the temporary stack area that the routine below allocates
C on entry and releases on return.
61define(`FSIZE',224)
62
63ASM_START()
C mpn_mul_1 -- multiply the n-limb vector at up by the single limb v, store
C the n low result limbs at rp, and return the final carry limb in o0.
C
C Registers as used by the code below (the arguments are read directly from
C the o registers):
C   o0  rp, result pointer; biased in the short cases so that the wind-down
C       code can store at fixed offsets
C   o1  up, source pointer
C   o2  n, counted down by one for each limb that is started
C   o3  v
C   o5  pointer to the current scratch pair; toggled with xor 16
C   g1  p0  = integer product of a u limb and the low 16 bits of v
C   g2  p16 = integer product of a u limb and the high 16 bits of v
C   g3  cy, the carry between limbs; copied to o0 as the return value
C   g4  p = p0 + (p16 << 16) + cy; low 32 bits are stored, the rest becomes cy
C   f6  low 16 bits of v as a double
C   f8  high 16 bits of v as a double
C   f10,f11  64-bit integer image of the current u limb (f10 stays zero)
C   f2  current u limb as a double
C   f4,f16  double products with f6 and f8; f12,f14 the same as integers
C
C The code is software pipelined.  The entry sequence starts one more limb
C per stage and leaves early for n = 1..4, jumping into the matching point
C of the wind-down code at .L1 to .L4.  For n = 5 it enters the wind-down
C code at .L5, and for n >= 6 it runs the main loop first.
C NOTE(review): n = 0 is not handled by this code; presumably callers
C guarantee n >= 1 -- confirm.
64PROLOGUE(mpn_mul_1)
65	add	%sp, -FSIZE, %sp		C allocate the temporary area
66	sethi	%hi(0xffff), %g1
67	srl	%o3, 16, %g2			C g2 = high 16 bits of v
68	or	%g1, %lo(0xffff), %g1		C g1 = 0xffff
69	and	%o3, %g1, %g1			C g1 = low 16 bits of v
C Pass both halves of v to the floating-point registers through the stack
C and convert them from 64-bit integers to doubles.
70	stx	%g1, [%sp+104]
71	stx	%g2, [%sp+112]
72	ldd	[%sp+104], %f6
73	ldd	[%sp+112], %f8
74	fxtod	%f6, %f6
75	fxtod	%f8, %f8
C Load the first 32-bit word of the slot written above.  The value stored
C there is below 2^16, so its most significant word is zero.  With f10 zero,
C the register pair f10,f11 holds a u limb zero-extended to 64 bits.
76	ld	[%sp+104], %f10		C zero f10
77
78	mov	0, %g3			C cy = 0
79
C fanop writes f0, which is not read anywhere in this routine.
80define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
81
C o5 = 32-byte aligned block inside the temporary area.  It is used as two
C 16-byte pairs; in each pair the first 8 bytes hold p16 and the next 8 bytes
C hold p0, both as 64-bit integers.
82	add	%sp, 160, %o5		C point in scratch area
83	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
84
C Stage 1: fetch up[0] and start its conversion.  Here and in the following
C stages the fxtod sits in the delay slot of a non-annulled branch, so it is
C executed whether or not the branch is taken.
85	subcc	%o2, 1, %o2
86	ld	[%o1], %f11		C read up[i]
87	add	%o1, 4, %o1		C up++
88	bne,pt	%icc, .L_two_or_more
89	fxtod	%f10, %f2
90
C n = 1: finish the only limb without any overlap, then do the last
C wind-down step.
91	fmuld	%f2, %f8, %f16
92	fmuld	%f2, %f6, %f4
93	fdtox	%f16, %f14
94	fdtox	%f4, %f12
95	std	%f14, [%o5+16]
96	std	%f12, [%o5+24]
97	ldx	[%o5+16], %g2		C p16
98	ldx	[%o5+24], %g1		C p0
99	b	.L1
100	add	%o0, -16, %o0		C bias rp so that .L1 stores to rp[0]
101
102	.align	16
C Stage 2: fetch up[1] and start the multiplies for limb 0.
103.L_two_or_more:
104	subcc	%o2, 1, %o2
105	ld	[%o1], %f11		C read up[i]
106	fmuld	%f2, %f8, %f16
107	fmuld	%f2, %f6, %f4
108	add	%o1, 4, %o1		C up++
109	bne,pt	%icc, .L_three_or_more
110	fxtod	%f10, %f2
111
C n = 2: the limb 0 products go to the pair at +16 and the limb 1 products
C to the pair at +0.  The limb 0 integers are loaded before entering .L2.
112	fdtox	%f16, %f14
113	fdtox	%f4, %f12
114	std	%f14, [%o5+16]
115	fmuld	%f2, %f8, %f16
116	std	%f12, [%o5+24]
117	fmuld	%f2, %f6, %f4
118	fdtox	%f16, %f14
119	fdtox	%f4, %f12
120	std	%f14, [%o5+0]
121	std	%f12, [%o5+8]
122	ldx	[%o5+16], %g2		C p16
123	ldx	[%o5+24], %g1		C p0
124	b	.L2
125	add	%o0, -12, %o0		C bias rp so that .L2 stores to rp[0]
126
127	.align	16
C Stage 3: fetch up[2], store the limb 0 products at +16 and start the
C multiplies for limb 1.
128.L_three_or_more:
129	subcc	%o2, 1, %o2
130	ld	[%o1], %f11		C read up[i]
131	fdtox	%f16, %f14
132	fdtox	%f4, %f12
133	std	%f14, [%o5+16]
134	fmuld	%f2, %f8, %f16
135	std	%f12, [%o5+24]
136	fmuld	%f2, %f6, %f4
137	add	%o1, 4, %o1		C up++
138	bne,pt	%icc, .L_four_or_more
139	fxtod	%f10, %f2
140
C n = 3: the limb 1 products go to +0.  The limb 0 integers are loaded from
C +16 before that pair is overwritten with the limb 2 products.
141	fdtox	%f16, %f14
142	fdtox	%f4, %f12
143	std	%f14, [%o5+0]
144	fmuld	%f2, %f8, %f16
145	std	%f12, [%o5+8]
146	fmuld	%f2, %f6, %f4
147	fdtox	%f16, %f14
148	ldx	[%o5+16], %g2		C p16
149	fdtox	%f4, %f12
150	ldx	[%o5+24], %g1		C p0
151	std	%f14, [%o5+16]
152	std	%f12, [%o5+24]
153	b	.L3
154	add	%o0, -8, %o0		C bias rp so that .L3 stores to rp[0]
155
156	.align	16
C Stage 4: fetch up[3], store the limb 1 products at +0 and start the
C multiplies for limb 2.
157.L_four_or_more:
158	subcc	%o2, 1, %o2
159	ld	[%o1], %f11		C read up[i]
160	fdtox	%f16, %f14
161	fdtox	%f4, %f12
162	std	%f14, [%o5+0]
163	fmuld	%f2, %f8, %f16
164	std	%f12, [%o5+8]
165	fmuld	%f2, %f6, %f4
166	add	%o1, 4, %o1		C up++
167	bne,pt	%icc, .L_five_or_more
168	fxtod	%f10, %f2
169
C n = 4: load the limb 0 integers, store the limb 2 products in their place
C and start the multiplies for limb 3; .L4 converts and stores those.
C No further load from up follows the pointer increment on this path.
170	fdtox	%f16, %f14
171	ldx	[%o5+16], %g2		C p16
172	fdtox	%f4, %f12
173	ldx	[%o5+24], %g1		C p0
174	std	%f14, [%o5+16]
175	fmuld	%f2, %f8, %f16
176	std	%f12, [%o5+24]
177	fmuld	%f2, %f6, %f4
178	add	%o1, 4, %o1		C up++
179	b	.L4
180	add	%o0, -4, %o0		C bias rp so that .L4 stores to rp[0]
181
182	.align	16
C Stage 5: fetch up[4]; the pipeline is now full.  Enter the main loop when
C more limbs remain after this one, otherwise go straight to the wind-down
C code with rp unbiased.
183.L_five_or_more:
184	subcc	%o2, 1, %o2
185	ld	[%o1], %f11		C read up[i]
186	fdtox	%f16, %f14
187	ldx	[%o5+16], %g2		C p16
188	fdtox	%f4, %f12
189	ldx	[%o5+24], %g1		C p0
190	std	%f14, [%o5+16]
191	fmuld	%f2, %f8, %f16
192	std	%f12, [%o5+24]
193	fmuld	%f2, %f6, %f4
194	add	%o1, 4, %o1		C up++
195	bne,pt	%icc, .Loop
196	fxtod	%f10, %f2
197	b,a	.L5			C annulled: no delay slot instruction runs
198
C Each iteration completes one result limb from the integer pair in g1,g2,
C loads the next pair from the current scratch pair, stores the newly
C converted products into that same pair, starts the multiplies for the
C following limb, fetches one more u limb, and then switches to the other
C scratch pair.  The groups marked -- 0 to -- 5 hold four instructions each;
C presumably they are the intended per-cycle issue groups -- confirm.
199C BEGIN MAIN LOOP
200	.align 16
201C -- 0
202.Loop:	nop
203	subcc	%o2, 1, %o2
204	ld	[%o1], %f11		C read up[i]
205	fdtox	%f16, %f14
206C -- 1
207	sllx	%g2, 16, %g4		C (p16 << 16)
208	add	%o0, 4, %o0		C rp++
209	ldx	[%o5+0], %g2		C p16
210	fdtox	%f4, %f12
211C -- 2
212	nop
213	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
214	ldx	[%o5+8], %g1		C p0
215	fanop
216C -- 3
217	nop
218	add	%g3, %g4, %g4		C p += cy
219	std	%f14, [%o5+0]
220	fmuld	%f2, %f8, %f16
221C -- 4
222	srlx	%g4, 32, %g3		C new cy
223	add	%o1, 4, %o1		C up++
224	std	%f12, [%o5+8]
225	fmuld	%f2, %f6, %f4
226C -- 5
227	xor	%o5, 16, %o5		C alternate scratch variables
228	stw	%g4, [%o0-4]
229	bne,pt	%icc, .Loop
230	fxtod	%f10, %f2
231C END MAIN LOOP
232
C Wind-down.  On entry to each label g1,g2 hold the integer products of the
C next limb to be written and g3 holds the carry.  Each step writes one
C result limb, at a store offset that grows by 4 from step to step.
C .L5: five limbs remain.  Convert and store the products already started,
C and start the multiplies for the last limb.
233.L5:	fdtox	%f16, %f14
234	sllx	%g2, 16, %g4		C (p16 << 16)
235	ldx	[%o5+0], %g2		C p16
236	fdtox	%f4, %f12
237	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
238	ldx	[%o5+8], %g1		C p0
239	add	%g4, %g3, %g4		C p += cy
240	std	%f14, [%o5+0]
241	fmuld	%f2, %f8, %f16
242	std	%f12, [%o5+8]
243	fmuld	%f2, %f6, %f4
244	xor	%o5, 16, %o5
245	stw	%g4, [%o0+0]
246	srlx	%g4, 32, %g3		C new cy
247
C .L4: four limbs remain.  Convert and store the last pair of products; no
C further multiplies are started.
248.L4:	fdtox	%f16, %f14
249	sllx	%g2, 16, %g4		C (p16 << 16)
250	ldx	[%o5+0], %g2		C p16
251	fdtox	%f4, %f12
252	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
253	ldx	[%o5+8], %g1		C p0
254	add	%g3, %g4, %g4		C p += cy
255	std	%f14, [%o5+0]
256	std	%f12, [%o5+8]
257	xor	%o5, 16, %o5
258	stw	%g4, [%o0+4]
259	srlx	%g4, 32, %g3		C new cy
260
C .L3: three limbs remain; only integer work is left.
261.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
262	ldx	[%o5+0], %g2		C p16
263	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
264	ldx	[%o5+8], %g1		C p0
265	add	%g3, %g4, %g4		C p += cy
266	xor	%o5, 16, %o5
267	stw	%g4, [%o0+8]
268	srlx	%g4, 32, %g3		C new cy
269
C .L2: two limbs remain.  The last pair of products is loaded here, so the
C scratch pointer is not toggled any more.
270.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
271	ldx	[%o5+0], %g2		C p16
272	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
273	ldx	[%o5+8], %g1		C p0
274	add	%g3, %g4, %g4		C p += cy
275	stw	%g4, [%o0+12]
276	srlx	%g4, 32, %g3		C new cy
277
C .L1: last limb.  The carry left in g3 afterwards is the return value.
278.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
279	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
280	add	%g3, %g4, %g4		C p += cy
281	stw	%g4, [%o0+16]
282	srlx	%g4, 32, %g3		C new cy
283
284	mov	%g3, %o0		C return the carry limb
285	retl
286	sub	%sp, -FSIZE, %sp	C delay slot: release the temporary area
287EPILOGUE(mpn_mul_1)
288