xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc32/v9/sqr_diagonal.asm (revision 75f6d617e282811cb173c2ccfbf5df0dd71f7045)
1dnl  SPARC v9 32-bit mpn_sqr_diagonal.
2
3dnl  Copyright 2001, 2003 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20
21include(`../config.m4')
22
23C INPUT PARAMETERS
24C rp	i0
25C up	i1
26C n	i2
27
28C This code uses a very deep software pipeline, due to the need for moving data
29C back and forth between the integer registers and floating-point registers.
30C
31C A VIS variant of this code would make the pipeline less deep, since the
32C masking now done in the integer unit could take place in the floating-point
33C unit using the FAND instruction.  It would be possible to save several cycles
34C too.
35C
36C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
37C not much slower from the Ecache.  It would perhaps be possible to shave off
38C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
39C instructions used, since we have 10 memory operations per limb.  But a VIS
40C variant could run three cycles faster than the corresponding non-VIS code.
41
42C This is non-pipelined code showing the algorithm:
43C
44C .Loop:
45C	lduw	[up+0],%g4		C 00000000hhhhllll
46C	sllx	%g4,16,%g3		C 0000hhhhllll0000
47C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
48C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
49C	stx	%g2,[%fp+80]
50C	ldd	[%fp+80],%f0
51C	fitod	%f0,%f4			C hi16
52C	fitod	%f1,%f6			C lo16
53C	ld	[up+0],%f9
54C	fxtod	%f8,%f2
55C	fmuld	%f2,%f4,%f4
56C	fmuld	%f2,%f6,%f6
57C	fdtox	%f4,%f4
58C	fdtox	%f6,%f6
59C	std	%f4,[%fp-24]
60C	std	%f6,[%fp-16]
61C	ldx	[%fp-24],%g2
62C	ldx	[%fp-16],%g1
63C	sllx	%g2,16,%g2
64C	add	%g2,%g1,%g1
65C	stw	%g1,[rp+0]
66C	srlx	%g1,32,%l0
67C	stw	%l0,[rp+4]
68C	add	up,4,up
69C	subcc	n,1,n
70C	bne,pt	%icc,.Loop
71C	add	rp,8,rp
72
C fanop expands to fitod %f12,%f10.  Neither %f12 nor %f10 is referenced by
C any other instruction in this file, so the converted value is never used.
73define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe
74
75ASM_START()
76
77	TEXT
78	ALIGN(4)
C One zero word.  mpn_sqr_diagonal loads it into %f8 once, so that the
C register pair %f8:%f9 read by fxtod has a zero upper half.
79.Lnoll:
80	.word	0
81
C mpn_sqr_diagonal
C
C Per the INPUT PARAMETERS note in this file: rp in %i0, up in %i1, n in %i2.
C For every 32-bit word x read from up, a 64-bit value is formed and written
C to rp as two 32-bit words: bits 0-31 at the lower address, bits 32-63 four
C bytes above it.  The value is p0 + (p16 << 16), where p0 = x * (x & 0xffff)
C and p16 = x * (x >> 16), i.e. x squared.
C
C The multiplications run in the FPU: x is converted with fxtod from the
C register pair %f8:%f9 (%f8 holds the zero word loaded from .Lnoll, %f9 the
C word itself), the two 16-bit halves with fitod, and the products are
C converted back with fdtox.  All transfers between integer and FP registers
C go through stack slots:
C   [%fp+72], [%fp+80]      split operand 0000hhhh0000llll, used alternately
C   [%fp-24] / [%fp-16]     one p16 / p0 pair
C   [%fp-40] / [%fp-32]     the other p16 / p0 pair, used alternately
C
C Register roles as used below:
C   %g5      0xffff0000, the mask for andn
C   %g4      word just read with lduw; %g3 is that word shifted left by 16
C   %g2      split operand on its way to the stack, or p16 after ldx
C   %g1      p0, then the 64-bit sum (also the address temporary in the
C            non-PIC load of .Lnoll); %l0 holds the upper 32 bits of the sum
C   %l3-%l6  stack slot addresses, read only by the wind-down code
C
C In the wind-down code, label .Lk (k = 1..4) is the entry point at which
C k results remain to be stored.
C
C NOTE(review): n is decremented and compared with zero only after the first
C word has been read, so n = 0 is not handled here; presumably callers
C guarantee n >= 1 -- confirm.
82PROLOGUE(mpn_sqr_diagonal)
83	save	%sp,-256,%sp
84
C Load the zero word at .Lnoll into %f8.  With PIC defined the address is
C formed relative to the %pc value read at .Lpc, otherwise with sethi and %lo.
85ifdef(`PIC',
86`.Lpc:	rd	%pc,%o7
87	ld	[%o7+.Lnoll-.Lpc],%f8',
88`	sethi	%hi(.Lnoll),%g1
89	ld	[%g1+%lo(.Lnoll)],%f8')
90
C %g5 = 0xffff0000.  up is biased by -8 so that the lduw at [%i1+8] below
C reads the first word.
91	sethi	%hi(0xffff0000),%g5
92	add	%i1,-8,%i1
93
C Read and split the first word, count it, and branch if more words follow.
94	lduw	[%i1+8],%g4
95	add	%i1,4,%i1		C s1_ptr++
96	sllx	%g4,16,%g3		C 0000hhhhllll0000
97	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
98	subcc	%i2,1,%i2
99	bne,pt	%icc,.L_grt_1
100	andn	%g2,%g5,%g2		C 0000hhhh0000llll
101
C n was 1.  Store the split operand to [%fp+80], reload its halves into
C %f0/%f1, load the word into %f9, multiply, and leave p16/p0 in
C [%fp-24]/[%fp-16] for .L1 to combine and store.  Of the pointers set here
C only %l4 is read from .L1 onward.
102	add	%i1,4,%i1		C s1_ptr++
103	stx	%g2,[%fp+80]
104	ld	[%i1],%f9
105	ldd	[%fp+80],%f0
106	fxtod	%f8,%f2
107	fitod	%f0,%f4
108	fitod	%f1,%f6
109	fmuld	%f2,%f4,%f4
110	fmuld	%f2,%f6,%f6
111	fdtox	%f4,%f4
112	fdtox	%f6,%f6
113	std	%f4,[%fp-24]
114	std	%f6,[%fp-16]
115
116	add	%fp, 80, %l3
117	add	%fp, -24, %l4
118	add	%fp, 72, %l5
119	b	.L1
120	add	%fp, -40, %l6
121
C At least two words.  Park the first split operand in [%fp+80] and split
C the second word.
122.L_grt_1:
123	stx	%g2,[%fp+80]
124	lduw	[%i1+8],%g4
125	add	%i1,4,%i1		C s1_ptr++
126	sllx	%g4,16,%g3		C 0000hhhhllll0000
127	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
128	subcc	%i2,1,%i2
129	bne,pt	%icc,.L_grt_2
130	andn	%g2,%g5,%g2		C 0000hhhh0000llll
131
C Exactly two words.  The second split operand goes to [%fp+72].  Both words
C are multiplied here: the first product pair is stored to
C [%fp-24]/[%fp-16]; the second is still in %f4/%f6, with the fdtox of %f6
C outstanding, when control reaches .L2, which stores it through
C %l4 = %fp-40.
132	stx	%g2,[%fp+72]
133	ld	[%i1],%f9
134	add	%i1,4,%i1		C s1_ptr++
135	ldd	[%fp+80],%f0
136	fxtod	%f8,%f2
137	fitod	%f0,%f4
138	fitod	%f1,%f6
139	fmuld	%f2,%f4,%f4
140	ld	[%i1],%f9
141	fmuld	%f2,%f6,%f6
142	ldd	[%fp+72],%f0
143	fdtox	%f4,%f4
144	fdtox	%f6,%f6
145	std	%f4,[%fp-24]
146	fxtod	%f8,%f2
147	std	%f6,[%fp-16]
148	fitod	%f0,%f4
149	fitod	%f1,%f6
150	fmuld	%f2,%f4,%f4
151	fmuld	%f2,%f6,%f6
152	fdtox	%f4,%f4
153
154	add	%fp, 72, %l3
155	add	%fp, -40, %l4
156	add	%fp, 80, %l5
157	b	.L2
158	add	%fp, -24, %l6
159
C At least three words.  Park the second split operand in [%fp+72], split
C the third word, and start on the first word: %f9 receives the word,
C %f0/%f1 its halves from [%fp+80], %f2 its value as a double.
160.L_grt_2:
161	stx	%g2,[%fp+72]
162	lduw	[%i1+8],%g4
163	ld	[%i1],%f9
164	add	%i1,4,%i1		C s1_ptr++
165	ldd	[%fp+80],%f0
166	sllx	%g4,16,%g3		C 0000hhhhllll0000
167	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
168	subcc	%i2,1,%i2
169	fxtod	%f8,%f2
170	bne,pt	%icc,.L_grt_3
171	andn	%g2,%g5,%g2		C 0000hhhh0000llll
172
C Exactly three words.  The third split operand goes to [%fp+80], whose
C previous contents are already in %f0/%f1.  The first product pair is
C stored to [%fp-24]/[%fp-16]; the second product is in flight in %f4/%f6,
C and the third word and its halves are already in %f9 and %f0/%f1, when
C control reaches .L3.
173	stx	%g2,[%fp+80]
174	fitod	%f0,%f4
175	fitod	%f1,%f6
176	fmuld	%f2,%f4,%f4
177	ld	[%i1],%f9
178	fmuld	%f2,%f6,%f6
179	add	%i1,4,%i1		C s1_ptr++
180	ldd	[%fp+72],%f0
181	fdtox	%f4,%f4
182	fdtox	%f6,%f6
183	std	%f4,[%fp-24]
184	fxtod	%f8,%f2
185	std	%f6,[%fp-16]
186	fitod	%f0,%f4
187	fitod	%f1,%f6
188	fmuld	%f2,%f4,%f4
189	ld	[%i1],%f9
190	add	%fp, 80, %l3
191	fmuld	%f2,%f6,%f6
192	add	%fp, -24, %l4
193	ldd	[%fp+80],%f0
194	add	%fp, 72, %l5
195	fdtox	%f4,%f4
196	b	.L3
197	add	%fp, -40, %l6
198
C At least four words.  The same steps, interleaved: park the third split
C operand in [%fp+80], split the fourth word, multiply the first word and
C store its product pair to [%fp-24]/[%fp-16], and load the second word and
C its halves.
199.L_grt_3:
200	stx	%g2,[%fp+80]
201	fitod	%f0,%f4
202	lduw	[%i1+8],%g4
203	fitod	%f1,%f6
204	fmuld	%f2,%f4,%f4
205	ld	[%i1],%f9
206	fmuld	%f2,%f6,%f6
207	add	%i1,4,%i1		C s1_ptr++
208	ldd	[%fp+72],%f0
209	fdtox	%f4,%f4
210	sllx	%g4,16,%g3		C 0000hhhhllll0000
211	fdtox	%f6,%f6
212	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
213	subcc	%i2,1,%i2
214	std	%f4,[%fp-24]
215	fxtod	%f8,%f2
216	std	%f6,[%fp-16]
217	bne,pt	%icc,.L_grt_4
218	andn	%g2,%g5,%g2		C 0000hhhh0000llll
219
C Exactly four words.  The fourth split operand goes to [%fp+72].  The
C second word is multiplied and the third word and its halves are loaded;
C .L4 continues from here and stores the second product pair through
C %l4 = %fp-40.
220	stx	%g2,[%fp+72]
221	fitod	%f0,%f4
222	fitod	%f1,%f6
223	add	%fp, 72, %l3
224	fmuld	%f2,%f4,%f4
225	add	%fp, -40, %l4
226	ld	[%i1],%f9
227	fmuld	%f2,%f6,%f6
228	add	%i1,4,%i1		C s1_ptr++
229	ldd	[%fp+80],%f0
230	add	%fp, 80, %l5
231	fdtox	%f4,%f4
232	b	.L4
233	add	%fp, -24, %l6
234
C At least five words.  Park the fourth split operand in [%fp+72], split the
C fifth word, multiply the second word and store its product pair to
C [%fp-40]/[%fp-32], and load the third word and its halves.  If the count
C has reached zero go to .L5, otherwise enter the loop.
235.L_grt_4:
236	stx	%g2,[%fp+72]
237	fitod	%f0,%f4
238	lduw	[%i1+8],%g4
239	fitod	%f1,%f6
240	fmuld	%f2,%f4,%f4
241	ld	[%i1],%f9
242	fmuld	%f2,%f6,%f6
243	add	%i1,4,%i1		C s1_ptr++
244	ldd	[%fp+80],%f0
245	fdtox	%f4,%f4
246	sllx	%g4,16,%g3		C 0000hhhhllll0000
247	fdtox	%f6,%f6
248	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
249	subcc	%i2,1,%i2
250	std	%f4,[%fp-40]
251	fxtod	%f8,%f2
252	std	%f6,[%fp-32]
253	be,pn	%icc,.L5
254	andn	%g2,%g5,%g2		C 0000hhhh0000llll
255
C Annulled unconditional branch: jumps over whatever padding the .align
C below may insert.
256	b,a	.Loop
257
C Main loop.  One pass from .Loop to LOOP END handles two words; the two
C halves are identical except that they swap the stack slots:
C   first half:  split operand stored to [%fp+80], halves loaded from
C                [%fp+72], products read from and written to
C                [%fp-24]/[%fp-16]
C   second half: split operand stored to [%fp+72], halves loaded from
C                [%fp+80], products read from and written to
C                [%fp-40]/[%fp-32]
C Each half stores one 64-bit result to rp and decrements n once.  The loop
C is left from the middle (to .Lend) or by falling out of the bottom (into
C .L5) when n reaches zero.
C NOTE(review): the C --- separators mark groups of four instructions,
C presumably the intended issue groups, with nop and fanop filling slots that
C would otherwise be empty -- confirm against the target pipeline.
258	.align	16
259C --- LOOP BEGIN
260.Loop:	nop
261	nop
262	stx	%g2,[%fp+80]
263	fitod	%f0,%f4
264C ---
265	nop
266	nop
267	lduw	[%i1+8],%g4
268	fitod	%f1,%f6
269C ---
270	nop
271	nop
272	ldx	[%fp-24],%g2		C p16
273	fanop
274C ---
275	nop
276	nop
277	ldx	[%fp-16],%g1		C p0
278	fmuld	%f2,%f4,%f4
279C ---
280	sllx	%g2,16,%g2		C align p16
281	add	%i0,8,%i0		C res_ptr++
282	ld	[%i1],%f9
283	fmuld	%f2,%f6,%f6
284C ---
285	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
286	add	%i1,4,%i1		C s1_ptr++
287	ldd	[%fp+72],%f0
288	fanop
289C ---
290	srlx	%g1,32,%l0
291	nop
292	stw	%g1,[%i0-8]
293	fdtox	%f4,%f4
294C ---
295	sllx	%g4,16,%g3		C 0000hhhhllll0000
296	nop
297	stw	%l0,[%i0-4]
298	fdtox	%f6,%f6
299C ---
300	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
301	subcc	%i2,1,%i2
302	std	%f4,[%fp-24]
303	fxtod	%f8,%f2
304C ---
305	std	%f6,[%fp-16]
306	andn	%g2,%g5,%g2		C 0000hhhh0000llll
307	be,pn	%icc,.Lend
308	fanop
C Second half: same schedule with the stack slots swapped.
309C ---  LOOP MIDDLE
310	nop
311	nop
312	stx	%g2,[%fp+72]
313	fitod	%f0,%f4
314C ---
315	nop
316	nop
317	lduw	[%i1+8],%g4
318	fitod	%f1,%f6
319C ---
320	nop
321	nop
322	ldx	[%fp-40],%g2		C p16
323	fanop
324C ---
325	nop
326	nop
327	ldx	[%fp-32],%g1		C p0
328	fmuld	%f2,%f4,%f4
329C ---
330	sllx	%g2,16,%g2		C align p16
331	add	%i0,8,%i0		C res_ptr++
332	ld	[%i1],%f9
333	fmuld	%f2,%f6,%f6
334C ---
335	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
336	add	%i1,4,%i1		C s1_ptr++
337	ldd	[%fp+80],%f0
338	fanop
339C ---
340	srlx	%g1,32,%l0
341	nop
342	stw	%g1,[%i0-8]
343	fdtox	%f4,%f4
344C ---
345	sllx	%g4,16,%g3		C 0000hhhhllll0000
346	nop
347	stw	%l0,[%i0-4]
348	fdtox	%f6,%f6
349C ---
350	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
351	subcc	%i2,1,%i2
352	std	%f4,[%fp-40]
353	fxtod	%f8,%f2
354C ---
355	std	%f6,[%fp-32]
356	andn	%g2,%g5,%g2		C 0000hhhh0000llll
357	bne,pt	%icc,.Loop
358	fanop
359C --- LOOP END
360
C Wind-down.  .L5 is reached when the split operand still in %g2 belongs in
C [%fp+80] (from .L_grt_4 or from the bottom of the loop); .Lend when it
C belongs in [%fp+72] (from the middle of the loop).  Both set %l3-%l6 so
C that the shared code from .Ltail on uses the right slots:
C   %l3  slot receiving the split operand still in %g2
C   %l4  product pair read first
C   %l5  split operand whose halves are loaded first
C   %l6  the other product pair
361.L5:	add	%fp, 80, %l3
362	add	%fp, -24, %l4
363	add	%fp, 72, %l5
364	b	.Ltail
365	add	%fp, -40, %l6
366
367.Lend:	add	%fp, 72, %l3
368	add	%fp, -40, %l4
369	add	%fp, 80, %l5
370	add	%fp, -24, %l6
C .Ltail: the same steps as one half of the loop, except that no further
C word is read with lduw and split.  Five results remain to be stored.
371.Ltail:	stx	%g2,[%l3]
372	fitod	%f0,%f4
373	fitod	%f1,%f6
374	ldx	[%l4],%g2		C p16
375	ldx	[%l4+8],%g1		C p0
376	fmuld	%f2,%f4,%f4
377	sllx	%g2,16,%g2		C align p16
378	add	%i0,8,%i0		C res_ptr++
379	ld	[%i1],%f9
380	fmuld	%f2,%f6,%f6
381	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
382	add	%i1,4,%i1		C s1_ptr++
383	ldd	[%l5],%f0
384	srlx	%g1,32,%l0
385	stw	%g1,[%i0-8]
386	fdtox	%f4,%f4
387	stw	%l0,[%i0-4]
C .L4: four results remain to be stored.  The last word and its halves are
C loaded in this step (ld into %f9, ldd from the slot at %l3).
388.L4:	fdtox	%f6,%f6
389	std	%f4,[%l4]
390	fxtod	%f8,%f2
391	std	%f6,[%l4+8]
392
393	fitod	%f0,%f4
394	fitod	%f1,%f6
395	ldx	[%l6],%g2		C p16
396	ldx	[%l6+8],%g1		C p0
397	fmuld	%f2,%f4,%f4
398	sllx	%g2,16,%g2		C align p16
399	add	%i0,8,%i0		C res_ptr++
400	ld	[%i1],%f9
401	fmuld	%f2,%f6,%f6
402	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
403	ldd	[%l3],%f0
404	srlx	%g1,32,%l0
405	stw	%g1,[%i0-8]
406	fdtox	%f4,%f4
407	stw	%l0,[%i0-4]
C .L3: three results remain to be stored.  No further words are loaded; the
C last multiplication is issued in this step.
408.L3:	fdtox	%f6,%f6
409	std	%f4,[%l6]
410	fxtod	%f8,%f2
411	std	%f6,[%l6+8]
412
413	fitod	%f0,%f4
414	fitod	%f1,%f6
415	ldx	[%l4],%g2		C p16
416	ldx	[%l4+8],%g1		C p0
417	fmuld	%f2,%f4,%f4
418	sllx	%g2,16,%g2		C align p16
419	add	%i0,8,%i0		C res_ptr++
420	fmuld	%f2,%f6,%f6
421	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
422	srlx	%g1,32,%l0
423	stw	%g1,[%i0-8]
424	fdtox	%f4,%f4
425	stw	%l0,[%i0-4]
C .L2: two results remain to be stored.  The last product pair is written
C through %l4, then the pair at %l6 is combined and stored.
426.L2:	fdtox	%f6,%f6
427	std	%f4,[%l4]
428	std	%f6,[%l4+8]
429
430	ldx	[%l6],%g2		C p16
431	ldx	[%l6+8],%g1		C p0
432	sllx	%g2,16,%g2		C align p16
433	add	%i0,8,%i0		C res_ptr++
434	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
435	srlx	%g1,32,%l0
436	stw	%g1,[%i0-8]
437	stw	%l0,[%i0-4]
438
C .L1: one result remains; combine the product pair at %l4 and store it.
439.L1:	ldx	[%l4],%g2		C p16
440	ldx	[%l4+8],%g1		C p0
441	sllx	%g2,16,%g2		C align p16
442	add	%i0,8,%i0		C res_ptr++
443	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
444	srlx	%g1,32,%l0
445	stw	%g1,[%i0-8]
446	stw	%l0,[%i0-4]
447
C Return.  The restore in the delay slot writes %g0 + %g0 = 0 to %o0 of the
C restored register window.
448	ret
449	restore	%g0,%g0,%o0
450
451EPILOGUE(mpn_sqr_diagonal)
452