dnl  SPARC v9 32-bit mpn_sqr_diagonal.

dnl  Copyright 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2

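C The C-level interface is presumably the usual GMP internal prototype for
C this routine (stated here as an assumption; it is not declared in this
C file):
C
C	void mpn_sqr_diagonal (mp_ptr rp, mp_srcptr up, mp_size_t n);
C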
C This code uses a very deep software pipeline, due to the need for moving data
C back and forth between the integer registers and floating-point registers.
C
C A VIS variant of this code would make the pipeline less deep, since the
C masking now done in the integer unit could take place in the floating-point
C unit using the FAND instruction.  It would also be possible to save several
C cycles.
C
C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
C not much slower from the Ecache.  It would perhaps be possible to shave off
C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
C instructions used, since we have 10 memory operations per limb.  But a VIS
C variant could run three cycles faster than the corresponding non-VIS code.
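C
C For reference, the operation computed is simply rp[2i], rp[2i+1] = up[i]^2
C for i = 0 .. n-1, i.e. the double-limb square of each source limb.  The C
C sketch below is an illustration added here, not part of GMP; the name
C ref_sqr_diagonal and the <stdint.h> types are assumptions for a 32-bit
C limb build:
C
C	#include <stdint.h>
C
C	/* Reference semantics only; the real routine below does the same
C	   work with floating-point multiplies.  */
C	static void
C	ref_sqr_diagonal (uint32_t *rp, const uint32_t *up, long n)
C	{
C	  long i;
C	  for (i = 0; i < n; i++)
C	    {
C	      uint64_t p = (uint64_t) up[i] * up[i];	/* 64-bit square */
C	      rp[2 * i] = (uint32_t) p;			/* low half	*/
C	      rp[2 * i + 1] = (uint32_t) (p >> 32);	/* high half	*/
C	    }
C	}
C
C The 10 memory operations per limb counted above are the lduw and ld of the
C source limb, the stx/ldd round trip into the FPU, the two std/ldx round
C trips back to the integer unit, and the two stw of the result words.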

C This is non-pipelined code showing the algorithm:
C
C .Loop:
C	lduw	[up+0],%g4		C 00000000hhhhllll
C	sllx	%g4,16,%g3		C 0000hhhhllll0000
C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
C	stx	%g2,[%fp+80]
C	ldd	[%fp+80],%f0
C	fitod	%f0,%f4			C hi16
C	fitod	%f1,%f6			C lo16
C	ld	[up+0],%f9
C	fxtod	%f8,%f2
C	fmuld	%f2,%f4,%f4
C	fmuld	%f2,%f6,%f6
C	fdtox	%f4,%f4
C	fdtox	%f6,%f6
C	std	%f4,[%fp-24]
C	std	%f6,[%fp-16]
C	ldx	[%fp-24],%g2
C	ldx	[%fp-16],%g1
C	sllx	%g2,16,%g2
C	add	%g2,%g1,%g1
C	stw	%g1,[rp+0]
C	srlx	%g1,32,%l0
C	stw	%l0,[rp+4]
C	add	up,4,up
C	subcc	n,1,n
C	bne,pt	%icc,.Loop
C	add	rp,8,rp

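C The point of the 16-bit masking above is that the limb u is split as
C u = h*2^16 + l with h,l < 2^16, so that u^2 = (u*h)*2^16 + u*l, where both
C u*h and u*l fit in 48 bits and are therefore computed exactly by fmuld.
C The C sketch below is an illustrative addition (the name one_limb_square
C is hypothetical), showing the same trick in scalar code:
C
C	#include <stdint.h>
C
C	static uint64_t
C	one_limb_square (uint32_t u)
C	{
C	  double d    = (double) u;			/* fxtod	     */
C	  double hi16 = (double) (int32_t) (u >> 16);	/* fitod %f0	     */
C	  double lo16 = (double) (int32_t) (u & 0xffff);/* fitod %f1	     */
C	  uint64_t p16 = (uint64_t) (d * hi16);		/* exact, < 2^48     */
C	  uint64_t p0  = (uint64_t) (d * lo16);		/* exact, < 2^48     */
C	  return (p16 << 16) + p0;			/* align p16, add p0 */
C	}
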
define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe

ASM_START()

	TEXT
	ALIGN(4)
.Lnoll:
	.word	0

PROLOGUE(mpn_sqr_diagonal)
	save	%sp,-256,%sp

ifdef(`PIC',
`.Lpc:	rd	%pc,%o7
	ld	[%o7+.Lnoll-.Lpc],%f8',
`	sethi	%hi(.Lnoll),%g1
	ld	[%g1+%lo(.Lnoll)],%f8')

	sethi	%hi(0xffff0000),%g5
	add	%i1,-8,%i1

	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_1
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	add	%i1,4,%i1		C s1_ptr++
	stx	%g2,[%fp+80]
	ld	[%i1],%f9
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	std	%f6,[%fp-16]

	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.L1
	add	%fp, -40, %l6

.L_grt_1:
	stx	%g2,[%fp+80]
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_2
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+72]
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4

	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	b	.L2
	add	%fp, -24, %l6

.L_grt_2:
	stx	%g2,[%fp+72]
	lduw	[%i1+8],%g4
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	fxtod	%f8,%f2
	bne,pt	%icc,.L_grt_3
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	add	%fp, 80, %l3
	fmuld	%f2,%f6,%f6
	add	%fp, -24, %l4
	ldd	[%fp+80],%f0
	add	%fp, 72, %l5
	fdtox	%f4,%f4
	b	.L3
	add	%fp, -40, %l6

.L_grt_3:
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	bne,pt	%icc,.L_grt_4
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	fitod	%f1,%f6
	add	%fp, 72, %l3
	fmuld	%f2,%f4,%f4
	add	%fp, -40, %l4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	add	%fp, 80, %l5
	fdtox	%f4,%f4
	b	.L4
	add	%fp, -24, %l6

.L_grt_4:
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
	std	%f6,[%fp-32]
	be,pn	%icc,.L5
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	b,a	.Loop

	.align	16
C --- LOOP BEGIN
.Loop:	nop
	nop
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-24],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-16],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-16]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	be,pn	%icc,.Lend
	fanop
C ---  LOOP MIDDLE
	nop
	nop
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-40],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-32],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-32]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	bne,pt	%icc,.Loop
	fanop
C --- LOOP END

.L5:	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.Ltail
	add	%fp, -40, %l6

.Lend:	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	add	%fp, -24, %l6
.Ltail:	stx	%g2,[%l3]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%l5],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L4:	fdtox	%f6,%f6
	std	%f4,[%l4]
	fxtod	%f8,%f2
	std	%f6,[%l4+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	ldd	[%l3],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L3:	fdtox	%f6,%f6
	std	%f4,[%l6]
	fxtod	%f8,%f2
	std	%f6,[%l6+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L2:	fdtox	%f6,%f6
	std	%f4,[%l4]
	std	%f6,[%l4+8]

	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

.L1:	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

	ret
	restore	%g0,%g0,%o0

EPILOGUE(mpn_sqr_diagonal)