xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/pa64/sqr_diagonal.asm (revision 4fee23f98c45552038ad6b5bd05124a41302fb01)
1dnl  HP-PA 2.0 64-bit mpn_sqr_diagonal.
2
3dnl  Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20
21dnl  This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
22dnl  PA8500.  The cache would saturate at 5 cycles/limb, so there is some room
23dnl  for optimization.
24
25include(`../config.m4')
26
27C INPUT PARAMETERS
28define(`rp',`%r26')		C destination pointer; two limbs are stored per input limb
29define(`up',`%r25')		C source pointer; read with 8-byte post-increment loads
30define(`n',`%r24')		C limb count; decremented by the addib loop tests
31
32define(`p00',`%r28')		C low result limb reloaded from -16(rp) for the fix-up
33define(`p32',`%r29')		C cross product l*r reloaded from the stack scratch slot
34define(`p64',`%r31')		C high result limb reloaded from -8(rp) for the fix-up
35define(`t0',`%r19')		C part of the shifted cross product added to p00
36define(`t1',`%r20')		C part of the shifted cross product added to p64
37
C Emit the assembler .level directive: 2.0w when HAVE_ABI_2_0w is defined,
C otherwise 2.0.
38ifdef(`HAVE_ABI_2_0w',
39`	.level	2.0w
40',`	.level	2.0
41')
C mpn_sqr_diagonal -- square each limb of an n-limb operand.
C
C For every 64-bit limb u read from up, the 128-bit value u*u is written to
C two consecutive limbs at rp, least significant limb first.  up advances by
C 8 bytes and rp by 16 bytes per limb.  The first limb is loaded before n is
C tested, and the first test is for n-1 == 0, so the code assumes n >= 1.
C
C Method: each limb sits in an FP register and is viewed as two 32-bit halves
C (l = left/high half, r = right/low half).  xmpyu forms l*r, r*r and l*l.
C r*r and l*l are stored directly as the low and high result limbs.  The cross
C product l*r is passed through a stack scratch slot into an integer register
C and then added into the stored pair shifted left by 33 bits, which is the
C 2*l*r*2^32 term of the square.
C
C The main loop is unrolled twice and software pipelined: the multiplies for
C the current limb are interleaved with the integer fix-up of the previous
C limb.  The two halves alternate between %fr8..%fr11 with scratch slot
C -120(%r30) and %fr4..%fr7 with scratch slot -128(%r30).
C
C Branches here have a delay slot: the instruction following each addib or
C bve is executed as part of the branch sequence.
42PROLOGUE(mpn_sqr_diagonal)
43	ldo		128(%r30),%r30		C reserve 128 bytes of stack scratch
44
45	fldds,ma	8(up),%fr8		C fr8 = first limb, up += 8
46	addib,=		-1,n,L(end1)		C n -= 1; a one-limb operand finishes at end1
47	nop		C delay slot, nothing useful to do yet
48	fldds,ma	8(up),%fr4		C fr4 = second limb, up += 8
49	xmpyu		%fr8l,%fr8r,%fr10		C cross product l*r of the first limb
50	fstd		%fr10,-120(%r30)		C park it in the scratch slot for the integer side
51	xmpyu		%fr8r,%fr8r,%fr9		C r*r
52	fstd		%fr9,0(rp)		C stored as the low result limb
53	xmpyu		%fr8l,%fr8l,%fr11		C l*l
54	fstd		%fr11,8(rp)		C stored as the high result limb
55	addib,=		-1,n,L(end2)		C n -= 1; a two-limb operand finishes at end2
56	ldo		16(rp),rp		C delay slot: rp += 16, previous pair is now at -16(rp)
57
58LDEF(loop)
C First half: multiply the limb in fr4, fix up the pair of the limb from fr8.
59	fldds,ma	8(up),%fr8		C load next up limb
60	xmpyu		%fr4l,%fr4r,%fr6		C cross product l*r of the limb in fr4
61	fstd		%fr6,-128(%r30)		C parked in the other scratch slot
62	xmpyu		%fr4r,%fr4r,%fr5	C multiply in fp regs
63	fstd		%fr5,0(rp)		C r*r stored as the low result limb
64	xmpyu		%fr4l,%fr4l,%fr7		C l*l
65	fstd		%fr7,8(rp)		C stored as the high result limb
66	ldd		-120(%r30),p32		C cross product of the previous limb
67	ldd		-16(rp),p00		C accumulate in int regs
68	ldd		-8(rp),p64		C high limb of the previous pair
69	depd,z		p32,30,31,t0		C t0 = low 64 bits of (p32 << 33)
70	add		t0,p00,p00		C low limb += t0, producing a carry
71	std		p00,-16(rp)
72	extrd,u		p32,32,33,t1		C t1 = p32 >> 31, the part of (p32 << 33) above bit 63
73	add,dc		t1,p64,p64		C high limb += t1 + carry from the low add
74	std		p64,-8(rp)
75	addib,=		-1,n,L(exit)		C n -= 1; when zero the limb in fr8 is the last one
76	ldo		16(rp),rp		C delay slot: rp += 16
77
C Second half: multiply the limb in fr8, fix up the pair of the limb from fr4.
78	fldds,ma	8(up),%fr4		C load the following limb
79	xmpyu		%fr8l,%fr8r,%fr10		C cross product l*r of the limb in fr8
80	fstd		%fr10,-120(%r30)
81	xmpyu		%fr8r,%fr8r,%fr9		C r*r
82	fstd		%fr9,0(rp)		C stored as the low result limb
83	xmpyu		%fr8l,%fr8l,%fr11		C l*l
84	fstd		%fr11,8(rp)		C stored as the high result limb
85	ldd		-128(%r30),p32		C cross product of the previous limb
86	ldd		-16(rp),p00
87	ldd		-8(rp),p64
88	depd,z		p32,30,31,t0		C t0 = low 64 bits of (p32 << 33)
89	add		t0,p00,p00
90	std		p00,-16(rp)
91	extrd,u		p32,32,33,t1		C t1 = p32 >> 31
92	add,dc		t1,p64,p64		C propagate the carry into the high limb
93	std		p64,-8(rp)
94	addib,<>	-1,n,L(loop)		C n -= 1; keep looping while limbs remain
95	ldo		16(rp),rp		C delay slot: rp += 16
96
C Wind-down when the last limb is in fr4.  The cross product of the limb
C before it is still pending in -120(%r30).  Reached by falling out of the
C loop or directly from the entry code for a two-limb operand.
97LDEF(end2)
98	xmpyu		%fr4l,%fr4r,%fr6		C cross product of the last limb
99	fstd		%fr6,-128(%r30)
100	xmpyu		%fr4r,%fr4r,%fr5		C r*r
101	fstd		%fr5,0(rp)
102	xmpyu		%fr4l,%fr4l,%fr7		C l*l
103	fstd		%fr7,8(rp)
104	ldd		-120(%r30),p32		C pending cross product of the previous limb
105	ldd		-16(rp),p00
106	ldd		-8(rp),p64
107	depd,z		p32,30,31,t0		C t0 = low 64 bits of (p32 << 33)
108	add		t0,p00,p00
109	std		p00,-16(rp)
110	extrd,u		p32,32,33,t1		C t1 = p32 >> 31
111	add,dc		t1,p64,p64
112	std		p64,-8(rp)
113	ldo		16(rp),rp		C step to just past the last pair
114	ldd		-128(%r30),p32		C cross product of the last limb
115	ldd		-16(rp),p00
116	ldd		-8(rp),p64
117	depd,z		p32,30,31,t0
118	add		t0,p00,p00
119	std		p00,-16(rp)
120	extrd,u		p32,32,33,t1
121	add,dc		t1,p64,p64
122	std		p64,-8(rp)
123	bve		(%r2)		C return through the address in %r2
124	ldo		-128(%r30),%r30		C delay slot: release the stack scratch
125
C Wind-down when the last limb is in fr8 (branch out of the middle of the
C loop).  The cross product of the limb before it is pending in -128(%r30).
C Here the cross product is shifted by 32 and added twice, which yields the
C same 2*l*r*2^32 term as the single 33-bit shift used in the loop.
126LDEF(exit)
127	xmpyu		%fr8l,%fr8r,%fr10		C cross product of the last limb
128	fstd		%fr10,-120(%r30)
129	xmpyu		%fr8r,%fr8r,%fr9		C r*r
130	fstd		%fr9,0(rp)
131	xmpyu		%fr8l,%fr8l,%fr11		C l*l
132	fstd		%fr11,8(rp)
133	ldd		-128(%r30),p32		C pending cross product of the previous limb
134	ldd		-16(rp),p00
135	ldd		-8(rp),p64
136	depd,z		p32,31,32,t0		C t0 = low 64 bits of (p32 << 32)
137	add		t0,p00,p00		C first addition, low limb
138	extrd,u		p32,31,32,t1		C t1 = p32 >> 32
139	add,dc		t1,p64,p64		C first addition, high limb with carry
140	add		t0,p00,p00		C second addition, low limb
141	add,dc		t1,p64,p64		C second addition, high limb with carry
142	std		p00,-16(rp)
143	std		p64,-8(rp)
144	ldo		16(rp),rp		C step to just past the last pair
145	ldd		-120(%r30),p32		C cross product of the last limb
146	ldd		-16(rp),p00
147	ldd		-8(rp),p64
148	depd,z		p32,31,32,t0
149	add		t0,p00,p00
150	extrd,u		p32,31,32,t1
151	add,dc		t1,p64,p64
152	add		t0,p00,p00
153	add,dc		t1,p64,p64
154	std		p00,-16(rp)
155	std		p64,-8(rp)
156	bve		(%r2)		C return through the address in %r2
157	ldo		-128(%r30),%r30		C delay slot: release the stack scratch
158
C One-limb operand: only the limb in fr8 exists and nothing is pending.
159LDEF(end1)
160	xmpyu		%fr8l,%fr8r,%fr10		C cross product l*r
161	fstd		%fr10,-128(%r30)
162	xmpyu		%fr8r,%fr8r,%fr9		C r*r
163	fstd		%fr9,0(rp)
164	xmpyu		%fr8l,%fr8l,%fr11		C l*l
165	fstd		%fr11,8(rp)
166	ldo		16(rp),rp		C the pair just stored is now at -16(rp)
167	ldd		-128(%r30),p32		C bring the cross product to the integer side
168	ldd		-16(rp),p00
169	ldd		-8(rp),p64
170	depd,z		p32,31,32,t0		C t0 = low 64 bits of (p32 << 32)
171	add		t0,p00,p00
172	extrd,u		p32,31,32,t1		C t1 = p32 >> 32
173	add,dc		t1,p64,p64
174	add		t0,p00,p00		C added a second time to double the term
175	add,dc		t1,p64,p64
176	std		p00,-16(rp)
177	std		p64,-8(rp)
178	bve		(%r2)		C return through the address in %r2
179	ldo		-128(%r30),%r30		C delay slot: release the stack scratch
180EPILOGUE(mpn_sqr_diagonal)
181