xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/s390_64/sqr_basecase.asm (revision 9fb66d812c00ebfb445c0b47dea128f32aa6fe96)
1dnl  S/390-64 mpn_sqr_basecase.
2
3dnl  Copyright 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C            cycles/limb
34C z900		 ?
35C z990		23
36C z9		 ?
37C z10		28
38C z196		 ?
39
40C TODO
41C  * Clean up.
42C  * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
43C    This will ask for basecase handling of n = 3.
44C  * Update counters and pointers more straightforwardly, possibly lowering
45C    register usage.
46C  * Should we use this allocation-free style for more sqr_basecase asm
47C    implementations?  The only disadvantage is that it requires R != U.
48C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
49C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
50C    more.
51
52C INPUT PARAMETERS
C rp, up and n arrive in %r2, %r3 and %r4.  The general (n >= 3) path advances
C rp and up and counts n down, so it keeps copies in the *_saved registers.
53define(`rp',	`%r2')		C result pointer, written with stg/stmg
54define(`up',	`%r3')		C source pointer, read with lg/mlg
55define(`n',	`%r4')		C limb count; decremented by 2 on entry
56
C Working registers.  zero, rp_saved, up_saved and n_saved are only set up
C after the corresponding registers have been stored with stmg.
57define(`zero',	`%r8')		C holds 0; added with alcgr to fold in the carry
58define(`rp_saved',	`%r9')		C copy of rp taken at L(gen)
59define(`up_saved',	`%r13')		C copy of up taken at L(gen)
60define(`n_saved',	`%r14')		C copy of n (entry n - 2) taken at L(gen); %r14
C					is also the register the function returns
C					through, and is reloaded by lmg before br
61
62ASM_START()
C mpn_sqr_basecase (rp, up, n)
C
C Stores the square of the n-limb operand at up as 2n limbs at rp.  Limbs are
C 64 bits wide (all accesses are 8-byte lg/stg).
C
C   n = 1, n = 2  straight-line code.
C   n >= 3        1. mul_1:     rp[1..] = up[1..] * up[0]
C                 2. addmul_1:  for each following up[i], add the products
C                               up[i+1..] * up[i] into rp at the matching
C                               offset (rp advances 2 limbs per pass)
C                 3. sqr_diag_addlsh1: double the accumulated cross products
C                               and add the squares up[i]^2.
C
C rp[] is the only workspace and up[] is read again in step 3 after rp[] has
C been written; the TODO list above notes that this requires R != U.
C
C In the notation below "a:b" is a 128-bit value with high limb a, low limb b.
C mlgr %rE, %rX leaves %rE:%rE+1 = %rE+1 * %rX (E even).
C
C NOTE(review): n >= 1 is assumed; any n < 2 falls into the n = 1 code.
63PROLOGUE(mpn_sqr_basecase)
64	aghi	n, -2			C n -= 2; condition code reflects n - 2
65	jhe	L(ge2)			C taken when n - 2 >= 0
66
67C n = 1
68	lg	%r5, 0(up)
69	mlgr	%r4, %r5		C %r4:%r5 = u0 * u0
70	stg	%r5, 0(rp)		C low limb
71	stg	%r4, 8(rp)		C high limb
72	br	%r14
73
C The condition code tested here is still the one set by aghi at entry.
74L(ge2):	jne	L(gen)
75
76C n = 2
C Result = u0^2 + 2*u0*u1*B + u1^2*B^2 with B = 2^64.  Only %r6-%r8 are needed
C beyond the registers that are not saved, so only those are stored.
77	stmg	%r6, %r8, 48(%r15)
78	lghi	zero, 0
79
80	lg	%r5, 0(up)
81	mlgr	%r4, %r5		C u0 * u0
82	lg	%r1, 8(up)
83	mlgr	%r0, %r1		C u1 * u1
84	stg	%r5, 0(rp)		C rp[0] = low limb of u0^2
85
86	lg	%r7, 0(up)
87	mlg	%r6, 8(up)		C u0 * u1
C Double the cross product %r6:%r7; the bit shifted out goes into %r0, the
C top limb of u1^2.
88	algr	%r7, %r7
89	alcgr	%r6, %r6
90	alcgr	%r0, zero
91
C Add the doubled cross product to the two middle limbs (%r4 = high limb of
C u0^2, %r1 = low limb of u1^2) and propagate the carry into %r0.
92	algr	%r4, %r7
93	alcgr	%r1, %r6
94	alcgr	%r0, zero
95	stg	%r4, 8(rp)
96	stg	%r1, 16(rp)
97	stg	%r0, 24(rp)
98
99	lmg	%r6, %r8, 48(%r15)
100	br	%r14
101
102L(gen):
103C mul_1 =======================================================================
C Here n holds the entry count minus 2 (>= 1).  Computes
C rp[1..] = up[1..] * up[0]; rp[0] is not written until sqr_diag_addlsh1.
C   %r6  = multiplier up[0]
C   %r10 = high limb of the previous product (carry limb)
C   %r12 = byte index into up/rp, %r5 = loop counter
104
105	stmg	%r6, %r14, 48(%r15)	C store %r6-%r14; reloaded by lmg at exit
106	lghi	zero, 0
107	lgr	up_saved, up
108	lgr	rp_saved, rp
109	lgr	n_saved, n
110
111	lg	%r6, 0(up)
112	lg	%r11, 8(up)
113	lghi	%r12, 16		C init index register
114	mlgr	%r10, %r6		C %r10:%r11 = up[1] * up[0]
115	lgr	%r5, n
116	stg	%r11, 8(rp)
117	cr	%r15, %r15		C clear carry flag
118
C Each iteration: %r0:%r1 = up[i] * up[0]; low limb plus previous high limb
C plus carry is stored; the carry of that add is consumed by the alcgr of the
C next iteration.
119L(tm):	lg	%r1, 0(%r12,up)
120	mlgr	%r0, %r6
121	alcgr	%r1, %r10
122	lgr	%r10, %r0		C copy high part to carry limb
123	stg	%r1, 0(%r12,rp)
124	la	%r12, 8(%r12)
125	brctg	%r5, L(tm)
126
127	alcgr	%r0, zero		C fold the last carry into the top limb
128	stg	%r0, 0(%r12,rp)
129
130C addmul_1 loop ===============================================================
C n is decremented once here and once per outer pass; it is also the inner
C loop count of each pass.  When it is already zero here (entry n = 3) the
C outer loop is skipped and only the tail at L(outer_end) runs.
131
132	aghi	n, -1
133	je	L(outer_end)
134L(outer_loop):
135
136	la	rp, 16(rp)		C rp += 2
137	la	up, 8(up)		C up += 1
138	lg	%r6, 0(up)		C multiplier for this pass
139	lg	%r11, 8(up)
140	lghi	%r12, 16		C init index register
141	mlgr	%r10, %r6		C %r10:%r11 = up[1] * up[0]
142	lgr	%r5, n
143	alg	%r11, 8(rp)		C rp[1] += low limb; carry feeds L(tam)
144	stg	%r11, 8(rp)
145
C Each iteration: %r0:%r1 = up[i] * up[0].  Two carry chains are merged: the
C first alcgr adds the rp limb plus the carry left by the previous add, and
C its carry goes into %r0; algr then adds the previous high limb (%r10) and
C leaves its carry in the condition code for the next iteration.
146L(tam):	lg	%r1, 0(%r12,up)
147	lg	%r7, 0(%r12,rp)
148	mlgr	%r0, %r6
149	alcgr	%r1, %r7
150	alcgr	%r0, zero
151	algr	%r1, %r10
152	lgr	%r10, %r0
153	stg	%r1, 0(%r12,rp)
154	la	%r12, 8(%r12)
155	brctg	%r5, L(tam)
156
157	alcgr	%r0, zero		C fold the last carry into the top limb
158	stg	%r0, 0(%r12,rp)		C stored, not added, at this position
159
160	brctg	n, L(outer_loop)
161L(outer_end):
162
C Tail: the one remaining cross product, 8(up) * 16(up), is added to the limb
C most recently stored (still in %r0) and written to 24(rp) and 32(rp).
163	lg	%r6, 8(up)
164	lg	%r1, 16(up)
165	lgr	%r7, %r0		C Same as: lg %r7, 24(,rp)
166	mlgr	%r0, %r6
167	algr	%r1, %r7
168	alcgr	%r0, zero
169	stg	%r1, 24(rp)
170	stg	%r0, 32(rp)
171
172C sqr_diag_addlsh1 ============================================================
C Walks rp[] from the start again: every pass doubles two limbs of the
C accumulated cross products and adds the matching limbs of the squares.
C   %r0        = high limb of the previous square
C   %r10:%r11  = current square
C The carry lives in the condition code across iterations.
173
C From here on up and rp expand to the copies saved at L(gen).
174define(`up', `up_saved')
175define(`rp', `rp_saved')
176	la	n, 1(n_saved)		C loop count = n_saved + 1 (entry n - 1)
177
178	lg	%r1, 0(up)
179	mlgr	%r0, %r1		C %r0:%r1 = up[0]^2
180	stg	%r1, 0(rp)		C rp[0], not written by mul_1
181C	clr	%r15, %r15		C clear carry (already clear per above)
182
183L(top):	lg	%r11, 8(up)
184	la	up, 8(up)
185	lg	%r6, 8(rp)
186	lg	%r7, 16(rp)
187	mlgr	%r10, %r11		C %r10:%r11 = up[i]^2
188	alcgr	%r6, %r6		C 2 * limb + carry from previous pass
189	alcgr	%r7, %r7
190	alcgr	%r10, zero		C propagate carry to high product limb
191	algr	%r6, %r0		C + high limb of previous square
192	alcgr	%r7, %r11		C + low limb of this square
193	stmg	%r6, %r7, 8(rp)
194	la	rp, 16(rp)
195	lgr	%r0, %r10		C copy carry limb
196	brctg	n, L(top)
197
198	alcgr	%r0, zero		C fold the last carry into the top limb
199	stg	%r0, 8(rp)
200
201	lmg	%r6, %r14, 48(%r15)	C reload %r6-%r14, including return address
202	br	%r14
203EPILOGUE()
204