xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/s390_64/sqr_basecase.asm (revision 8450a7c42673d65e3b1f6560d3b6ecd317a6cbe8)
1dnl  S/390-64 mpn_sqr_basecase.
2
3dnl  Copyright 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C            cycles/limb
23C z900		 ?
24C z990		23
25C z9		 ?
26C z10		 ?
27C z196		 ?
28
29C TODO
30C  * Clean up.
31C  * Stop iterating the addmul_1 loop at n = 2 at the latest, and implement a
32C    longer tail.  This will require basecase handling of n = 3.
33C  * Update counters and pointers more straightforwardly, possibly lowering
34C    register usage.
35C  * Should we use this allocation-free style for more sqr_basecase asm
36C    implementations?  The only disadvantage is that it requires R != U.
37C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
38C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
39C    more.
40
41C INPUT PARAMETERS
C rp = where the result limbs are stored, up = operand limbs, n = limb count.
42define(`rp',	`%r2')
43define(`up',	`%r3')
44define(`n',	`%r4')
45
C Additional registers.  zero is loaded with 0 and serves as the addend
C whenever only a carry has to be propagated.  The *_saved registers keep the
C entry values of rp and up, and n as already reduced by 2, for the final
C sqr_diag_addlsh1 phase of the general path.  n_saved shares %r14 with the
C return address, which the general path stores to the stack before reusing
C the register and reloads before returning.
46define(`zero',	`%r8')
47define(`rp_saved',	`%r9')
48define(`up_saved',	`%r13')
49define(`n_saved',	`%r14')
50
51ASM_START()
C mpn_sqr_basecase -- square the n-limb operand at up, storing 2n limbs at rp.
C
C In:    rp (%r2) = result pointer, up (%r3) = operand pointer,
C        n (%r4) = limb count.
C Paths: n = 1 and n = 2 are straight-line code; everything larger goes through
C        L(gen), which runs three phases: mul_1 (first row of cross products),
C        addmul_1 (remaining rows of cross products), and sqr_diag_addlsh1
C        (double the cross-product sum and add the squares u[i]^2).
C Regs:  the n = 1 path touches only %r4 and %r5.  The n = 2 path stores
C        %r6-%r8 at 48(%r15) and reloads them; the general path does the same
C        for %r6-%r14.
C Every 64x64->128 product comes from mlgr/mlg, with the high limb in the even
C register and the low limb in the odd register of the pair.  The carry chains
C rely on the instructions placed between an add and the following alcgr
C (lg, stg, stmg, la, lgr, mlgr, brctg) leaving the condition code untouched,
C so the instruction order must not be changed casually.
52PROLOGUE(mpn_sqr_basecase)
C n -= 2, then branch on the resulting condition code.  From here on n holds
C the limb count minus 2.
53	aghi	n, -2
54	jhe	L(ge2)
55
56C n = 1
C rp[0..1] = up[0]^2.  The product overwrites n (%r4), which is not needed
C any more on this path.
57	lg	%r5, 0(up)
58	mlgr	%r4, %r5
59	stg	%r5, 0(rp)
60	stg	%r4, 8(rp)
61	br	%r14
62
C The condition code is still the one set by aghi: a nonzero result here means
C the original limb count is at least 3.
63L(ge2):	jne	L(gen)
64
65C n = 2
C rp[0..3] = u0^2 + 2*u0*u1*2^64 + u1^2*2^128.
66	stmg	%r6, %r8, 48(%r15)
67	lghi	zero, 0
68
69	lg	%r5, 0(up)
70	mlgr	%r4, %r5		C u0 * u0
71	lg	%r1, 8(up)
72	mlgr	%r0, %r1		C u1 * u1
73	stg	%r5, 0(rp)
74
75	lg	%r7, 0(up)
76	mlg	%r6, 8(up)		C u0 * u1
C Double the cross product in %r6:%r7; the bit shifted out of the top is added
C to the highest result limb in %r0.
77	algr	%r7, %r7
78	alcgr	%r6, %r6
79	alcgr	%r0, zero
80
C Add the doubled cross product to the two middle limbs (high limb of u0^2 and
C low limb of u1^2) and propagate the carry into the highest limb.
81	algr	%r4, %r7
82	alcgr	%r1, %r6
83	alcgr	%r0, zero
84	stg	%r4, 8(rp)
85	stg	%r1, 16(rp)
86	stg	%r0, 24(rp)
87
88	lmg	%r6, %r8, 48(%r15)
89	br	%r14
90
91L(gen):
92C mul_1 =======================================================================
C General path, original limb count >= 3.  Writes up[0] * up[1..] to rp[1..].
C The u0*u1 product is formed ahead of the loop, the loop handles the
C remaining n limbs (n = original count - 2), and the final high limb is
C stored after the loop.  The entry values of rp and up and the reduced n are
C copied to the *_saved registers for the last phase.
93
94	stmg	%r6, %r14, 48(%r15)
95	lghi	zero, 0
96	lgr	up_saved, up
97	lgr	rp_saved, rp
98	lgr	n_saved, n
99
100	lg	%r6, 0(up)
101	lg	%r11, 8(up)
102	lghi	%r12, 16		C init index register
103	mlgr	%r10, %r6
104	lgr	%r5, n
105	stg	%r11, 8(rp)
106	cr	%r15, %r15		C clear carry flag
107
C Each pass: low limb of u[i]*u0 plus the previous high limb plus carry.  The
C carry produced by alcgr is consumed by the alcgr of the next pass.
108L(tm):	lg	%r1, 0(%r12,up)
109	mlgr	%r0, %r6
110	alcgr	%r1, %r10
111	lgr	%r10, %r0		C copy high part to carry limb
112	stg	%r1, 0(%r12,rp)
113	la	%r12, 8(%r12)
114	brctg	%r5, L(tm)
115
C Fold the last carry into the final high limb and store it.
116	alcgr	%r0, zero
117	stg	%r0, 0(%r12,rp)
118
119C addmul_1 loop ===============================================================
C n -= 1 leaves the number of outer passes (original count - 3); the loop is
C skipped when that is zero.  Each outer pass advances up by one limb and rp
C by two limbs, then adds up[0] * up[1..] into rp[1..].  brctg decrements n,
C so the inner trip count copied into %r5 shrinks by one per outer pass.
120
121	aghi	n, -1
122	je	L(outer_end)
123L(outer_loop):
124
125	la	rp, 16(rp)		C rp += 2
126	la	up, 8(up)		C up += 1
127	lg	%r6, 0(up)
128	lg	%r11, 8(up)
129	lghi	%r12, 16		C init index register
130	mlgr	%r10, %r6
131	lgr	%r5, n
C The first product limb is added to rp[1] here; the carry of this alg starts
C the chain consumed by the first alcgr in the inner loop.
132	alg	%r11, 8(rp)
133	stg	%r11, 8(rp)
134
C Each pass: add the carry and the rp limb to the low product limb, push the
C resulting carry into the high product limb, then add the previous high limb.
C The carry of that last algr is consumed by the alcgr of the next pass.
135L(tam):	lg	%r1, 0(%r12,up)
136	lg	%r7, 0(%r12,rp)
137	mlgr	%r0, %r6
138	alcgr	%r1, %r7
139	alcgr	%r0, zero
140	algr	%r1, %r10
141	lgr	%r10, %r0
142	stg	%r1, 0(%r12,rp)
143	la	%r12, 8(%r12)
144	brctg	%r5, L(tam)
145
C Fold the last carry into the final high limb and store it.
146	alcgr	%r0, zero
147	stg	%r0, 0(%r12,rp)
148
149	brctg	n, L(outer_loop)
150L(outer_end):
C Last cross product, up[1] * up[2] relative to the current up.  Its low limb
C is added to the limb most recently stored at 24(rp), which is still in %r0
C and is therefore copied instead of reloaded; the high limb plus carry goes
C to 32(rp).
151
152	lg	%r6, 8(up)
153	lg	%r1, 16(up)
154	lgr	%r7, %r0		C Same as: lg %r7, 24(,rp)
155	mlgr	%r0, %r6
156	algr	%r1, %r7
157	alcgr	%r0, zero
158	stg	%r1, 24(rp)
159	stg	%r0, 32(rp)
160
161C sqr_diag_addlsh1 ============================================================
C From here on up and rp expand to the saved entry pointers.  The loop doubles
C the cross-product sum held in rp[1..] and adds the squares u[i]^2, two result
C limbs per pass.  n = n_saved + 1 passes (original count - 1); la computes
C this without changing the condition code, so the carry left clear by the
C preceding alcgr is still clear when the loop is entered.
162
163define(`up', `up_saved')
164define(`rp', `rp_saved')
165	la	n, 1(n_saved)
166
C rp[0] = low limb of u0^2; the high limb stays in %r0 for the first pass.
167	lg	%r1, 0(up)
168	mlgr	%r0, %r1
169	stg	%r1, 0(rp)
170C	clr	%r15, %r15		C clear carry (already clear per above)
171
C Each pass: square the next operand limb into %r10:%r11, double rp[1] and
C rp[2] with the incoming carry, push the doubling carry into the high square
C limb, then add the previous high square limb (%r0) and the new low square
C limb (%r11).  The carry of the last alcgr enters the doubling of the next
C pass.
172L(top):	lg	%r11, 8(up)
173	la	up, 8(up)
174	lg	%r6, 8(rp)
175	lg	%r7, 16(rp)
176	mlgr	%r10, %r11
177	alcgr	%r6, %r6
178	alcgr	%r7, %r7
179	alcgr	%r10, zero		C propagate carry to high product limb
180	algr	%r6, %r0
181	alcgr	%r7, %r11
182	stmg	%r6, %r7, 8(rp)
183	la	rp, 16(rp)
184	lgr	%r0, %r10		C copy carry limb
185	brctg	n, L(top)
186
C Top result limb: high limb of the last square plus the pending carry.
187	alcgr	%r0, zero
188	stg	%r0, 8(rp)
189
C Reload %r6-%r14 (this also brings the return address back into %r14) and
C return.
190	lmg	%r6, %r14, 48(%r15)
191	br	%r14
192EPILOGUE()
193