xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/s390_32/esame/sqr_basecase.asm (revision 72c7faa4dbb41dbb0238d6b4a109da0d4b236dd4)
1dnl  S/390-32 mpn_sqr_basecase.
2
3dnl  Copyright 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C            cycles/limb
34C z900		 ?
35C z990		23
36C z9		 ?
37C z10		 ?
38C z196		 ?
39
40C TODO
41C  * Clean up.
42C  * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
43C    This will ask for basecase handling of n = 3.
44C  * Update counters and pointers more straightforwardly, possibly lowering
45C    register usage.
46C  * Should we use this allocation-free style for more sqr_basecase asm
47C    implementations?  The only disadvantage is that it requires R != U.
48C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
49C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
50C    more.
51
52C INPUT PARAMETERS
53define(`rp',	`%r2')		C destination limb pointer; results are stored through it
54define(`up',	`%r3')		C source limb pointer; operand limbs are loaded through it
55define(`n',	`%r4')		C limb count; overwritten by mlr in the n = 1 and n = 2 paths
56C WORKING REGISTERS
57define(`zero',	`%r8')		C loaded with 0 by lhi; used as the addend when alcr should add only the carry
58define(`rp_saved',	`%r9')		C copy of rp taken at L(gen), reused as rp by the final pass
59define(`up_saved',	`%r13')		C copy of up taken at L(gen), reused as up by the final pass
60define(`n_saved',	`%r14')		C copy of n - 2 taken at L(gen); r14 is also the return register and is reloaded by lm before br
61
62ASM_START()
63PROLOGUE(mpn_sqr_basecase)		C {rp, 2n} = {up, n}^2; separate paths for n = 1, n = 2 and n >= 3
64	ahi	n, -2		C n -= 2; CC reflects the sign of n - 2
65	jhe	L(ge2)		C n - 2 >= 0: two or more limbs
66
67C n = 1: rp[0..1] = u0 * u0
68	l	%r5, 0(up)		C u0
69	mlr	%r4, %r5		C r4:r5 = u0 * u0 (high:low); n is dead on this path
70	st	%r5, 0(rp)		C rp[0] = low limb
71	st	%r4, 4(rp)		C rp[1] = high limb
72	br	%r14		C return
73
74L(ge2):	jne	L(gen)		C n - 2 != 0 (CC still from ahi): three or more limbs
75
76C n = 2: rp[0..3] = u0^2 + 2*u0*u1*B + u1^2*B^2, with B = 2^32
77	stm	%r6, %r8, 24(%r15)		C save r6-r8; reloaded by lm before return
78	lhi	zero, 0		C zero = 0 for the carry-only alcr below
79
80	l	%r5, 0(up)		C u0
81	mlr	%r4, %r5		C u0 * u0 -> r4:r5
82	l	%r1, 4(up)		C u1
83	mlr	%r0, %r1		C u1 * u1 -> r0:r1
84	st	%r5, 0(rp)		C rp[0] = low limb of u0 * u0
85
86	l	%r7, 0(up)		C reload u0 (r5 now holds a product limb)
87	ml	%r6, 4(up)		C u0 * u1 -> r6:r7
88	alr	%r7, %r7		C double the cross product, low limb first
89	alcr	%r6, %r6		C double high limb plus carry
90	alcr	%r0, zero		C carry out of the doubling goes into the top limb
91
92	alr	%r4, %r7		C limb 1: high(u0^2) + low(2*u0*u1)
93	alcr	%r1, %r6		C limb 2: low(u1^2) + high(2*u0*u1) + carry
94	alcr	%r0, zero		C limb 3: absorb the remaining carry
95	st	%r4, 4(rp)		C rp[1]
96	st	%r1, 8(rp)		C rp[2]
97	st	%r0, 12(rp)		C rp[3]
98
99	lm	%r6, %r8, 24(%r15)		C restore r6-r8
100	br	%r14		C return
101
102L(gen):		C n >= 3; the n register holds original n - 2 here
103C mul_1 =======================================================================
104C First row: rp[1..n] = u0 * up[1..n-1].  rp[0] is written later by the final pass.
105	stm	%r6, %r14, 24(%r15)		C save r6-r14; reloaded by lm before return
106	lhi	zero, 0		C zero = 0 for the carry-only alcr below
107	lr	up_saved, up		C keep the original up for the final pass
108	lr	rp_saved, rp		C keep the original rp for the final pass
109	lr	n_saved, n		C keep original n - 2 for the final pass
110
111	l	%r6, 0(up)		C r6 = u0, multiplier for the whole row
112	l	%r11, 4(up)		C u1
113	lhi	%r12, 8		C init index register (byte offset of limb 2)
114	mlr	%r10, %r6		C r10:r11 = u1 * u0
115	lr	%r5, n		C loop count = original n - 2
116	st	%r11, 4(rp)		C rp[1] = low limb; r10 is the carry limb
117	cr	%r15, %r15		C clear carry flag
118
119L(tm):	l	%r1, 0(%r12,up)		C next source limb
120	mlr	%r0, %r6		C r0:r1 = limb * u0
121	alcr	%r1, %r10		C low limb + previous high limb + pending carry
122	lr	%r10, %r0		C copy high part to carry limb
123	st	%r1, 0(%r12,rp)		C store result limb
124	la	%r12, 4(%r12)		C advance index by one limb; la leaves the carry untouched
125	brct	%r5, L(tm)		C loop; brct leaves the carry untouched
126
127	alcr	%r0, zero		C fold the last pending carry into the top limb
128	st	%r0, 0(%r12,rp)		C store top limb of the row
129
130C addmul_1 loop ===============================================================
131C Each outer pass adds up[i] * up[i+1..n-1] into rp starting at limb 2i+1; the last row is done at L(outer_end).
132	ahi	n, -1		C n = original n - 3 = number of outer passes
133	je	L(outer_end)		C no outer pass when original n = 3
134L(outer_loop):
135
136	la	rp, 8(rp)		C rp += 2
137	la	up, 4(up)		C up += 1
138	l	%r6, 0(up)		C r6 = multiplier limb for this row
139	l	%r11, 4(up)		C first multiplicand limb of the row
140	lhi	%r12, 8		C init index register
141	mlr	%r10, %r6		C r10:r11 = first product of the row
142	lr	%r5, n		C inner loop count = current outer counter
143	al	%r11, 4(rp)		C add existing rp limb; carry is consumed in L(tam)
144	st	%r11, 4(rp)		C store updated limb; r10 is the carry limb
145
146L(tam):	l	%r1, 0(%r12,up)		C next source limb
147	l	%r7, 0(%r12,rp)		C existing result limb
148	mlr	%r0, %r6		C r0:r1 = limb * multiplier
149	alcr	%r1, %r7		C low limb + existing limb + pending carry
150	alcr	%r0, zero		C propagate that carry into the high limb
151	alr	%r1, %r10		C add previous high limb; carry stays pending for the next alcr
152	lr	%r10, %r0		C high limb becomes the next carry limb
153	st	%r1, 0(%r12,rp)		C store result limb
154	la	%r12, 4(%r12)		C advance index by one limb; carry untouched
155	brct	%r5, L(tam)		C loop; carry untouched
156
157	alcr	%r0, zero		C fold the last pending carry into the top limb
158	st	%r0, 0(%r12,rp)		C store top limb of the row (r0 is reused at L(outer_end))
159
160	brct	n, L(outer_loop)		C next row, one limb shorter
161L(outer_end):		C last row: the single product of the two most significant limbs
162
163	l	%r6, 4(up)		C second most significant source limb
164	l	%r1, 8(up)		C most significant source limb
165	lr	%r7, %r0		C Same as: l %r7, 12(,rp)
166	mlr	%r0, %r6		C r0:r1 = product of the two limbs
167	alr	%r1, %r7		C add the limb already at 12(rp)
168	alcr	%r0, zero		C absorb carry; this is the last CC-setting instruction before L(top)
169	st	%r1, 12(rp)		C store low limb
170	st	%r0, 16(rp)		C store high limb
171
172C sqr_diag_addlsh1 ============================================================
173C Final pass: double the cross products stored at rp[1..2n-2] and add the squares up[i]^2.
174define(`up', `up_saved')		C from here on up names the saved original pointer
175define(`rp', `rp_saved')		C from here on rp names the saved original pointer
176	la	n, 1(n_saved)		C n = original n - 1 = loop count
177
178	l	%r1, 0(up)		C u0
179	mlr	%r0, %r1		C r0:r1 = u0 * u0
180	st	%r1, 0(rp)		C rp[0] = low limb; r0 is carried into the loop
181C	clr	%r15, %r15		C clear carry (already clear per above)
182
183L(top):	l	%r11, 4(up)		C next source limb
184	la	up, 4(up)		C up += 1
185	l	%r6, 4(rp)		C lower cross-product limb of this pair
186	l	%r7, 8(rp)		C upper cross-product limb of this pair
187	mlr	%r10, %r11		C r10:r11 = square of the source limb
188	alcr	%r6, %r6		C double, taking in the carry pending from the previous pass
189	alcr	%r7, %r7		C double with carry
190	alcr	%r10, zero		C propagate carry to high product limb
191	alr	%r6, %r0		C add the high limb of the previous square
192	alcr	%r7, %r11		C add the low limb of this square; carry stays pending
193	stm	%r6, %r7, 4(rp)		C store both finished limbs
194	la	rp, 8(rp)		C rp += 2; carry untouched
195	lr	%r0, %r10		C copy carry limb
196	brct	n, L(top)		C loop; carry untouched
197
198	alcr	%r0, zero		C fold the last pending carry into the top limb
199	st	%r0, 4(rp)		C store the most significant result limb
200
201	lm	%r6, %r14, 24(%r15)		C restore r6-r14, including the return address in r14
202	br	%r14		C return
203EPILOGUE()
204