xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/sqr_basecase.asm (revision 501cd18a74d52bfcca7d9e7e3b0d472bbc870558)
1dnl  x86 generic mpn_sqr_basecase -- square an mpn number.
2
3dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20
21include(`../config.m4')
22
23
24C     cycles/crossproduct  cycles/triangleproduct
25C P5
26C P6
27C K6
28C K7
29C P4
30
31
32C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
33C
34C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
35C lot of function call overheads are avoided, especially when the size is
36C small.
37C
38C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
39C code size to do so here.
40C
41C Enhancements:
42C
43C The addmul loop here is also not unrolled like aorsmul_1.asm and
44C mul_basecase.asm are.  Perhaps it should be done.  It'd add to the
45C complexity, but if it's worth doing in the other places then it should be
46C worthwhile here.
47C
48C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
49C might be worth considering.  That'd add quite a bit to the code size, but
50C only as much as is used would be dragged into L1 cache.
51
52defframe(PARAM_SIZE,12)
53defframe(PARAM_SRC, 8)
54defframe(PARAM_DST, 4)
55
56	TEXT
57	ALIGN(8)
58PROLOGUE(mpn_sqr_basecase)
59deflit(`FRAME',0)
60
61	movl	PARAM_SIZE, %edx
62
63	movl	PARAM_SRC, %eax
64
65	cmpl	$2, %edx
66	movl	PARAM_DST, %ecx
67
68	je	L(two_limbs)
69	ja	L(three_or_more)
70
71
72C -----------------------------------------------------------------------------
73C one limb only
74	C eax	src
75	C ebx
76	C ecx	dst
77	C edx
78
79	movl	(%eax), %eax
80	mull	%eax
81	movl	%eax, (%ecx)
82	movl	%edx, 4(%ecx)
83	ret
84
85
86C -----------------------------------------------------------------------------
87	ALIGN(8)
88L(two_limbs):
89	C eax	src
90	C ebx
91	C ecx	dst
92	C edx
93
94	pushl	%ebx
95	pushl	%ebp
96
97	movl	%eax, %ebx
98	movl	(%eax), %eax
99
100	mull	%eax		C src[0]^2
101
102	pushl	%esi
103	pushl	%edi
104
105	movl	%edx, %esi	C dst[1]
106	movl	%eax, (%ecx)	C dst[0]
107
108	movl	4(%ebx), %eax
109	mull	%eax		C src[1]^2
110
111	movl	%eax, %edi	C dst[2]
112	movl	%edx, %ebp	C dst[3]
113
114	movl	(%ebx), %eax
115	mull	4(%ebx)		C src[0]*src[1]
116
117	addl	%eax, %esi
118
119	adcl	%edx, %edi
120
121	adcl	$0, %ebp
122	addl	%esi, %eax
123
124	adcl	%edi, %edx
125	movl	%eax, 4(%ecx)
126
127	adcl	$0, %ebp
128
129	movl	%edx, 8(%ecx)
130	movl	%ebp, 12(%ecx)
131
132	popl	%edi
133	popl	%esi
134
135	popl	%ebp
136	popl	%ebx
137
138	ret
139
140
141C -----------------------------------------------------------------------------
142	ALIGN(8)
143L(three_or_more):
144deflit(`FRAME',0)
145	C eax	src
146	C ebx
147	C ecx	dst
148	C edx	size
149
150	pushl	%ebx	FRAME_pushl()
151	pushl	%edi	FRAME_pushl()
152
153	pushl	%esi	FRAME_pushl()
154	pushl	%ebp	FRAME_pushl()
155
156	leal	(%ecx,%edx,4), %edi	C &dst[size], end of this mul1
157	leal	(%eax,%edx,4), %esi	C &src[size]
158
159C First multiply src[0]*src[1..size-1] and store at dst[1..size].
160
161	movl	(%eax), %ebp		C src[0], multiplier
162	movl	%edx, %ecx
163
164	negl	%ecx			C -size
165	xorl	%ebx, %ebx		C clear carry limb
166
167	incl	%ecx			C -(size-1)
168
169L(mul1):
170	C eax	scratch
171	C ebx	carry
172	C ecx	counter, limbs, negative
173	C edx	scratch
174	C esi	&src[size]
175	C edi	&dst[size]
176	C ebp	multiplier
177
178	movl	(%esi,%ecx,4), %eax
179	mull	%ebp
180	addl	%eax, %ebx
181	adcl	$0, %edx
182	movl	%ebx, (%edi,%ecx,4)
183	movl	%edx, %ebx
184	incl	%ecx
185	jnz	L(mul1)
186
187	movl	%ebx, (%edi)
188
189
190	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
191	C n=1..size-2.
192	C
193	C The last products src[size-2]*src[size-1], which is the end corner
194	C of the product triangle, is handled separately at the end to save
195	C looping overhead.  If size is 3 then it's only this that needs to
196	C be done.
197	C
198	C In the outer loop %esi is a constant, and %edi just advances by 1
199	C limb each time.  The size of the operation decreases by 1 limb
200	C each time.
201
202	C eax
203	C ebx	carry (needing carry flag added)
204	C ecx
205	C edx
206	C esi	&src[size]
207	C edi	&dst[size]
208	C ebp
209
210	movl	PARAM_SIZE, %ecx
211	subl	$3, %ecx
212	jz	L(corner)
213
214	negl	%ecx
215
216dnl  re-use parameter space
217define(VAR_OUTER,`PARAM_DST')
218
219L(outer):
220	C eax
221	C ebx
222	C ecx
223	C edx	outer loop counter, -(size-3) to -1
224	C esi	&src[size]
225	C edi	dst, pointing at stored carry limb of previous loop
226	C ebp
227
228	movl	%ecx, VAR_OUTER
229	addl	$4, %edi		C advance dst end
230
231	movl	-8(%esi,%ecx,4), %ebp	C next multiplier
232	subl	$1, %ecx
233
234	xorl	%ebx, %ebx		C initial carry limb
235
236L(inner):
237	C eax	scratch
238	C ebx	carry (needing carry flag added)
239	C ecx	counter, -n-1 to -1
240	C edx	scratch
241	C esi	&src[size]
242	C edi	dst end of this addmul
243	C ebp	multiplier
244
245	movl	(%esi,%ecx,4), %eax
246	mull	%ebp
247	addl	%ebx, %eax
248	adcl	$0, %edx
249	addl	%eax, (%edi,%ecx,4)
250	adcl	$0, %edx
251	movl	%edx, %ebx
252	addl	$1, %ecx
253	jl	L(inner)
254
255
256	movl	%ebx, (%edi)
257	movl	VAR_OUTER, %ecx
258	incl	%ecx
259	jnz	L(outer)
260
261
262L(corner):
263	C esi	&src[size]
264	C edi	&dst[2*size-3]
265
266	movl	-4(%esi), %eax
267	mull	-8(%esi)		C src[size-1]*src[size-2]
268	addl	%eax, 0(%edi)
269	adcl	$0, %edx
270	movl	%edx, 4(%edi)		C dst high limb
271
272
273C -----------------------------------------------------------------------------
274C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
275
276	movl	PARAM_SIZE, %eax
277	negl	%eax
278	addl	$1, %eax		C -(size-1) and clear carry
279
280L(lshift):
281	C eax	counter, negative
282	C ebx	next limb
283	C ecx
284	C edx
285	C esi
286	C edi	&dst[2*size-4]
287	C ebp
288
289	rcll	8(%edi,%eax,8)
290	rcll	12(%edi,%eax,8)
291	incl	%eax
292	jnz	L(lshift)
293
294
295	adcl	%eax, %eax		C high bit out
296	movl	%eax, 8(%edi)		C dst most significant limb
297
298
299C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
300C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
301C low limb of src[0]^2.
302
303	movl	PARAM_SRC, %esi
304	movl	(%esi), %eax		C src[0]
305	mull	%eax			C src[0]^2
306
307	movl	PARAM_SIZE, %ecx
308	leal	(%esi,%ecx,4), %esi	C src end
309
310	negl	%ecx			C -size
311	movl	%edx, %ebx		C initial carry
312
313	movl	%eax, 12(%edi,%ecx,8)	C dst[0]
314	incl	%ecx			C -(size-1)
315
316L(diag):
317	C eax	scratch (low product)
318	C ebx	carry limb
319	C ecx	counter, -(size-1) to -1
320	C edx	scratch (high product)
321	C esi	&src[size]
322	C edi	&dst[2*size-3]
323	C ebp	scratch (fetched dst limbs)
324
325	movl	(%esi,%ecx,4), %eax
326	mull	%eax
327
328	addl	%ebx, 8(%edi,%ecx,8)
329	movl	%edx, %ebx
330
331	adcl	%eax, 12(%edi,%ecx,8)
332	adcl	$0, %ebx
333
334	incl	%ecx
335	jnz	L(diag)
336
337
338	addl	%ebx, 8(%edi)		C dst most significant limb
339
340	popl	%ebp
341	popl	%esi
342
343	popl	%edi
344	popl	%ebx
345
346	ret
347
348EPILOGUE()
349