xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/sqr_basecase.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  x86 generic mpn_sqr_basecase -- square an mpn number.
2
3dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
34
35C     cycles/crossproduct  cycles/triangleproduct
36C P5
37C P6
38C K6
39C K7
40C P4
41
42
43C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
44C
45C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
46C lot of function call overheads are avoided, especially when the size is
47C small.
48C
49C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
50C code size to do so here.
51C
52C Enhancements:
53C
54C The addmul loop here is also not unrolled like aorsmul_1.asm and
55C mul_basecase.asm are.  Perhaps it should be done.  It'd add to the
56C complexity, but if it's worth doing in the other places then it should be
57C worthwhile here.
58C
59C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
60C might be worth considering.  That'd add quite a bit to the code size, but
61C only as much as is used would be dragged into L1 cache.
62
63defframe(PARAM_SIZE,12)
64defframe(PARAM_SRC, 8)
65defframe(PARAM_DST, 4)
66
67	TEXT
68	ALIGN(8)
69PROLOGUE(mpn_sqr_basecase)
70deflit(`FRAME',0)
71
C Stack parameters (cdecl, offsets declared with defframe above):
C   PARAM_DST  dst   destination, 2*size limbs
C   PARAM_SRC  src   source, size limbs
C   PARAM_SIZE size  limb count (assumed >= 1)
72	movl	PARAM_SIZE, %edx
73
74	movl	PARAM_SRC, %eax
75
76	cmpl	$2, %edx
77	movl	PARAM_DST, %ecx
78
C size==1 falls through, size==2 and size>=3 get separate code.
79	je	L(two_limbs)
80	ja	L(three_or_more)
81
82
83C -----------------------------------------------------------------------------
84C one limb only
85	C eax	src
86	C ebx
87	C ecx	dst
88	C edx
89
C dst[1],dst[0] = src[0]^2, a single widening multiply.
90	movl	(%eax), %eax
91	mull	%eax
92	movl	%eax, (%ecx)
93	movl	%edx, 4(%ecx)
94	ret
95
96
97C -----------------------------------------------------------------------------
98	ALIGN(8)
99L(two_limbs):
100	C eax	src
101	C ebx
102	C ecx	dst
103	C edx
104
105	pushl	%ebx
106	pushl	%ebp
107
108	movl	%eax, %ebx
109	movl	(%eax), %eax
110
111	mull	%eax		C src[0]^2
112
113	pushl	%esi
114	pushl	%edi
115
116	movl	%edx, %esi	C dst[1]
117	movl	%eax, (%ecx)	C dst[0]
118
119	movl	4(%ebx), %eax
120	mull	%eax		C src[1]^2
121
122	movl	%eax, %edi	C dst[2]
123	movl	%edx, %ebp	C dst[3]
124
125	movl	(%ebx), %eax
126	mull	4(%ebx)		C src[0]*src[1]
127
C The square needs 2*src[0]*src[1] in the middle limbs.  Add the cross
C product edx:eax once into esi:edi:ebp, then a second time while forming
C the final dst[1] and dst[2] values, carries propagating into ebp.
128	addl	%eax, %esi
129
130	adcl	%edx, %edi
131
132	adcl	$0, %ebp
133	addl	%esi, %eax
134
135	adcl	%edi, %edx
136	movl	%eax, 4(%ecx)
137
138	adcl	$0, %ebp
139
140	movl	%edx, 8(%ecx)
141	movl	%ebp, 12(%ecx)
142
143	popl	%edi
144	popl	%esi
145
146	popl	%ebp
147	popl	%ebx
148
149	ret
150
151
152C -----------------------------------------------------------------------------
153	ALIGN(8)
154L(three_or_more):
155deflit(`FRAME',0)
156	C eax	src
157	C ebx
158	C ecx	dst
159	C edx	size
160
C Save the callee-saved registers used below.  FRAME_pushl() bumps FRAME
C so the PARAM_ stack offsets remain correct after each push.
161	pushl	%ebx	FRAME_pushl()
162	pushl	%edi	FRAME_pushl()
163
164	pushl	%esi	FRAME_pushl()
165	pushl	%ebp	FRAME_pushl()
166
167	leal	(%ecx,%edx,4), %edi	C &dst[size], end of this mul1
168	leal	(%eax,%edx,4), %esi	C &src[size]
169
170C First multiply src[0]*src[1..size-1] and store at dst[1..size].
171
C The loops below index negatively up from the end pointers, so the
C counter can serve as both index and loop condition.
172	movl	(%eax), %ebp		C src[0], multiplier
173	movl	%edx, %ecx
174
175	negl	%ecx			C -size
176	xorl	%ebx, %ebx		C clear carry limb
177
178	incl	%ecx			C -(size-1)
179
180L(mul1):
181	C eax	scratch
182	C ebx	carry
183	C ecx	counter, limbs, negative
184	C edx	scratch
185	C esi	&src[size]
186	C edi	&dst[size]
187	C ebp	multiplier
188
189	movl	(%esi,%ecx,4), %eax
190	mull	%ebp
191	addl	%eax, %ebx		C add carry limb from previous round
192	adcl	$0, %edx
193	movl	%ebx, (%edi,%ecx,4)
194	movl	%edx, %ebx		C new carry limb
195	incl	%ecx
196	jnz	L(mul1)
197
198	movl	%ebx, (%edi)		C final carry limb at dst[size]
199
200
201	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
202	C n=1..size-2.
203	C
204	C The last product src[size-2]*src[size-1], which is the end corner
205	C of the product triangle, is handled separately at the end to save
206	C looping overhead.  If size is 3 then it's only this that needs to
207	C be done.
208	C
209	C In the outer loop %esi is a constant, and %edi just advances by 1
210	C limb each time.  The size of the operation decreases by 1 limb
211	C each time.
212
213	C eax
214	C ebx	carry (needing carry flag added)
215	C ecx
216	C edx
217	C esi	&src[size]
218	C edi	&dst[size]
219	C ebp
220
221	movl	PARAM_SIZE, %ecx
222	subl	$3, %ecx
223	jz	L(corner)		C size==3, only the corner remains
224
225	negl	%ecx
226
227dnl  re-use parameter space
228define(VAR_OUTER,`PARAM_DST')
229
230L(outer):
231	C eax
232	C ebx
233	C ecx	outer loop counter, -(size-3) to -1
234	C edx
235	C esi	&src[size]
236	C edi	dst, pointing at stored carry limb of previous loop
237	C ebp
238
239	movl	%ecx, VAR_OUTER
240	addl	$4, %edi		C advance dst end
241
242	movl	-8(%esi,%ecx,4), %ebp	C next multiplier
243	subl	$1, %ecx
244
245	xorl	%ebx, %ebx		C initial carry limb
246
247L(inner):
248	C eax	scratch
249	C ebx	carry (needing carry flag added)
250	C ecx	counter, -n-1 to -1
251	C edx	scratch
252	C esi	&src[size]
253	C edi	dst end of this addmul
254	C ebp	multiplier
255
256	movl	(%esi,%ecx,4), %eax
257	mull	%ebp
258	addl	%ebx, %eax		C add carry limb from previous round
259	adcl	$0, %edx
260	addl	%eax, (%edi,%ecx,4)	C add into existing dst limb
261	adcl	$0, %edx
262	movl	%edx, %ebx		C new carry limb
263	addl	$1, %ecx
264	jl	L(inner)
265
266
267	movl	%ebx, (%edi)		C carry limb above this addmul
268	movl	VAR_OUTER, %ecx
269	incl	%ecx
270	jnz	L(outer)
271
272
273L(corner):
274	C esi	&src[size]
275	C edi	&dst[2*size-3]
276
C This last cross product, like all the others stored so far, appears
C doubled in the square; the doubling is done by the left shift below.
277	movl	-4(%esi), %eax
278	mull	-8(%esi)		C src[size-1]*src[size-2]
279	addl	%eax, 0(%edi)
280	adcl	$0, %edx
281	movl	%edx, 4(%edi)		C dst high limb
282
283
284C -----------------------------------------------------------------------------
285C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
286
287	movl	PARAM_SIZE, %eax
288	negl	%eax
289	addl	$1, %eax		C -(size-1) and clear carry
290
291L(lshift):
292	C eax	counter, negative
293	C ebx
294	C ecx
295	C edx
296	C esi
297	C edi	&dst[2*size-3]
298	C ebp
299
C Two limbs shifted per iteration.  incl doesn't change the carry flag,
C so the bit rotated out of one rcll carries into the next around the
C loop; entry carry is cleared by the addl above.
300	rcll	8(%edi,%eax,8)
301	rcll	12(%edi,%eax,8)
302	incl	%eax
303	jnz	L(lshift)
304
305
C eax is 0 here, so this just picks up the final shifted-out bit.
306	adcl	%eax, %eax		C high bit out
307	movl	%eax, 8(%edi)		C dst most significant limb
308
309
310C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
311C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
312C low limb of src[0]^2.
313
314	movl	PARAM_SRC, %esi
315	movl	(%esi), %eax		C src[0]
316	mull	%eax			C src[0]^2
317
318	movl	PARAM_SIZE, %ecx
319	leal	(%esi,%ecx,4), %esi	C src end
320
321	negl	%ecx			C -size
322	movl	%edx, %ebx		C initial carry
323
324	movl	%eax, 12(%edi,%ecx,8)	C dst[0]
325	incl	%ecx			C -(size-1)
326
327L(diag):
328	C eax	scratch (low product)
329	C ebx	carry limb
330	C ecx	counter, -(size-1) to -1
331	C edx	scratch (high product)
332	C esi	&src[size]
333	C edi	&dst[2*size-3]
334	C ebp
335
336	movl	(%esi,%ecx,4), %eax
337	mull	%eax
338
339	addl	%ebx, 8(%edi,%ecx,8)	C dst[2*i-1] += carry + previous high
340	movl	%edx, %ebx		C save this square's high limb
341
342	adcl	%eax, 12(%edi,%ecx,8)	C dst[2*i] += low limb, with carry
343	adcl	$0, %ebx		C propagate carry into next high
344
345	incl	%ecx
346	jnz	L(diag)
347
348
349	addl	%ebx, 8(%edi)		C dst most significant limb
350
C Restore callee-saved registers, reverse order of the pushes above.
351	popl	%ebp
352	popl	%esi
353
354	popl	%edi
355	popl	%ebx
356
357	ret
358
359EPILOGUE()
360