xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium/sqr_basecase.asm (revision a24efa7dea9f1f56c3bdb15a927d3516792ace1c)
1dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
2
3dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
24C product at around 20x20 limbs.
25
26
27C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
28C
29C Calculate src,size squared, storing the result in dst,2*size.
30C
31C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
32C lot of function call overheads are avoided, especially when the size is
33C small.
34
35defframe(PARAM_SIZE,12)
36defframe(PARAM_SRC, 8)
37defframe(PARAM_DST, 4)
38
39	TEXT
40	ALIGN(8)
41PROLOGUE(mpn_sqr_basecase)
42deflit(`FRAME',0)
43
44	movl	PARAM_SIZE, %edx
45	movl	PARAM_SRC, %eax
46
47	cmpl	$2, %edx
48	movl	PARAM_DST, %ecx
49
50	je	L(two_limbs)
51
52	movl	(%eax), %eax
53	ja	L(three_or_more)
54
55C -----------------------------------------------------------------------------
56C one limb only
57	C eax	src
58	C ebx
59	C ecx	dst
60	C edx
61
62	mull	%eax
63
64	movl	%eax, (%ecx)
65	movl	%edx, 4(%ecx)
66
67	ret
68
69C -----------------------------------------------------------------------------
70	ALIGN(8)
71L(two_limbs):
72	C eax	src
73	C ebx
74	C ecx	dst
75	C edx	size
76
77	pushl	%ebp
78	pushl	%edi
79
80	pushl	%esi
81	pushl	%ebx
82
83	movl	%eax, %ebx
84	movl	(%eax), %eax
85
86	mull	%eax		C src[0]^2
87
88	movl	%eax, (%ecx)	C dst[0]
89	movl	%edx, %esi	C dst[1]
90
91	movl	4(%ebx), %eax
92
93	mull	%eax		C src[1]^2
94
95	movl	%eax, %edi	C dst[2]
96	movl	%edx, %ebp	C dst[3]
97
98	movl	(%ebx), %eax
99
100	mull	4(%ebx)		C src[0]*src[1]
101
102	addl	%eax, %esi
103	popl	%ebx
104
105	adcl	%edx, %edi
106
107	adcl	$0, %ebp
108	addl	%esi, %eax
109
110	adcl	%edi, %edx
111	movl	%eax, 4(%ecx)
112
113	adcl	$0, %ebp
114	popl	%esi
115
116	movl	%edx, 8(%ecx)
117	movl	%ebp, 12(%ecx)
118
119	popl	%edi
120	popl	%ebp
121
122	ret
123
124
125C -----------------------------------------------------------------------------
126	ALIGN(8)
127L(three_or_more):
128	C eax	src low limb
129	C ebx
130	C ecx	dst
131	C edx	size
132
133	cmpl	$4, %edx
134	pushl	%ebx
135deflit(`FRAME',4)
136
137	movl	PARAM_SRC, %ebx
138	jae	L(four_or_more)
139
140
141C -----------------------------------------------------------------------------
142C three limbs
143	C eax	src low limb
144	C ebx	src
145	C ecx	dst
146	C edx	size
147
148	pushl	%ebp
149	pushl	%edi
150
151	mull	%eax		C src[0] ^ 2
152
153	movl	%eax, (%ecx)
154	movl	%edx, 4(%ecx)
155
156	movl	4(%ebx), %eax
157	xorl	%ebp, %ebp
158
159	mull	%eax		C src[1] ^ 2
160
161	movl	%eax, 8(%ecx)
162	movl	%edx, 12(%ecx)
163
164	movl	8(%ebx), %eax
165	pushl	%esi		C risk of cache bank clash
166
167	mull	%eax		C src[2] ^ 2
168
169	movl	%eax, 16(%ecx)
170	movl	%edx, 20(%ecx)
171
172	movl	(%ebx), %eax
173
174	mull	4(%ebx)		C src[0] * src[1]
175
176	movl	%eax, %esi
177	movl	%edx, %edi
178
179	movl	(%ebx), %eax
180
181	mull	8(%ebx)		C src[0] * src[2]
182
183	addl	%eax, %edi
184	movl	%edx, %ebp
185
186	adcl	$0, %ebp
187	movl	4(%ebx), %eax
188
189	mull	8(%ebx)		C src[1] * src[2]
190
191	xorl	%ebx, %ebx
192	addl	%eax, %ebp
193
194	C eax
195	C ebx	zero, will be dst[5]
196	C ecx	dst
197	C edx	dst[4]
198	C esi	dst[1]
199	C edi	dst[2]
200	C ebp	dst[3]
201
202	adcl	$0, %edx
203	addl	%esi, %esi
204
205	adcl	%edi, %edi
206
207	adcl	%ebp, %ebp
208
209	adcl	%edx, %edx
210	movl	4(%ecx), %eax
211
212	adcl	$0, %ebx
213	addl	%esi, %eax
214
215	movl	%eax, 4(%ecx)
216	movl	8(%ecx), %eax
217
218	adcl	%edi, %eax
219	movl	12(%ecx), %esi
220
221	adcl	%ebp, %esi
222	movl	16(%ecx), %edi
223
224	movl	%eax, 8(%ecx)
225	movl	%esi, 12(%ecx)
226
227	adcl	%edx, %edi
228	popl	%esi
229
230	movl	20(%ecx), %eax
231	movl	%edi, 16(%ecx)
232
233	popl	%edi
234	popl	%ebp
235
236	adcl	%ebx, %eax	C no carry out of this
237	popl	%ebx
238
239	movl	%eax, 20(%ecx)
240
241	ret
242
243
244C -----------------------------------------------------------------------------
245	ALIGN(8)
246L(four_or_more):
247	C eax	src low limb
248	C ebx	src
249	C ecx	dst
250	C edx	size
251	C esi
252	C edi
253	C ebp
254	C
255	C First multiply src[0]*src[1..size-1] and store at dst[1..size].
256
257deflit(`FRAME',4)
258
259	pushl	%edi
260FRAME_pushl()
261	pushl	%esi
262FRAME_pushl()
263
264	pushl	%ebp
265FRAME_pushl()
266	leal	(%ecx,%edx,4), %edi	C dst end of this mul1
267
268	leal	(%ebx,%edx,4), %esi	C src end
269	movl	%ebx, %ebp		C src
270
271	negl	%edx			C -size
272	xorl	%ebx, %ebx		C clear carry limb and carry flag
273
274	leal	1(%edx), %ecx		C -(size-1)
275
276L(mul1):
277	C eax	scratch
278	C ebx	carry
279	C ecx	counter, negative
280	C edx	scratch
281	C esi	&src[size]
282	C edi	&dst[size]
283	C ebp	src
284
285	adcl	$0, %ebx
286	movl	(%esi,%ecx,4), %eax
287
288	mull	(%ebp)
289
290	addl	%eax, %ebx
291
292	movl	%ebx, (%edi,%ecx,4)
293	incl	%ecx
294
295	movl	%edx, %ebx
296	jnz	L(mul1)
297
298
299	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
300	C n=1..size-2.
301	C
302	C The last two products, which are the end corner of the product
303	C triangle, are handled separately to save looping overhead.  These
304	C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
305	C If size is 4 then it's only these that need to be done.
306	C
307	C In the outer loop %esi is a constant, and %edi just advances by 1
308	C limb each time.  The size of the operation decreases by 1 limb
309	C each time.
310
311	C eax
312	C ebx	carry (needing carry flag added)
313	C ecx
314	C edx
315	C esi	&src[size]
316	C edi	&dst[size]
317	C ebp
318
319	adcl	$0, %ebx
320	movl	PARAM_SIZE, %edx
321
322	movl	%ebx, (%edi)
323	subl	$4, %edx
324
325	negl	%edx
326	jz	L(corner)
327
328
329L(outer):
330	C ebx	previous carry limb to store
331	C edx	outer loop counter (negative)
332	C esi	&src[size]
333	C edi	dst, pointing at stored carry limb of previous loop
334
335	pushl	%edx			C new outer loop counter
336	leal	-2(%edx), %ecx
337
338	movl	%ebx, (%edi)
339	addl	$4, %edi
340
341	addl	$4, %ebp
342	xorl	%ebx, %ebx		C initial carry limb, clear carry flag
343
344L(inner):
345	C eax	scratch
346	C ebx	carry (needing carry flag added)
347	C ecx	counter, negative
348	C edx	scratch
349	C esi	&src[size]
350	C edi	dst end of this addmul
351	C ebp	&src[j]
352
353	adcl	$0, %ebx
354	movl	(%esi,%ecx,4), %eax
355
356	mull	(%ebp)
357
358	addl	%ebx, %eax
359	movl	(%edi,%ecx,4), %ebx
360
361	adcl	$0, %edx
362	addl	%eax, %ebx
363
364	movl	%ebx, (%edi,%ecx,4)
365	incl	%ecx
366
367	movl	%edx, %ebx
368	jnz	L(inner)
369
370
371	adcl	$0, %ebx
372	popl	%edx		C outer loop counter
373
374	incl	%edx
375	jnz	L(outer)
376
377
378	movl	%ebx, (%edi)
379
380L(corner):
381	C esi	&src[size]
382	C edi	&dst[2*size-4]
383
384	movl	-8(%esi), %eax
385	movl	-4(%edi), %ebx		C risk of data cache bank clash here
386
387	mull	-12(%esi)		C src[size-2]*src[size-3]
388
389	addl	%eax, %ebx
390	movl	%edx, %ecx
391
392	adcl	$0, %ecx
393	movl	-4(%esi), %eax
394
395	mull	-12(%esi)		C src[size-1]*src[size-3]
396
397	addl	%ecx, %eax
398	movl	(%edi), %ecx
399
400	adcl	$0, %edx
401	movl	%ebx, -4(%edi)
402
403	addl	%eax, %ecx
404	movl	%edx, %ebx
405
406	adcl	$0, %ebx
407	movl	-4(%esi), %eax
408
409	mull	-8(%esi)		C src[size-1]*src[size-2]
410
411	movl	%ecx, (%edi)
412	addl	%eax, %ebx
413
414	adcl	$0, %edx
415	movl	PARAM_SIZE, %eax
416
417	negl	%eax
418	movl	%ebx, 4(%edi)
419
420	addl	$1, %eax		C -(size-1) and clear carry
421	movl	%edx, 8(%edi)
422
423
424C -----------------------------------------------------------------------------
425C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
426
427L(lshift):
428	C eax	counter, negative
429	C ebx	next limb
430	C ecx
431	C edx
432	C esi
433	C edi	&dst[2*size-4]
434	C ebp
435
436	movl	12(%edi,%eax,8), %ebx
437
438	rcll	%ebx
439	movl	16(%edi,%eax,8), %ecx
440
441	rcll	%ecx
442	movl	%ebx, 12(%edi,%eax,8)
443
444	movl	%ecx, 16(%edi,%eax,8)
445	incl	%eax
446
447	jnz	L(lshift)
448
449
450	adcl	%eax, %eax		C high bit out
451	movl	PARAM_SRC, %esi
452
453	movl	PARAM_SIZE, %ecx	C risk of cache bank clash
454	movl	%eax, 12(%edi)		C dst most significant limb
455
456
457C -----------------------------------------------------------------------------
458C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
459C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
460C low limb of src[0]^2.
461
462	movl	(%esi), %eax		C src[0]
463	leal	(%esi,%ecx,4), %esi	C src end
464
465	negl	%ecx
466
467	mull	%eax
468
469	movl	%eax, 16(%edi,%ecx,8)	C dst[0]
470	movl	%edx, %ebx
471
472	addl	$1, %ecx		C size-1 and clear carry
473
474L(diag):
475	C eax	scratch (low product)
476	C ebx	carry limb
477	C ecx	counter, negative
478	C edx	scratch (high product)
479	C esi	&src[size]
480	C edi	&dst[2*size-4]
481	C ebp	scratch (fetched dst limbs)
482
483	movl	(%esi,%ecx,4), %eax
484	adcl	$0, %ebx
485
486	mull	%eax
487
488	movl	16-4(%edi,%ecx,8), %ebp
489
490	addl	%ebp, %ebx
491	movl	16(%edi,%ecx,8), %ebp
492
493	adcl	%eax, %ebp
494	movl	%ebx, 16-4(%edi,%ecx,8)
495
496	movl	%ebp, 16(%edi,%ecx,8)
497	incl	%ecx
498
499	movl	%edx, %ebx
500	jnz	L(diag)
501
502
503	adcl	$0, %edx
504	movl	16-4(%edi), %eax	C dst most significant limb
505
506	addl	%eax, %edx
507	popl	%ebp
508
509	movl	%edx, 16-4(%edi)
510	popl	%esi		C risk of cache bank clash
511
512	popl	%edi
513	popl	%ebx
514
515	ret
516
517EPILOGUE()
518