xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/p6/sqr_basecase.asm (revision c34236556bea94afcaca1782d7d228301edc3ea0)
1dnl  Intel P6 mpn_sqr_basecase -- square an mpn number.
2
3dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
include(`../config.m4')


C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
C     product (measured on the speed difference between 20 and 40 limbs,
C     which is the Karatsuba recursing range).


dnl  These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
dnl  a description.  The only difference here is that UNROLL_COUNT can go up
dnl  to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.

deflit(SQR_TOOM2_THRESHOLD_MAX, 67)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
dnl  The -3 matches the relationship noted above: UNROLL_COUNT of 64
dnl  corresponds to SQR_TOOM2_THRESHOLD_MAX of 67.
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
39
40
41C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
42C
43C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
44C lot of function call overheads are avoided, especially when the given size
45C is small.
46C
47C The code size might look a bit excessive, but not all of it is executed so
48C it won't all get into the code cache.  The 1x1, 2x2 and 3x3 special cases
49C clearly apply only to those sizes; mid sizes like 10x10 only need part of
50C the unrolled addmul; and big sizes like 40x40 that do use the full
C unrolling will at least be making good use of it, because 40x40 will take
52C something like 7000 cycles.
53
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	C Dispatch on size: equal to 2 goes to L(two_limbs), above 2 goes
	C to L(three_or_more), size==1 falls through below.

	movl	PARAM_SIZE, %edx

	movl	PARAM_SRC, %eax

	cmpl	$2, %edx
	movl	PARAM_DST, %ecx
	je	L(two_limbs)

	movl	(%eax), %eax
	ja	L(three_or_more)	C flags still from the cmpl (mov leaves flags intact)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src limb
	C ebx
	C ecx	dst
	C edx

	mull	%eax

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)

	ret


C -----------------------------------------------------------------------------
L(two_limbs):
	C eax	src
	C ebx
	C ecx	dst
	C edx

defframe(SAVE_ESI, -4)
defframe(SAVE_EBX, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(`STACK_SPACE',16)

	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	%eax, %esi
	movl	(%eax), %eax

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%esi), %eax

	movl	%ebx, SAVE_EBX
	movl	%edx, %ebx	C dst[1]

	mull	%eax		C src[1]^2

	movl	%edi, SAVE_EDI
	movl	%eax, %edi	C dst[2]
	movl	(%esi), %eax

	movl	%ebp, SAVE_EBP
	movl	%edx, %ebp	C dst[3]

	mull	4(%esi)		C src[0]*src[1]

	C The cross product src[0]*src[1] must go in twice, so the
	C addl/adcl chain below is applied two times over.

	addl	%eax, %ebx
	movl	SAVE_ESI, %esi

	adcl	%edx, %edi

	adcl	$0, %ebp
	addl	%ebx, %eax
	movl	SAVE_EBX, %ebx

	adcl	%edi, %edx
	movl	SAVE_EDI, %edi

	adcl	$0, %ebp

	movl	%eax, 4(%ecx)

	movl	%ebp, 12(%ecx)
	movl	SAVE_EBP, %ebp

	movl	%edx, 8(%ecx)
	addl	$FRAME, %esp

	ret


C -----------------------------------------------------------------------------
L(three_or_more):
	C eax	src low limb
	C ebx
	C ecx	dst
	C edx	size
deflit(`FRAME',0)

	pushl	%esi	defframe_pushl(`SAVE_ESI')
	cmpl	$4, %edx

	movl	PARAM_SRC, %esi
	jae	L(four_or_more)


C -----------------------------------------------------------------------------
C three limbs

	C eax	src low limb
	C ebx
	C ecx	dst
	C edx
	C esi	src
	C edi
	C ebp

	pushl	%ebp	defframe_pushl(`SAVE_EBP')
	pushl	%edi	defframe_pushl(`SAVE_EDI')

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)

	movl	4(%esi), %eax
	xorl	%ebp, %ebp

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	%edx, 12(%ecx)
	movl	8(%esi), %eax

	pushl	%ebx	defframe_pushl(`SAVE_EBX')

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	%edx, 20(%ecx)

	movl	(%esi), %eax

	mull	4(%esi)		C src[0] * src[1]

	movl	%eax, %ebx
	movl	%edx, %edi

	movl	(%esi), %eax

	mull	8(%esi)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%edx, %ebp

	adcl	$0, %ebp
	movl	4(%esi), %eax

	mull	8(%esi)		C src[1] * src[2]

	xorl	%esi, %esi
	addl	%eax, %ebp

	C eax
	C ebx	dst[1]
	C ecx	dst
	C edx	dst[4]
	C esi	zero, will be dst[5]
	C edi	dst[2]
	C ebp	dst[3]

	C Double the cross products by adding each register to itself,
	C the carry propagating up into esi (the new top limb), then add
	C the doubled values to the squares already stored at dst.

	adcl	$0, %edx
	addl	%ebx, %ebx

	adcl	%edi, %edi

	adcl	%ebp, %ebp

	adcl	%edx, %edx
	movl	4(%ecx), %eax

	adcl	$0, %esi
	addl	%ebx, %eax

	movl	%eax, 4(%ecx)
	movl	8(%ecx), %eax

	adcl	%edi, %eax
	movl	12(%ecx), %ebx

	adcl	%ebp, %ebx
	movl	16(%ecx), %edi

	movl	%eax, 8(%ecx)
	movl	SAVE_EBP, %ebp

	movl	%ebx, 12(%ecx)
	movl	SAVE_EBX, %ebx

	adcl	%edx, %edi
	movl	20(%ecx), %eax

	movl	%edi, 16(%ecx)
	movl	SAVE_EDI, %edi

	adcl	%esi, %eax	C no carry out of this
	movl	SAVE_ESI, %esi

	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret



C -----------------------------------------------------------------------------
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP,    -24)
deflit(`STACK_SPACE',24)

L(four_or_more):
	C eax	src low limb
	C ebx
	C ecx
	C edx	size
	C esi	src
	C edi
	C ebp
deflit(`FRAME',4)  dnl  %esi already pushed

C First multiply src[0]*src[1..size-1] and store at dst[1..size].

	subl	$STACK_SPACE-FRAME, %esp
deflit(`FRAME',STACK_SPACE)
	movl	$1, %ecx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	subl	%edx, %ecx		C -(size-1)

	movl	%ebp, SAVE_EBP
	movl	$0, %ebx		C initial carry

	leal	(%esi,%edx,4), %esi	C &src[size]
	movl	%eax, %ebp		C multiplier

	leal	-4(%edi,%edx,4), %edi	C &dst[size-1]


C This loop runs at just over 6 c/l.

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, limbs, negative, -(size-1) to -1
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size-1]
	C ebp	multiplier

	movl	%ebp, %eax

	mull	(%esi,%ecx,4)

	addl	%ebx, %eax
	movl	$0, %ebx

	adcl	%edx, %ebx
	movl	%eax, 4(%edi,%ecx,4)

	incl	%ecx
	jnz	L(mul_1)


	movl	%ebx, 4(%edi)


C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as mpn_addmul_1(), see that routine for some
C comments.
C
C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.

dnl  This is also hard-coded in the address calculation below.
deflit(CODE_BYTES_PER_LIMB, 15)

dnl  With &src[size] and &dst[size-1] pointers, the displacements in the
dnl  unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
dnl  that an offset must be added to them.
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>32),1,
eval((UNROLL_COUNT-32)*4),
0))

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size-1]
	C ebp

	movl	PARAM_SIZE, %ecx

	subl	$4, %ecx
	jz	L(corner)

	movl	%ecx, %edx
	negl	%ecx

	shll	$4, %ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')
	negl	%edx

ifelse(OFFSET,0,,`subl	$OFFSET, %edi')

	C The calculated jump mustn't be before the start of the available
	C code.  This is the limit that UNROLL_COUNT puts on the src operand
	C size, but checked here using the jump address directly.

	ASSERT(ae,
	`movl_text_address( L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, second highest limb of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER

	movl	-8+OFFSET(%esi,%edx,4), %eax	C first limb of multiplicand

	mull	%ebp

define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')

	testb	$1, %cl

	movl	%edx, %ebx	C high carry
	leal	4(%edi), %edi

	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx

	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse
	movl	%edx, VAR_JMP
	jmp	*%edx


	C Must be on an even address here so the low bit of the jump address
	C will indicate which way around ecx/ebx should start.

	ALIGN(2)

L(unroll_inner_start):
	C eax	scratch
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src pointer
	C edi	dst pointer
	C ebp	multiplier
	C
	C 15 code bytes each limb
	C ecx/ebx reversed on each chunk

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%esi), %eax)
	mull	%ebp
Zdisp(	addl,	%ebx, disp_dst,(%edi))
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
',`
	dnl  this one comes out last
Zdisp(	movl,	disp_src,(%esi), %eax)
	mull	%ebp
Zdisp(	addl,	%ecx, disp_dst,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx
')
')
L(unroll_inner_end):

	addl	%ebx, m4_empty_if_zero(OFFSET)(%edi)

	movl	VAR_COUNTER, %edx
	adcl	$0, %ecx

	movl	%ecx, m4_empty_if_zero(OFFSET+4)(%edi)
	movl	VAR_JMP, %ecx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(corner):
	C eax
	C ebx
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[2*size-5]
	C ebp

	movl	-12(%esi), %eax

	mull	-8(%esi)

	addl	%eax, (%edi)
	movl	-12(%esi), %eax
	movl	$0, %ebx

	adcl	%edx, %ebx

	mull	-4(%esi)

	addl	%eax, %ebx
	movl	-8(%esi), %eax

	adcl	$0, %edx

	addl	%ebx, 4(%edi)
	movl	$0, %ebx

	adcl	%edx, %ebx

	mull	-4(%esi)

	movl	PARAM_SIZE, %ecx
	addl	%ebx, %eax

	adcl	$0, %edx

	movl	%eax, 8(%edi)

	movl	%edx, 12(%edi)
	movl	PARAM_DST, %edi


C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].

	subl	$1, %ecx		C size-1
	xorl	%eax, %eax		C ready for final adcl, and clear carry

	movl	%ecx, %edx
	movl	PARAM_SRC, %esi


L(lshift):
	C eax
	C ebx
	C ecx	counter, size-1 to 1
	C edx	size-1 (for later use)
	C esi	src (for later use)
	C edi	dst, incrementing
	C ebp

	C Two limbs per iteration; rcll brings the previous limb's top bit
	C in through the carry flag and sends its own top bit out the same
	C way.

	rcll	4(%edi)
	rcll	8(%edi)

	leal	8(%edi), %edi
	decl	%ecx
	jnz	L(lshift)


	adcl	%eax, %eax

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%esi), %eax		C src[0]

	leal	4(%esi,%edx,4), %esi	C &src[size]
	subl	%edx, %ecx		C -(size-1)


C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
C low limb of src[0]^2.


	mull	%eax

	movl	%eax, (%edi,%ecx,8)	C dst[0]


L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry
	C esi	&src[size]
	C edi	dst[2*size-2]
	C ebp

	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, 4(%edi)		C dst most significant limb

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp
	ret



C -----------------------------------------------------------------------------
ifdef(`PIC',`
L(pic_calc):
	C (%esp) holds the return address, ie. the address of L(here)
	C above, giving a PC-relative base for the computed jump target.
	addl	(%esp), %ecx
	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
	addl	%edx, %ecx
	ret_internal
')


EPILOGUE()
639