dnl  Intel P6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
C     product (measured on the speed difference between 20 and 40 limbs,
C     which is the Karatsuba recursing range).


dnl  These are the same as in mpn/x86/k6/sqr_basecase.asm; see that file for
dnl  a description.  The only difference here is that UNROLL_COUNT can go up
dnl  to 64 (not 63), making SQR_TOOM2_THRESHOLD_MAX 67.

deflit(SQR_TOOM2_THRESHOLD_MAX, 67)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overhead is avoided, especially when the given size
C is small.
C
C The code size might look a bit excessive, but not all of it is executed,
C so it won't all get into the code cache.  The 1x1, 2x2 and 3x3 special
C cases clearly apply only to those sizes; mid sizes like 10x10 only need
C part of the unrolled addmul; and big sizes like 40x40 that do use the
C full unrolling will at least be making good use of it, because 40x40
C will take something like 7000 cycles.
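C
C A hedged reference sketch of the same scheme in C, for orientation only
C (not part of this file; assumes 32-bit limbs, size >= 2, invented names):
C
C	#include <stdint.h>
C
C	void ref_sqr_basecase (uint32_t *dst, const uint32_t *src, int n)
C	{
C	  int i, j;
C	  uint64_t t;
C	  uint32_t cy;
C
C	  /* mul_1: dst[1..n] = src[0] * src[1..n-1] */
C	  for (cy = 0, j = 1; j < n; j++)
C	    {
C	      t = (uint64_t) src[0] * src[j] + cy;
C	      dst[j] = (uint32_t) t;
C	      cy = (uint32_t) (t >> 32);
C	    }
C	  dst[n] = cy;
C
C	  /* addmul: src[i]*src[i+1..n-1] added at dst[2*i+1..], the part
C	     done by the unrolled code and the corner below */
C	  for (i = 1; i <= n-2; i++)
C	    {
C	      for (cy = 0, j = i+1; j < n; j++)
C	        {
C	          t = (uint64_t) src[i] * src[j] + dst[i+j] + cy;
C	          dst[i+j] = (uint32_t) t;
C	          cy = (uint32_t) (t >> 32);
C	        }
C	      dst[i+n] = cy;
C	    }
C
C	  /* lshift: double dst[1..2n-2], bit out to dst[2n-1] */
C	  for (cy = 0, i = 1; i <= 2*n-2; i++)
C	    {
C	      uint32_t hi = dst[i] >> 31;
C	      dst[i] = (dst[i] << 1) | cy;
C	      cy = hi;
C	    }
C	  dst[2*n-1] = cy;
C
C	  /* diagonal: add src[i]^2 at dst[2*i],dst[2*i+1]; the asm stores
C	     low(src[0]^2) to dst[0] directly instead of zeroing first */
C	  dst[0] = 0;
C	  for (cy = 0, i = 0; i < n; i++)
C	    {
C	      uint64_t lo, hi;
C	      t = (uint64_t) src[i] * src[i];
C	      lo = (uint64_t) dst[2*i] + (uint32_t) t + cy;
C	      dst[2*i] = (uint32_t) lo;
C	      hi = (uint64_t) dst[2*i+1] + (uint32_t) (t >> 32)
C	           + (uint32_t) (lo >> 32);
C	      dst[2*i+1] = (uint32_t) hi;
C	      cy = (uint32_t) (hi >> 32);
C	    }
C	}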

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %edx

	movl	PARAM_SRC, %eax

	cmpl	$2, %edx
	movl	PARAM_DST, %ecx
	je	L(two_limbs)

	movl	(%eax), %eax
	ja	L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src limb
	C ebx
	C ecx	dst
	C edx

	mull	%eax

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)

	ret


C -----------------------------------------------------------------------------
L(two_limbs):
	C eax	src
	C ebx
	C ecx	dst
	C edx
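	C
	C With B = 2^32 and src = {a, b}, the result is
	C
	C	(a + b*B)^2 = a^2 + 2*a*b*B + b^2*B^2
	C
	C so the code forms a^2, b^2 and a*b with three mulls, then adds
	C the a*b cross product into the middle limbs twice rather than
	C shifting it left first.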

defframe(SAVE_ESI, -4)
defframe(SAVE_EBX, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(`STACK_SPACE',16)

	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	%eax, %esi
	movl	(%eax), %eax

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%esi), %eax

	movl	%ebx, SAVE_EBX
	movl	%edx, %ebx	C dst[1]

	mull	%eax		C src[1]^2

	movl	%edi, SAVE_EDI
	movl	%eax, %edi	C dst[2]
	movl	(%esi), %eax

	movl	%ebp, SAVE_EBP
	movl	%edx, %ebp	C dst[3]

	mull	4(%esi)		C src[0]*src[1]

	addl	%eax, %ebx
	movl	SAVE_ESI, %esi

	adcl	%edx, %edi

	adcl	$0, %ebp
	addl	%ebx, %eax
	movl	SAVE_EBX, %ebx

	adcl	%edi, %edx
	movl	SAVE_EDI, %edi

	adcl	$0, %ebp

	movl	%eax, 4(%ecx)

	movl	%ebp, 12(%ecx)
	movl	SAVE_EBP, %ebp

	movl	%edx, 8(%ecx)
	addl	$FRAME, %esp

	ret


C -----------------------------------------------------------------------------
L(three_or_more):
	C eax	src low limb
	C ebx
	C ecx	dst
	C edx	size
deflit(`FRAME',0)

	pushl	%esi	defframe_pushl(`SAVE_ESI')
	cmpl	$4, %edx

	movl	PARAM_SRC, %esi
	jae	L(four_or_more)


C -----------------------------------------------------------------------------
C three limbs

	C eax	src low limb
	C ebx
	C ecx	dst
	C edx
	C esi	src
	C edi
	C ebp

	pushl	%ebp	defframe_pushl(`SAVE_EBP')
	pushl	%edi	defframe_pushl(`SAVE_EDI')

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)

	movl	4(%esi), %eax
	xorl	%ebp, %ebp

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	%edx, 12(%ecx)
	movl	8(%esi), %eax

	pushl	%ebx	defframe_pushl(`SAVE_EBX')

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	%edx, 20(%ecx)

	movl	(%esi), %eax

	mull	4(%esi)		C src[0] * src[1]

	movl	%eax, %ebx
	movl	%edx, %edi

	movl	(%esi), %eax

	mull	8(%esi)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%edx, %ebp

	adcl	$0, %ebp
	movl	4(%esi), %eax

	mull	8(%esi)		C src[1] * src[2]

	xorl	%esi, %esi
	addl	%eax, %ebp

	C eax
	C ebx	dst[1]
	C ecx	dst
	C edx	dst[4]
	C esi	zero, will be dst[5]
	C edi	dst[2]
	C ebp	dst[3]

	adcl	$0, %edx
	addl	%ebx, %ebx

	adcl	%edi, %edi

	adcl	%ebp, %ebp

	adcl	%edx, %edx
	movl	4(%ecx), %eax

	adcl	$0, %esi
	addl	%ebx, %eax

	movl	%eax, 4(%ecx)
	movl	8(%ecx), %eax

	adcl	%edi, %eax
	movl	12(%ecx), %ebx

	adcl	%ebp, %ebx
	movl	16(%ecx), %edi

	movl	%eax, 8(%ecx)
	movl	SAVE_EBP, %ebp

	movl	%ebx, 12(%ecx)
	movl	SAVE_EBX, %ebx

	adcl	%edx, %edi
	movl	20(%ecx), %eax

	movl	%edi, 16(%ecx)
	movl	SAVE_EDI, %edi

	adcl	%esi, %eax	C no carry out of this
	movl	SAVE_ESI, %esi

	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret



C -----------------------------------------------------------------------------
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP,    -24)
deflit(`STACK_SPACE',24)

L(four_or_more):
	C eax	src low limb
	C ebx
	C ecx
	C edx	size
	C esi	src
	C edi
	C ebp
deflit(`FRAME',4)  dnl  %esi already pushed

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
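C
C In C terms the mul_1 loop below is roughly (a sketch with invented
C names; s = &src[size], d = &dst[size-1], m the multiplier src[0], and
C c running from -(size-1) up to -1):
C
C	do {
C	  t = (uint64_t) m * s[c] + cy;   /* mull (%esi,%ecx,4), addl */
C	  d[c+1] = (uint32_t) t;          /* movl %eax, 4(%edi,%ecx,4) */
C	  cy = (uint32_t) (t >> 32);      /* movl $0,%ebx / adcl %edx,%ebx */
C	} while (++c != 0);               /* incl %ecx / jnz */
C	d[1] = cy;                        /* store of the final high limb */
C
C Indexing with a negative count up from -(size-1) lets incl/jnz
C terminate the loop without a separate compare.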

	subl	$STACK_SPACE-FRAME, %esp
deflit(`FRAME',STACK_SPACE)
	movl	$1, %ecx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	subl	%edx, %ecx		C -(size-1)

	movl	%ebp, SAVE_EBP
	movl	$0, %ebx		C initial carry

	leal	(%esi,%edx,4), %esi	C &src[size]
	movl	%eax, %ebp		C multiplier

	leal	-4(%edi,%edx,4), %edi	C &dst[size-1]


C This loop runs at just over 6 c/l.

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, limbs, negative, -(size-1) to -1
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size-1]
	C ebp	multiplier

	movl	%ebp, %eax

	mull	(%esi,%ecx,4)

	addl	%ebx, %eax
	movl	$0, %ebx

	adcl	%edx, %ebx
	movl	%eax, 4(%edi,%ecx,4)

	incl	%ecx
	jnz	L(mul_1)


	movl	%ebx, 4(%edi)


C Addmul src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as mpn_addmul_1(); see that routine for
C some comments.
C
C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
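C
C A worked note on the jump calculation (reconstructed from the code
C below, so treat it as a sketch): with k = size-4, the leal forms
C
C	L(unroll_inner_end) - CODE_BYTES_PER_LIMB*(k+2)
C
C using -k*16 + k = -15*k (the shll $4 plus the leal index), which avoids
C a multiply by the odd chunk size.  CODE_BYTES_PER_LIMB is then added
C before each jmp, so the outer iteration for src[n] enters size-2-n
C chunks before L(unroll_inner_end), covering exactly the products
C src[n]*src[n+2..size-1] that follow the one done at
C L(unroll_outer_top) itself.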

dnl  This is also hard-coded in the address calculation below.
deflit(CODE_BYTES_PER_LIMB, 15)

dnl  With &src[size] and &dst[size-1] pointers, the displacements in the
dnl  unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
dnl  that an offset must be added to them.
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>32),1,
eval((UNROLL_COUNT-32)*4),
0))
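
dnl  For example, at the maximum SQR_TOOM2_THRESHOLD of 67, UNROLL_COUNT
dnl  is 64 and OFFSET is (64-32)*4 = 128; for UNROLL_COUNT <= 32, OFFSET
dnl  is 0 and the %esi/%edi adjustments below drop out.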

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size-1]
	C ebp

	movl	PARAM_SIZE, %ecx

	subl	$4, %ecx
	jz	L(corner)

	movl	%ecx, %edx
	negl	%ecx

	shll	$4, %ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')
	negl	%edx

ifelse(OFFSET,0,,`subl	$OFFSET, %edi')

	C The calculated jump mustn't be before the start of the available
	C code.  This is the limit that UNROLL_COUNT puts on the src operand
	C size, but checked here using the jump address directly.

	ASSERT(ae,
	`movl_text_address( L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, second highest limb of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER

	movl	-8+OFFSET(%esi,%edx,4), %eax	C first limb of multiplicand

	mull	%ebp

define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')

	testb	$1, %cl

	movl	%edx, %ebx	C high carry
	leal	4(%edi), %edi

	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx

	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse
	movl	%edx, VAR_JMP
	jmp	*%edx


	C Must be on an even address here so the low bit of the jump address
	C will indicate which way around ecx/ebx should start.

	ALIGN(2)

L(unroll_inner_start):
	C eax	scratch
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src pointer
	C edi	dst pointer
	C ebp	multiplier
	C
	C 15 code bytes each limb
	C ecx/ebx reversed on each chunk

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%esi), %eax)
	mull	%ebp
Zdisp(	addl,	%ebx, disp_dst,(%edi))
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
',`
	dnl  this one comes out last
Zdisp(	movl,	disp_src,(%esi), %eax)
	mull	%ebp
Zdisp(	addl,	%ecx, disp_dst,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx
')
')
L(unroll_inner_end):

	addl	%ebx, m4_empty_if_zero(OFFSET)(%edi)

	movl	VAR_COUNTER, %edx
	adcl	$0, %ecx

	movl	%ecx, m4_empty_if_zero(OFFSET+4)(%edi)
	movl	VAR_JMP, %ecx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(corner):
	C eax
	C ebx
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[2*size-5]
	C ebp

	movl	-12(%esi), %eax

	mull	-8(%esi)

	addl	%eax, (%edi)
	movl	-12(%esi), %eax
	movl	$0, %ebx

	adcl	%edx, %ebx

	mull	-4(%esi)

	addl	%eax, %ebx
	movl	-8(%esi), %eax

	adcl	$0, %edx

	addl	%ebx, 4(%edi)
	movl	$0, %ebx

	adcl	%edx, %ebx

	mull	-4(%esi)

	movl	PARAM_SIZE, %ecx
	addl	%ebx, %eax

	adcl	$0, %edx

	movl	%eax, 8(%edi)

	movl	%edx, 12(%edi)
	movl	PARAM_DST, %edi


C Left shift of dst[1..2*size-2]; the bit shifted out becomes dst[2*size-1].
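C
C In C terms this shift is roughly (a sketch; the loop below does two
C limbs per iteration with rcll, which rotates through the carry flag):
C
C	cy = 0;                            /* cleared by the xorl */
C	for (i = 1; i <= 2*size-2; i++)
C	  {
C	    uint32_t out = dst[i] >> 31;
C	    dst[i] = (dst[i] << 1) | cy;
C	    cy = out;
C	  }
C	dst[2*size-1] = cy;                /* the final adcl %eax, %eax */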

	subl	$1, %ecx		C size-1
	xorl	%eax, %eax		C ready for final adcl, and clear carry

	movl	%ecx, %edx
	movl	PARAM_SRC, %esi


L(lshift):
	C eax
	C ebx
	C ecx	counter, size-1 to 1
	C edx	size-1 (for later use)
	C esi	src (for later use)
	C edi	dst, incrementing
	C ebp

	rcll	4(%edi)
	rcll	8(%edi)

	leal	8(%edi), %edi
	decl	%ecx
	jnz	L(lshift)


	adcl	%eax, %eax

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%esi), %eax		C src[0]

	leal	4(%esi,%edx,4), %esi	C &src[size]
	subl	%edx, %ecx		C -(size-1)


C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
C low limb of src[0]^2.
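C
C In C terms this phase is roughly (a sketch; uint64_t t, s and uint32_t
C hi, with hi living in %edx):
C
C	t = (uint64_t) src[0] * src[0];
C	dst[0] = (uint32_t) t;            /* stored directly, no add */
C	hi = (uint32_t) (t >> 32);
C	for (i = 1; i < size; i++)
C	  {
C	    t = (uint64_t) src[i] * src[i];
C	    s = (uint64_t) dst[2*i-1] + hi;
C	    dst[2*i-1] = (uint32_t) s;
C	    s = (uint64_t) dst[2*i] + (uint32_t) t + (uint32_t) (s >> 32);
C	    dst[2*i] = (uint32_t) s;
C	    hi = (uint32_t) (t >> 32) + (uint32_t) (s >> 32);
C	  }
C	dst[2*size-1] += hi;              /* dst most significant limb */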


	mull	%eax

	movl	%eax, (%edi,%ecx,8)	C dst[0]


L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry
	C esi	&src[size]
	C edi	dst[2*size-2]
	C ebp

	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, 4(%edi)		C dst most significant limb

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp
	ret



C -----------------------------------------------------------------------------
ifdef(`PIC',`
L(pic_calc):
	addl	(%esp), %ecx
	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
	addl	%edx, %ecx
	ret_internal
')


EPILOGUE()