xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/sqr_basecase.asm (revision 5dd36a3bc8bf2a9dec29ceb6349550414570c447)
dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

31include(`../config.m4')
32
33
34C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
35C     product (measured on the speed difference between 17 and 33 limbs,
36C     which is roughly the Karatsuba recursing range).
37
38
39dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
40dnl  code supports.  This value is used only by the tune program to know
41dnl  what it can go up to.  (An attempt to compile with a bigger value will
42dnl  trigger some m4_assert()s in the code, making the build fail.)
43dnl
44dnl  The value is determined by requiring the displacements in the unrolled
45dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
46dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.
47
48deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
49
50
51dnl  Allow a value from the tune program to override config.m4.
52
53ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
54`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
55
56
57dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
58dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
59dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
60dnl
61dnl  The first addmul is the biggest, and this takes the second least
62dnl  significant limb and multiplies it by the third least significant and
63dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
64dnl  limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
65
66m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
67deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
68
69
70C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
71C
72C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
73C lot of function call overheads are avoided, especially when the given size
74C is small.
75C
76C The code size might look a bit excessive, but not all of it is executed
77C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
78C clearly apply only to those sizes; mid sizes like 10x10 only need part of
79C the unrolled addmul; and big sizes like 35x35 that do need all of it will
80C at least be getting value for money, because 35x35 spends something like
81C 5780 cycles here.
82C
83C Different values of UNROLL_COUNT give slightly different speeds, between
84C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
85C This isn't a big difference, but it's presumably some alignment effect
86C which if understood could give a simple speedup.
87
88defframe(PARAM_SIZE,12)
89defframe(PARAM_SRC, 8)
90defframe(PARAM_DST, 4)
91
92	TEXT
93	ALIGN(32)
94PROLOGUE(mpn_sqr_basecase)
95deflit(`FRAME',0)
96
97	movl	PARAM_SIZE, %ecx
98	movl	PARAM_SRC, %eax
99
100	cmpl	$2, %ecx
101	je	L(two_limbs)
102
103	movl	PARAM_DST, %edx
104	ja	L(three_or_more)
105
106
107C -----------------------------------------------------------------------------
108C one limb only
109	C eax	src
110	C ebx
111	C ecx	size
112	C edx	dst
113
114	movl	(%eax), %eax
115	movl	%edx, %ecx
116
117	mull	%eax
118
119	movl	%eax, (%ecx)
120	movl	%edx, 4(%ecx)
121	ret
122
123
124C -----------------------------------------------------------------------------
125	ALIGN(16)
126L(two_limbs):
127	C eax	src
128	C ebx
129	C ecx	size
130	C edx	dst
131
132	pushl	%ebx
133	movl	%eax, %ebx	C src
134deflit(`FRAME',4)
135
136	movl	(%ebx), %eax
137	movl	PARAM_DST, %ecx
138
139	mull	%eax		C src[0]^2
140
141	movl	%eax, (%ecx)
142	movl	4(%ebx), %eax
143
144	movl	%edx, 4(%ecx)
145
146	mull	%eax		C src[1]^2
147
148	movl	%eax, 8(%ecx)
149	movl	(%ebx), %eax
150
151	movl	%edx, 12(%ecx)
152	movl	4(%ebx), %edx
153
154	mull	%edx		C src[0]*src[1]
155
156	addl	%eax, 4(%ecx)
157
158	adcl	%edx, 8(%ecx)
159	adcl	$0, 12(%ecx)
160
161	popl	%ebx
162	addl	%eax, 4(%ecx)
163
164	adcl	%edx, 8(%ecx)
165	adcl	$0, 12(%ecx)
166
167	ret
168
169
170C -----------------------------------------------------------------------------
171L(three_or_more):
172deflit(`FRAME',0)
173	cmpl	$4, %ecx
174	jae	L(four_or_more)
175
176
177C -----------------------------------------------------------------------------
178C three limbs
179	C eax	src
180	C ecx	size
181	C edx	dst
182
183	pushl	%ebx
184	movl	%eax, %ebx	C src
185
186	movl	(%ebx), %eax
187	movl	%edx, %ecx	C dst
188
189	mull	%eax		C src[0] ^ 2
190
191	movl	%eax, (%ecx)
192	movl	4(%ebx), %eax
193
194	movl	%edx, 4(%ecx)
195	pushl	%esi
196
197	mull	%eax		C src[1] ^ 2
198
199	movl	%eax, 8(%ecx)
200	movl	8(%ebx), %eax
201
202	movl	%edx, 12(%ecx)
203	pushl	%edi
204
205	mull	%eax		C src[2] ^ 2
206
207	movl	%eax, 16(%ecx)
208	movl	(%ebx), %eax
209
210	movl	%edx, 20(%ecx)
211	movl	4(%ebx), %edx
212
213	mull	%edx		C src[0] * src[1]
214
215	movl	%eax, %esi
216	movl	(%ebx), %eax
217
218	movl	%edx, %edi
219	movl	8(%ebx), %edx
220
221	pushl	%ebp
222	xorl	%ebp, %ebp
223
224	mull	%edx		C src[0] * src[2]
225
226	addl	%eax, %edi
227	movl	4(%ebx), %eax
228
229	adcl	%edx, %ebp
230
231	movl	8(%ebx), %edx
232
233	mull	%edx		C src[1] * src[2]
234
235	addl	%eax, %ebp
236
237	adcl	$0, %edx
238
239
240	C eax	will be dst[5]
241	C ebx
242	C ecx	dst
243	C edx	dst[4]
244	C esi	dst[1]
245	C edi	dst[2]
246	C ebp	dst[3]
247
248	xorl	%eax, %eax
249	addl	%esi, %esi
250	adcl	%edi, %edi
251	adcl	%ebp, %ebp
252	adcl	%edx, %edx
253	adcl	$0, %eax
254
255	addl	%esi, 4(%ecx)
256	adcl	%edi, 8(%ecx)
257	adcl	%ebp, 12(%ecx)
258
259	popl	%ebp
260	popl	%edi
261
262	adcl	%edx, 16(%ecx)
263
264	popl	%esi
265	popl	%ebx
266
267	adcl	%eax, 20(%ecx)
268	ASSERT(nc)
269
270	ret
271
272
273C -----------------------------------------------------------------------------
274
275defframe(SAVE_EBX,   -4)
276defframe(SAVE_ESI,   -8)
277defframe(SAVE_EDI,   -12)
278defframe(SAVE_EBP,   -16)
279defframe(VAR_COUNTER,-20)
280defframe(VAR_JMP,    -24)
281deflit(STACK_SPACE, 24)
282
283	ALIGN(16)
284L(four_or_more):
285
286	C eax	src
287	C ebx
288	C ecx	size
289	C edx	dst
290	C esi
291	C edi
292	C ebp
293
294C First multiply src[0]*src[1..size-1] and store at dst[1..size].
295C
296C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
297C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
298C a 5780 cycle operation, which is not surprising since the loop here is 8
299C c/l and mpn_mul_1 is 6.25 c/l.
300
301	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
302
303	movl	%edi, SAVE_EDI
304	leal	4(%edx), %edi
305
306	movl	%ebx, SAVE_EBX
307	leal	4(%eax), %ebx
308
309	movl	%esi, SAVE_ESI
310	xorl	%esi, %esi
311
312	movl	%ebp, SAVE_EBP
313
314	C eax
315	C ebx	src+4
316	C ecx	size
317	C edx
318	C esi
319	C edi	dst+4
320	C ebp
321
322	movl	(%eax), %ebp	C multiplier
323	leal	-1(%ecx), %ecx	C size-1, and pad to a 16 byte boundary
324
325
326	ALIGN(16)
327L(mul_1):
328	C eax	scratch
329	C ebx	src ptr
330	C ecx	counter
331	C edx	scratch
332	C esi	carry
333	C edi	dst ptr
334	C ebp	multiplier
335
336	movl	(%ebx), %eax
337	addl	$4, %ebx
338
339	mull	%ebp
340
341	addl	%esi, %eax
342	movl	$0, %esi
343
344	adcl	%edx, %esi
345
346	movl	%eax, (%edi)
347	addl	$4, %edi
348
349	loop	L(mul_1)
350
351
352C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
353C
354C The last two addmuls, which are the bottom right corner of the product
355C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
356C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
357C cases that need to be done.
358C
359C The unrolled code is the same as mpn_addmul_1(), see that routine for some
360C comments.
361C
362C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
363C
364C VAR_JMP is the computed jump into the unrolled code, stepped by one code
365C chunk each outer loop.
366C
367C K6 doesn't do any branch prediction on indirect jumps, which is good
368C actually because it's a different target each time.  The unrolled addmul
369C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
370C the indirect jump is quickly recovered.
371
372
373dnl  This value is also implicitly encoded in a shift and add.
374dnl
375deflit(CODE_BYTES_PER_LIMB, 15)
376
377dnl  With the unmodified &src[size] and &dst[size] pointers, the
378dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
379dnl  values up to 31.  Above that an offset must be added to them.
380dnl
381deflit(OFFSET,
382ifelse(eval(UNROLL_COUNT>31),1,
383eval((UNROLL_COUNT-31)*4),
3840))
385
386	C eax
387	C ebx	&src[size]
388	C ecx
389	C edx
390	C esi	carry
391	C edi	&dst[size]
392	C ebp
393
394	movl	PARAM_SIZE, %ecx
395	movl	%esi, (%edi)
396
397	subl	$4, %ecx
398	jz	L(corner)
399
400	movl	%ecx, %edx
401ifelse(OFFSET,0,,
402`	subl	$OFFSET, %ebx')
403
404	shll	$4, %ecx
405ifelse(OFFSET,0,,
406`	subl	$OFFSET, %edi')
407
408	negl	%ecx
409
410ifdef(`PIC',`
411	call	L(pic_calc)
412L(here):
413',`
414	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
415')
416	negl	%edx
417
418
419	C The calculated jump mustn't be before the start of the available
420	C code.  This is the limitation UNROLL_COUNT puts on the src operand
421	C size, but checked here using the jump address directly.
422	C
423	ASSERT(ae,`
424	movl_text_address( L(unroll_inner_start), %eax)
425	cmpl	%eax, %ecx
426	')
427
428
429C -----------------------------------------------------------------------------
430	ALIGN(16)
431L(unroll_outer_top):
432	C eax
433	C ebx	&src[size], constant
434	C ecx	VAR_JMP
435	C edx	VAR_COUNTER, limbs, negative
436	C esi	high limb to store
437	C edi	dst ptr, high of last addmul
438	C ebp
439
440	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
441	movl	%edx, VAR_COUNTER
442
443	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand
444
445	mull	%ebp
446
447	testb	$1, %cl
448
449	movl	%edx, %esi	C high carry
450	movl	%ecx, %edx	C jump
451
452	movl	%eax, %ecx	C low carry
453	leal	CODE_BYTES_PER_LIMB(%edx), %edx
454
455	movl	%edx, VAR_JMP
456	leal	4(%edi), %edi
457
458	C A branch-free version of this using some xors was found to be a
459	C touch slower than just a conditional jump, despite the jump
460	C switching between taken and not taken on every loop.
461
462ifelse(eval(UNROLL_COUNT%2),0,
463	jz,jnz)	L(unroll_noswap)
464	movl	%esi, %eax	C high,low carry other way around
465
466	movl	%ecx, %esi
467	movl	%eax, %ecx
468L(unroll_noswap):
469
470	jmp	*%edx
471
472
473	C Must be on an even address here so the low bit of the jump address
474	C will indicate which way around ecx/esi should start.
475	C
476	C An attempt was made at padding here to get the end of the unrolled
477	C code to come out on a good alignment, to save padding before
478	C L(corner).  This worked, but turned out to run slower than just an
479	C ALIGN(2).  The reason for this is not clear, it might be related
480	C to the different speeds on different UNROLL_COUNTs noted above.
481
482	ALIGN(2)
483
484L(unroll_inner_start):
485	C eax	scratch
486	C ebx	src
487	C ecx	carry low
488	C edx	scratch
489	C esi	carry high
490	C edi	dst
491	C ebp	multiplier
492	C
493	C 15 code bytes each limb
494	C ecx/esi swapped on each chunk
495
496forloop(`i', UNROLL_COUNT, 1, `
497	deflit(`disp_src', eval(-i*4 + OFFSET))
498	deflit(`disp_dst', eval(disp_src - 4))
499
500	m4_assert(`disp_src>=-128 && disp_src<128')
501	m4_assert(`disp_dst>=-128 && disp_dst<128')
502
503ifelse(eval(i%2),0,`
504Zdisp(	movl,	disp_src,(%ebx), %eax)
505	mull	%ebp
506Zdisp(	addl,	%esi, disp_dst,(%edi))
507	adcl	%eax, %ecx
508	movl	%edx, %esi
509	jadcl0( %esi)
510',`
511	dnl  this one comes out last
512Zdisp(	movl,	disp_src,(%ebx), %eax)
513	mull	%ebp
514Zdisp(	addl,	%ecx, disp_dst,(%edi))
515	adcl	%eax, %esi
516	movl	%edx, %ecx
517	jadcl0( %ecx)
518')
519')
520L(unroll_inner_end):
521
522	addl	%esi, -4+OFFSET(%edi)
523
524	movl	VAR_COUNTER, %edx
525	jadcl0(	%ecx)
526
527	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)
528	movl	VAR_JMP, %ecx
529
530	incl	%edx
531	jnz	L(unroll_outer_top)
532
533
534ifelse(OFFSET,0,,`
535	addl	$OFFSET, %ebx
536	addl	$OFFSET, %edi
537')
538
539
540C -----------------------------------------------------------------------------
541	ALIGN(16)
542L(corner):
543	C ebx	&src[size]
544	C edi	&dst[2*size-5]
545
546	movl	-12(%ebx), %ebp
547
548	movl	-8(%ebx), %eax
549	movl	%eax, %ecx
550
551	mull	%ebp
552
553	addl	%eax, -4(%edi)
554	adcl	$0, %edx
555
556	movl	-4(%ebx), %eax
557	movl	%edx, %esi
558	movl	%eax, %ebx
559
560	mull	%ebp
561
562	addl	%esi, %eax
563	adcl	$0, %edx
564
565	addl	%eax, (%edi)
566	adcl	$0, %edx
567
568	movl	%edx, %esi
569	movl	%ebx, %eax
570
571	mull	%ecx
572
573	addl	%esi, %eax
574	movl	%eax, 4(%edi)
575
576	adcl	$0, %edx
577
578	movl	%edx, 8(%edi)
579
580
581C -----------------------------------------------------------------------------
582C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
583C The loop measures about 6 cycles/iteration, though it looks like it should
584C decode in 5.
585
586L(lshift_start):
587	movl	PARAM_SIZE, %ecx
588
589	movl	PARAM_DST, %edi
590	subl	$1, %ecx		C size-1 and clear carry
591
592	movl	PARAM_SRC, %ebx
593	movl	%ecx, %edx
594
595	xorl	%eax, %eax		C ready for adcl
596
597
598	ALIGN(16)
599L(lshift):
600	C eax
601	C ebx	src (for later use)
602	C ecx	counter, decrementing
603	C edx	size-1 (for later use)
604	C esi
605	C edi	dst, incrementing
606	C ebp
607
608	rcll	4(%edi)
609	rcll	8(%edi)
610	leal	8(%edi), %edi
611	loop	L(lshift)
612
613
614	adcl	%eax, %eax
615
616	movl	%eax, 4(%edi)		C dst most significant limb
617	movl	(%ebx), %eax		C src[0]
618
619	leal	4(%ebx,%edx,4), %ebx	C &src[size]
620	subl	%edx, %ecx		C -(size-1)
621
622
623C -----------------------------------------------------------------------------
624C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
625C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
626C low limb of src[0]^2.
627
628
629	mull	%eax
630
631	movl	%eax, (%edi,%ecx,8)	C dst[0]
632
633
634	ALIGN(16)
635L(diag):
636	C eax	scratch
637	C ebx	&src[size]
638	C ecx	counter, negative
639	C edx	carry
640	C esi	scratch
641	C edi	dst[2*size-2]
642	C ebp
643
644	movl	(%ebx,%ecx,4), %eax
645	movl	%edx, %esi
646
647	mull	%eax
648
649	addl	%esi, 4(%edi,%ecx,8)
650	adcl	%eax, 8(%edi,%ecx,8)
651	adcl	$0, %edx
652
653	incl	%ecx
654	jnz	L(diag)
655
656
657	movl	SAVE_EBX, %ebx
658	movl	SAVE_ESI, %esi
659
660	addl	%edx, 4(%edi)		C dst most significant limb
661
662	movl	SAVE_EDI, %edi
663	movl	SAVE_EBP, %ebp
664	addl	$FRAME, %esp
665	ret
666
667
668
669C -----------------------------------------------------------------------------
670ifdef(`PIC',`
671L(pic_calc):
672	C See mpn/x86/README about old gas bugs
673	addl	(%esp), %ecx
674	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
675	addl	%edx, %ecx
676	ret_internal
677')
678
679
680EPILOGUE()
681