dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
C     (measured on the speed difference between 25 and 50 limbs, which is
C     roughly the Karatsuba recursing range).


dnl  These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
dnl  some comments.

deflit(SQR_TOOM2_THRESHOLD_MAX, 66)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
C which is quite a bit, but is considered good value since squares big
C enough to use most of the code will be spending quite a few cycles in it.
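C
C The overall scheme, as a C-style sketch (illustrative only, not part
C of the build):
C
C	/* dst[0..2*size-1] = square of src[0..size-1] */
C	dst[1..2*size-2]  = sum of cross products src[i]*src[j], i<j
C	dst[1..2*size-2] <<= 1, top bit out to dst[2*size-1]
C	dst[2*i..]       += src[i]^2, for i=0..size-1 (the diagonal)
C
C This is the usual halved schoolbook square: each cross product is
C computed once and doubled, instead of being computed twice.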


defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax
	cmpl	$2, %ecx

	movl	PARAM_DST, %edx
	je	L(two_limbs)
	ja	L(three_or_more)


C------------------------------------------------------------------------------
C one limb only
	C eax	src
	C ecx	size
	C edx	dst

	movl	(%eax), %eax
	movl	%edx, %ecx

	mull	%eax

	movl	%edx, 4(%ecx)
	movl	%eax, (%ecx)
	ret


C------------------------------------------------------------------------------
C
C Using the read/modify/write "add"s seems to be faster than saving and
C restoring registers.  Perhaps the loads for the first set hide under the
C mul latency and the second set gets store-to-load forwarding.
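C
C In C terms this block computes (sketch only, with (hi,lo) standing
C for a double-limb product):
C
C	(dst[1], dst[0]) = src[0] * src[0]
C	(dst[3], dst[2]) = src[1] * src[1]
C	(hi, lo) = src[0] * src[1]
C	dst[1..3] += (hi, lo)		/* cross product, added ... */
C	dst[1..3] += (hi, lo)		/* ... twice to double it   */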
	ALIGN(16)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	size
	C edx	dst
deflit(`FRAME',0)

	pushl	%ebx		FRAME_pushl()
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)	C dst[1]

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)	C dst[2]
	movl	(%ebx), %eax

	movl	%edx, 12(%ecx)	C dst[3]

	mull	4(%ebx)		C src[0]*src[1]

	popl	%ebx

	addl	%eax, 4(%ecx)
	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)

	addl	%eax, 4(%ecx)
	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)

	ret


C------------------------------------------------------------------------------
defframe(SAVE_EBX,  -4)
defframe(SAVE_ESI,  -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(STACK_SPACE, 16)

L(three_or_more):
	subl	$STACK_SPACE, %esp
	cmpl	$4, %ecx
	jae	L(four_or_more)
deflit(`FRAME',STACK_SPACE)


C------------------------------------------------------------------------------
C Three limbs
C
C Writing out the loads and stores separately at the end of this code comes
C out about 10 cycles faster than using adcls to memory.
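C
C As a C-style sketch, this computes (illustrative only):
C
C	dst[1,0] = src[0]^2;  dst[3,2] = src[1]^2;  dst[5,4] = src[2]^2;
C	cross = src[0]*src[1]		/* at limb position 1 */
C	      + src[0]*src[2]		/* at limb position 2 */
C	      + src[1]*src[2];		/* at limb position 3 */
C	dst[1..5] += 2*cross;
C
C The doubling is done in registers, with the adcl %reg,%reg chain
C below, before the cross products are added into dst.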

	C eax	src
	C ecx	size
	C edx	dst

	movl	%ebx, SAVE_EBX
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst
	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax
	movl	%edx, 4(%ecx)

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	8(%ebx), %eax
	movl	%edx, 12(%ecx)

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	(%ebx), %eax
	movl	%edx, 20(%ecx)

	mull	4(%ebx)		C src[0] * src[1]

	movl	%eax, %esi
	movl	(%ebx), %eax
	movl	%edx, %edi

	mull	8(%ebx)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%ebp, SAVE_EBP
	movl	$0, %ebp

	movl	4(%ebx), %eax
	adcl	%edx, %ebp

	mull	8(%ebx)		C src[1] * src[2]

	xorl	%ebx, %ebx
	addl	%eax, %ebp

	adcl	$0, %edx

	C eax
	C ebx	zero, will be dst[5]
	C ecx	dst
	C edx	dst[4]
	C esi	dst[1]
	C edi	dst[2]
	C ebp	dst[3]

	adcl	$0, %edx
	addl	%esi, %esi

	adcl	%edi, %edi
	movl	4(%ecx), %eax

	adcl	%ebp, %ebp

	adcl	%edx, %edx

	adcl	$0, %ebx
	addl	%eax, %esi
	movl	8(%ecx), %eax

	adcl	%eax, %edi
	movl	12(%ecx), %eax
	movl	%esi, 4(%ecx)

	adcl	%eax, %ebp
	movl	16(%ecx), %eax
	movl	%edi, 8(%ecx)

	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi

	adcl	%eax, %edx
	movl	20(%ecx), %eax
	movl	%ebp, 12(%ecx)

	adcl	%ebx, %eax
	ASSERT(nc)
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	movl	%edx, 16(%ecx)
	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret


C------------------------------------------------------------------------------
L(four_or_more):

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C Further products are added in rather than stored.
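C
C A C sketch of this first pass (illustrative only; it mirrors
C mpn_mul_1, with (hi,lo) a double-limb product):
C
C	carry = 0;
C	for (i = 1; i < size; i++)
C	  {
C	    (hi, lo) = src[0] * src[i];
C	    lo += carry;
C	    hi += (lo < carry);		/* carry out of the low add */
C	    dst[i] = lo;
C	    carry = hi;
C	  }
C	dst[size] = carry;		/* the movl %ebx, (%edi) below */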

	C eax	src
	C ebx
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

defframe(`VAR_COUNTER',-20)
defframe(`VAR_JMP',    -24)
deflit(EXTRA_STACK_SPACE, 8)

	movl	%ebx, SAVE_EBX
	movl	%edi, SAVE_EDI
	leal	(%edx,%ecx,4), %edi	C &dst[size]

	movl	%esi, SAVE_ESI
	movl	%ebp, SAVE_EBP
	leal	(%eax,%ecx,4), %esi	C &src[size]

	movl	(%eax), %ebp		C multiplier
	movl	$0, %ebx
	decl	%ecx

	negl	%ecx
	subl	$EXTRA_STACK_SPACE, %esp
FRAME_subl_esp(EXTRA_STACK_SPACE)

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size]
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax

	mull	%ebp

	addl	%ebx, %eax
	movl	%eax, (%edi,%ecx,4)
	movl	$0, %ebx

	adcl	%edx, %ebx
	incl	%ecx
	jnz	L(mul_1)


C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
C
C The last two products, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as in mpn_addmul_1, see that routine for
C some comments.
C
C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K7 does branch prediction on indirect jumps, which is bad since it's a
C different target each time.  There seems no way to avoid this.
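C
C In C terms the passes being set up here look like the following
C sketch (illustrative only; the real code jumps into unrolled chunks
C rather than running an inner loop):
C
C	for (n = 1; n <= size-2; n++)
C	  {
C	    carry = 0;
C	    for (i = n+1; i < size; i++)
C	      {
C	        (hi, lo) = src[n] * src[i];
C	        lo += carry;      hi += (lo < carry);
C	        dst[n+i] += lo;   hi += (dst[n+i] < lo);
C	        carry = hi;
C	      }
C	    dst[n+size] += carry;
C	  }
C
C with the final two passes done by the straight line code at
C L(corner).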

dnl  This value is also hard coded in some shifts and adds
deflit(CODE_BYTES_PER_LIMB, 17)

dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31, but above that an offset must be added to them.

deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))
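dnl  For example, with SQR_TOOM2_THRESHOLD around 50 (see the comments
dnl  at the top of the file), UNROLL_COUNT is 47 and OFFSET comes out
dnl  as (47-31)*4 = 64.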

dnl  Because the last chunk of code is generated differently, a label placed
dnl  at the end doesn't work.  Instead calculate the implied end using the
dnl  start and how many chunks of code there are.

deflit(UNROLL_INNER_END,
`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	movl	%ebx, (%edi)

	subl	$4, %ecx
	jz	L(corner)

	negl	%ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

	movl	%ecx, %edx
	shll	$4, %ecx

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')
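	C With %edx = -(size-4) and %ecx = 16*%edx, this computes
	C %ecx = UNROLL_INNER_END - 2*CODE_BYTES_PER_LIMB + 17*%edx,
	C the 16*%edx+%edx addressing giving the multiply by
	C CODE_BYTES_PER_LIMB = 17.  Each extra limb thus moves the
	C entry point one code chunk further back from the implied end
	C of the unrolled code.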


	C The calculated jump mustn't come out before the start of the
	C available code.  This is the limit UNROLL_COUNT puts on the src
	C operand size, but checked here directly using the jump address.
	ASSERT(ae,
	`movl_text_address(L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')


C------------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, high of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
	movl	-8+OFFSET(%esi,%edx,4), %eax	C first limb of multiplicand

	movl	%edx, VAR_COUNTER

	mull	%ebp

define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')

	testb	$1, %cl
	movl	%edx, %ebx	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse

	leal	CODE_BYTES_PER_LIMB(%edx), %eax
	xorl	%edx, %edx
	leal	4(%edi), %edi

	movl	%eax, VAR_JMP

	jmp	*%eax


ifdef(`PIC',`
L(pic_calc):
	addl	(%esp), %ecx
	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
	addl	%edx, %ecx
	ret_internal
')


	C Must be an even address to preserve the significance of the low
	C bit of the jump address indicating which way around ecx/ebx should
	C start.
	ALIGN(2)

L(unroll_inner_start):
	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ebx

	mull	%ebp

Zdisp(  addl,	%ecx, disp_dst,(%edi))
	movl	$0, %ecx

	adcl	%eax, %ebx

',`
	dnl  this bit comes out last
Zdisp(  movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ecx

	mull	%ebp

Zdisp(	addl,	%ebx, disp_dst,(%edi))

ifelse(forloop_last,0,
`	movl	$0, %ebx')

	adcl	%eax, %ecx
')
')

	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

	adcl	$0, %edx
	addl	%ecx, -4+OFFSET(%edi)
	movl	VAR_JMP, %ecx

	adcl	$0, %edx

	movl	%edx, m4_empty_if_zero(OFFSET) (%edi)
	movl	VAR_COUNTER, %edx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')


C------------------------------------------------------------------------------
L(corner):
	C esi	&src[size]
	C edi	&dst[2*size-5]

	movl	-12(%esi), %ebp
	movl	-8(%esi), %eax
	movl	%eax, %ecx

	mull	%ebp

	addl	%eax, -4(%edi)
	movl	-4(%esi), %eax

	adcl	$0, %edx
	movl	%edx, %ebx
	movl	%eax, %esi

	mull	%ebp

	addl	%ebx, %eax

	adcl	$0, %edx
	addl	%eax, (%edi)
	movl	%esi, %eax

	adcl	$0, %edx
	movl	%edx, %ebx

	mull	%ecx

	addl	%ebx, %eax
	movl	%eax, 4(%edi)

	adcl	$0, %edx
	movl	%edx, 8(%edi)



C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
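C
C A C sketch of the shift (illustrative only, 32-bit limbs assumed):
C
C	carry = 0;
C	for (i = 1; i <= 2*size-2; i++)
C	  {
C	    new_carry = dst[i] >> 31;
C	    dst[i] = (dst[i] << 1) | carry;
C	    carry = new_carry;
C	  }
C	dst[2*size-1] = carry;
C
C The loop below does this with rcll, two limbs per iteration.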

L(lshift_start):
	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edi
	xorl	%ecx, %ecx		C clear carry

	leal	(%edi,%eax,8), %edi
	notl	%eax			C -size-1, preserve carry

	leal	2(%eax), %eax		C -(size-1)

L(lshift):
	C eax	counter, negative
	C ebx
	C ecx
	C edx
	C esi
	C edi	dst, pointing just after last limb
	C ebp

	rcll	-4(%edi,%eax,8)
	rcll	(%edi,%eax,8)
	incl	%eax
	jnz	L(lshift)

	setc	%al

	movl	PARAM_SRC, %esi
	movl	%eax, -4(%edi)		C dst most significant limb

	movl	PARAM_SIZE, %ecx


C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set yet, and just gets the low
C limb of src[0]^2.
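C
C A C sketch of this final pass (illustrative only, (hi,lo) a
C double-limb product):
C
C	(hi, lo) = src[0] * src[0];
C	dst[0] = lo;			/* dst[0] written here first */
C	for (i = 1; i < size; i++)
C	  {
C	    (next_hi, next_lo) = src[i] * src[i];
C	    add hi to dst[2*i-1] and next_lo to dst[2*i], carrying
C	    into next_hi;
C	    hi = next_hi;
C	  }
C	dst[2*size-1] += hi;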

	movl	(%esi), %eax		C src[0]

	mull	%eax

	leal	(%esi,%ecx,4), %esi	C src points just after last limb
	negl	%ecx

	movl	%eax, (%edi,%ecx,8)	C dst[0]
	incl	%ecx

L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry
	C esi	src just after last limb
	C edi	dst just after last limb
	C ebp

	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, -4(%edi,%ecx,8)
	adcl	%eax, (%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, -4(%edi)		C dst most significant limb
	movl	SAVE_EDI, %edi

	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret

EPILOGUE()