dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
C     limbs/loop unrolling).



dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
dnl           8           4.67
dnl          16           4.59
dnl          32           4.42
dnl  Maximum possible with the current code is 32.
dnl
dnl  At 32 the typical 13-26 limb sizes from the karatsuba code will get
dnl  done with a straight run through a block of code, no inner loop.  Using
dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
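dnl
dnl  For instance (an illustrative figure, not a measurement): xsize=20
dnl  means xsize-1 = 19 limbs per addmul pass, and 19 <= 32, so each pass
dnl  jumps into the unrolled block and runs straight through it once.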

deflit(UNROLL_COUNT, 32)


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Calculate xp,xsize multiplied by yp,ysize, storing the result in
C wp,xsize+ysize.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() startup
C calculations only once.  The saving is 15-25% on typical sizes coming from
C the Karatsuba multiply code.
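C
C In C terms the overall shape is the usual schoolbook scheme (a rough
C sketch only, not the exact generic code):
C
C	wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
C	for (i = 1; i < ysize; i++)
C	  wp[xsize + i] = mpn_addmul_1 (wp + i, xp, xsize, yp[i]);
C
C i.e. one mul_1 pass for the low limb of yp, then ysize-1 addmul_1 passes,
C which is the shape of the code below too.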

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	movl	PARAM_XSIZE, %ecx
	movl	PARAM_YP, %eax

	movl	PARAM_XP, %edx
	movl	(%eax), %eax	C yp low limb

	cmpl	$2, %ecx
	ja	L(xsize_more_than_two)
	je	L(two_by_something)


	C one limb by one limb

	mull	(%edx)

	movl	PARAM_WP, %ecx
	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
L(two_by_something):
deflit(`FRAME',0)
	decl	PARAM_YSIZE
	pushl	%ebx		defframe_pushl(`SAVE_EBX')
	movl	%eax, %ecx	C yp low limb

	movl	PARAM_WP, %ebx
	pushl	%esi		defframe_pushl(`SAVE_ESI')
	movl	%edx, %esi	C xp

	movl	(%edx), %eax	C xp low limb
	jnz	L(two_by_two)


	C two limbs by one limb

	mull	%ecx

	movl	%eax, (%ebx)
	movl	4(%esi), %eax
	movl	%edx, %esi	C carry

	mull	%ecx

	addl	%eax, %esi

	movl	%esi, 4(%ebx)
	movl	SAVE_ESI, %esi

	adcl	$0, %edx

	movl	%edx, 8(%ebx)
	movl	SAVE_EBX, %ebx
	addl	$FRAME, %esp

	ret



C -----------------------------------------------------------------------------
C Could load yp earlier into another register.

	ALIGN(16)
L(two_by_two):
	C eax	xp low limb
	C ebx	wp
	C ecx	yp low limb
	C edx
	C esi	xp
	C edi
	C ebp
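	C
	C The 2x2 case is plain schoolbook; as a sketch of where each half
	C product lands (carries propagate upwards as usual):
	C
	C   wp[0] = lo(xp[0]*yp[0])
	C   wp[1] = hi(xp[0]*yp[0]) + lo(xp[1]*yp[0]) + lo(xp[0]*yp[1])
	C   wp[2] = hi(xp[1]*yp[0]) + lo(xp[1]*yp[1]) + hi(xp[0]*yp[1]) + carry
	C   wp[3] = hi(xp[1]*yp[1]) + carry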

dnl  FRAME carries on from previous

	mull	%ecx		C xp[0] * yp[0]

	push	%edi		defframe_pushl(`SAVE_EDI')
	movl	%edx, %edi	C carry, for wp[1]

	movl	%eax, (%ebx)
	movl	4(%esi), %eax

	mull	%ecx		C xp[1] * yp[0]

	addl	%eax, %edi
	movl	PARAM_YP, %ecx

	adcl	$0, %edx
	movl	4(%ecx), %ecx	C yp[1]
	movl	%edi, 4(%ebx)

	movl	4(%esi), %eax	C xp[1]
	movl	%edx, %edi	C carry, for wp[2]

	mull	%ecx		C xp[1] * yp[1]

	addl	%eax, %edi

	adcl	$0, %edx
	movl	(%esi), %eax	C xp[0]

	movl	%edx, %esi	C carry, for wp[3]

	mull	%ecx		C xp[0] * yp[1]

	addl	%eax, 4(%ebx)
	adcl	%edx, %edi
	movl	%edi, 8(%ebx)

	adcl	$0, %esi
	movl	SAVE_EDI, %edi
	movl	%esi, 12(%ebx)

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx
	addl	$FRAME, %esp

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(xsize_more_than_two):

C The first limb of yp is processed with a simple mpn_mul_1 style loop
C inline.  Unrolling this doesn't seem worthwhile since it's only run once
C (whereas the addmul below is run ysize-1 times).  A call to the actual
C mpn_mul_1 would be slowed down by the call overhead and by parameter
C pushing and popping, and doesn't look worthwhile for the typical 13-26
C limb operands the Karatsuba code calls here with.
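C
C As a sketch in C, using the umul_ppmm macro from longlong.h, the inline
C loop amounts to
C
C	carry = 0;
C	for (i = 0; i < xsize; i++)
C	  {
C	    umul_ppmm (hi, lo, xp[i], yp[0]);
C	    lo += carry;
C	    carry = hi + (lo < carry);
C	    wp[i] = lo;
C	  }
C	wp[xsize] = carry;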

	C eax	yp[0]
	C ebx
	C ecx	xsize
	C edx	xp
	C esi
	C edi
	C ebp

dnl  FRAME doesn't carry on from previous, no pushes yet here
defframe(`SAVE_EBX',-4)
defframe(`SAVE_ESI',-8)
defframe(`SAVE_EDI',-12)
defframe(`SAVE_EBP',-16)
deflit(`FRAME',0)

	subl	$16, %esp
deflit(`FRAME',16)

	movl	%edi, SAVE_EDI
	movl	PARAM_WP, %edi

	movl	%ebx, SAVE_EBX
	movl	%ebp, SAVE_EBP
	movl	%eax, %ebp

	movl	%esi, SAVE_ESI
	xorl	%ebx, %ebx
	leal	(%edx,%ecx,4), %esi	C xp end

	leal	(%edi,%ecx,4), %edi	C wp end of mul1
	negl	%ecx


L(mul1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, negative
	C edx	scratch
	C esi	xp end
	C edi	wp end of mul1
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax

	mull	%ebp

	addl	%ebx, %eax
	movl	%eax, (%edi,%ecx,4)
	movl	$0, %ebx

	adcl	%edx, %ebx
	incl	%ecx
	jnz	L(mul1)


	movl	PARAM_YSIZE, %edx
	movl	PARAM_XSIZE, %ecx

	movl	%ebx, (%edi)		C final carry
	decl	%edx

	jnz	L(ysize_more_than_one)


	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx

	movl	SAVE_EBP, %ebp
	movl	SAVE_ESI, %esi
	addl	$FRAME, %esp

	ret


L(ysize_more_than_one):
	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_YP, %eax

	jae	L(unroll)


C -----------------------------------------------------------------------------
	C simple addmul looping
	C
	C eax	yp
	C ebx
	C ecx	xsize
	C edx	ysize-1
	C esi	xp end
	C edi	wp end of mul1
	C ebp
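	C
	C In C terms this path is simply (a sketch, j over the remaining yp
	C limbs):
	C
	C	for (j = 1; j < ysize; j++)
	C	  wp[xsize + j] = mpn_addmul_1 (wp + j, xp, xsize, yp[j]);
	C
	C with the mpn_addmul_1 body open-coded at L(simple_inner).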

	leal	4(%eax,%edx,4), %ebp	C yp end
	negl	%ecx
	negl	%edx

	movl	(%esi,%ecx,4), %eax	C xp low limb
	movl	%edx, PARAM_YSIZE	C -(ysize-1)
	incl	%ecx

	xorl	%ebx, %ebx		C initial carry
	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
	movl	%ebp, PARAM_YP

	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
	jmp	L(simple_outer_entry)


	C this is offset 0x121 so close enough to aligned
L(simple_outer_top):
	C ebp	ysize counter, negative

	movl	PARAM_YP, %edx
	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
	xorl	%ebx, %ebx		C carry

	movl	%ebp, PARAM_YSIZE
	addl	$4, %edi		C next position in wp

	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
	movl	-4(%esi,%ecx,4), %eax	C xp low limb


L(simple_outer_entry):

L(simple_inner):
	C eax	xp limb
	C ebx	carry limb
	C ecx	loop counter (negative)
	C edx	scratch
	C esi	xp end
	C edi	wp end
	C ebp	multiplier

	mull	%ebp

	addl	%eax, %ebx
	adcl	$0, %edx

	addl	%ebx, (%edi,%ecx,4)
	movl	(%esi,%ecx,4), %eax
	adcl	$0, %edx

	incl	%ecx
	movl	%edx, %ebx
	jnz	L(simple_inner)


	mull	%ebp

	movl	PARAM_YSIZE, %ebp
	addl	%eax, %ebx

	adcl	$0, %edx
	addl	%ebx, (%edi)

	adcl	$0, %edx
	incl	%ebp

	movl	%edx, 4(%edi)
	jnz	L(simple_outer_top)


	movl	SAVE_EBX, %ebx
	movl	SAVE_ESI, %esi

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret



C -----------------------------------------------------------------------------
C
C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
C comments.
C
C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
C increment xp and wp.  This is used to adjust xp and wp back, and is
C rshifted to give an initial VAR_COUNTER at the top of the outer loop.
C
C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
C up to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled loop.
C
C VAR_XP_LOW is the least significant limb of xp, which is needed at the
C start of the unrolled loop.
C
C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
C inclusive.
C
C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
C added to give the location of the next limb of yp, which is the multiplier
C in the unrolled loop.
C
C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
C outer loop to take care of xp, wp and the inner loop counter.
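C
C As a worked example (figures derived from the code below, not measured):
C for xsize=20 the unrolled block covers xsize-1 = 19 limbs per pass, so
C
C	VAR_ADJUST   = -((19+31) & -32)        = -32
C	VAR_COUNTER  = -32 >> UNROLL_LOG2      = -1
C	entry offset = (-19) & UNROLL_MASK     = 13 limbs skipped
C	VAR_JMP      = L(unroll_entry) + 17*13 code bytes
C
C i.e. each pass enters 13 limbs into the 32-limb block, runs the remaining
C 19 limb pieces straight through, and exits without looping back.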

defframe(VAR_COUNTER,  -20)
defframe(VAR_ADJUST,   -24)
defframe(VAR_JMP,      -28)
defframe(VAR_XP_LOW,   -32)
deflit(VAR_EXTRA_SPACE, 16)


L(unroll):
	C eax	yp
	C ebx
	C ecx	xsize
	C edx	ysize-1
	C esi	xp end
	C edi	wp end of mul1
	C ebp

	movl	PARAM_XP, %esi
	movl	4(%eax), %ebp		C multiplier (yp second limb)
	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing

	movl	PARAM_WP, %edi
	movl	%eax, PARAM_YP
	negl	%edx

	movl	%edx, PARAM_YSIZE
	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
	decl	%ecx				C xsize-1

	movl	(%esi), %eax		C xp low limb
	andl	$-UNROLL_MASK-1, %ebx
	negl	%ecx

	subl	$VAR_EXTRA_SPACE, %esp
deflit(`FRAME',16+VAR_EXTRA_SPACE)
	negl	%ebx
	andl	$UNROLL_MASK, %ecx

	movl	%ebx, VAR_ADJUST
	movl	%ecx, %edx
	shll	$4, %ecx

	sarl	$UNROLL_LOG2, %ebx

	C 17 code bytes per limb
ifdef(`PIC',`
	call	L(pic_calc)
L(unroll_here):
',`
	leal	L(unroll_entry) (%ecx,%edx,1), %ecx
')
	negl	%edx

	movl	%eax, VAR_XP_LOW
	movl	%ecx, VAR_JMP
	leal	4(%edi,%edx,4), %edi	C wp and xp, adjust for unrolling,
	leal	4(%esi,%edx,4), %esi	C  and start at second limb
	jmp	L(unroll_outer_entry)


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%ecx,%edx,1), %ecx
	addl	$L(unroll_entry)-L(unroll_here), %ecx
	addl	(%esp), %ecx
	ret_internal
')
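C Note on the PIC path (descriptive only): the call pushes the address of
C L(unroll_here), so after the two addls in L(pic_calc) %ecx holds
C
C	&L(unroll_entry) + 17 * (limbs skipped at entry)
C
C which is the same value the non-PIC leal computes directly.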


C --------------------------------------------------------------------------
	ALIGN(32)
L(unroll_outer_top):
	C ebp	ysize counter, negative

	movl	VAR_ADJUST, %ebx
	movl	PARAM_YP, %edx

	movl	VAR_XP_LOW, %eax
	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter

	leal	4(%edi,%ebx,4), %edi
	leal	(%esi,%ebx,4), %esi
	sarl	$UNROLL_LOG2, %ebx

	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
	movl	VAR_JMP, %ecx

L(unroll_outer_entry):
	mull	%ebp

	testb	$1, %cl		C and clear carry bit
	movl	%ebx, VAR_COUNTER
	movl	$0, %ebx

	movl	$0, %ecx
	cmovz(	%eax, %ecx)	C eax into low carry, zero into high carry limb
	cmovnz(	%eax, %ebx)

	C Extra fetch of VAR_JMP is bad, but registers are tight
	jmp	*VAR_JMP


C -----------------------------------------------------------------------------
	ALIGN(32)
L(unroll_top):
	C eax	xp limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	xp+8
	C edi	wp
	C ebp	yp multiplier limb
	C
	C VAR_COUNTER  loop counter, negative
	C
	C 17 bytes each limb
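	C
	C Each two-limb group below alternates %ecx and %ebx as the low and
	C high carry limbs (a descriptive note): the value added into a wp
	C limb is the low half of that limb's product plus the high half of
	C the previous one, and each add to memory is scheduled in the
	C shadow of the mull issued just before it.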

L(unroll_entry):

deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%esi), %eax)
	adcl	%edx, %ebx

	mull	%ebp

Zdisp(	addl,	%ecx, disp0,(%edi))
	movl	$0, %ecx

	adcl	%eax, %ebx


	movl	disp1(%esi), %eax
	adcl	%edx, %ecx

	mull	%ebp

	addl	%ebx, disp1(%edi)
	movl	$0, %ebx

	adcl	%eax, %ecx
')


	incl	VAR_COUNTER
	leal	UNROLL_BYTES(%esi), %esi
	leal	UNROLL_BYTES(%edi), %edi

	jnz	L(unroll_top)


	C eax
	C ebx	zero
	C ecx	low
	C edx	high
	C esi
	C edi	wp, pointing at second last limb
	C ebp
	C
	C carry flag to be added to high

deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 4))

	movl	PARAM_YSIZE, %ebp
	adcl	$0, %edx
	addl	%ecx, disp0(%edi)

	adcl	$0, %edx
	incl	%ebp

	movl	%edx, disp1(%edi)
	jnz	L(unroll_outer_top)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	addl	$FRAME, %esp

	ret

EPILOGUE()