xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/p6/mul_basecase.asm (revision f0fde9902fd4d72ded2807793acc7bfaa1ebf243)
1dnl  Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
2
3dnl  Copyright 1999-2003 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
35
36
37dnl  P6 UNROLL_COUNT cycles/product (approx)
38dnl           8           7
39dnl          16           6.5
40dnl          32           6.4
41dnl  Maximum possible with the current code is 32.
42
43deflit(UNROLL_COUNT, 16)
44
45
46C void mpn_mul_basecase (mp_ptr wp,
47C                        mp_srcptr xp, mp_size_t xsize,
48C                        mp_srcptr yp, mp_size_t ysize);
49C
50C This routine is essentially the same as mpn/generic/mul_basecase.c, but
51C it's faster because it does most of the mpn_addmul_1() startup
52C calculations only once.
53
54ifdef(`PIC',`
55deflit(UNROLL_THRESHOLD, 5)
56',`
57deflit(UNROLL_THRESHOLD, 5)
58')
59
60defframe(PARAM_YSIZE,20)
61defframe(PARAM_YP,   16)
62defframe(PARAM_XSIZE,12)
63defframe(PARAM_XP,   8)
64defframe(PARAM_WP,   4)
65
66	TEXT
67	ALIGN(16)
68
69PROLOGUE(mpn_mul_basecase)
70deflit(`FRAME',0)
71
72	movl	PARAM_XSIZE, %ecx
73
74	movl	PARAM_YP, %eax
75
76	movl	PARAM_XP, %edx
77
78	movl	(%eax), %eax		C yp[0]
79	cmpl	$2, %ecx
80	ja	L(xsize_more_than_two)
81	je	L(two_by_something)
82
83
84	C one limb by one limb
85
86	mull	(%edx)
87
88	movl	PARAM_WP, %ecx
89	movl	%eax, (%ecx)
90	movl	%edx, 4(%ecx)
91	ret
92
93
94C -----------------------------------------------------------------------------
95L(two_by_something):
96deflit(`FRAME',0)
97
98dnl  re-use parameter space
99define(SAVE_EBX, `PARAM_XSIZE')
100define(SAVE_ESI, `PARAM_YSIZE')
101
102	movl	%ebx, SAVE_EBX
103	cmpl	$1, PARAM_YSIZE
104	movl	%eax, %ecx		C yp[0]
105
106	movl	%esi, SAVE_ESI		C save esi
107	movl	PARAM_WP, %ebx
108	movl	%edx, %esi		C xp
109
110	movl	(%edx), %eax		C xp[0]
111	jne	L(two_by_two)
112
113
114	C two limbs by one limb
115	C
116	C eax	xp[0]
117	C ebx	wp
118	C ecx	yp[0]
119	C edx
120	C esi	xp
121
122	mull	%ecx
123
124	movl	%eax, (%ebx)
125	movl	4(%esi), %eax
126	movl	%edx, %esi		C carry
127
128	mull	%ecx
129
130	addl	%eax, %esi
131
132	movl	%esi, 4(%ebx)
133	movl	SAVE_ESI, %esi
134
135	adcl	$0, %edx
136
137	movl	%edx, 8(%ebx)
138	movl	SAVE_EBX, %ebx
139
140	ret
141
142
143
144C -----------------------------------------------------------------------------
145
146	ALIGN(16)
147L(two_by_two):
148	C eax	xp[0]
149	C ebx	wp
150	C ecx	yp[0]
151	C edx
152	C esi	xp
153	C edi
154	C ebp
155
156dnl  more parameter space re-use
157define(SAVE_EDI, `PARAM_WP')
158
159	mull	%ecx		C xp[0] * yp[0]
160
161	movl	%edi, SAVE_EDI
162	movl	%edx, %edi	C carry, for wp[1]
163
164	movl	%eax, (%ebx)
165	movl	4(%esi), %eax
166
167	mull	%ecx		C xp[1] * yp[0]
168
169	addl	%eax, %edi
170	movl	PARAM_YP, %ecx
171
172	adcl	$0, %edx
173	movl	4(%ecx), %ecx	C yp[1]
174
175	movl	%edi, 4(%ebx)
176	movl	4(%esi), %eax	C xp[1]
177	movl	%edx, %edi	C carry, for wp[2]
178
179	mull	%ecx		C xp[1] * yp[1]
180
181	addl	%eax, %edi
182	movl	(%esi), %eax	C xp[0]
183
184	adcl	$0, %edx
185	movl	%edx, %esi	C carry, for wp[3]
186
187	mull	%ecx		C xp[0] * yp[1]
188
189	addl	%eax, 4(%ebx)
190	movl	%esi, %eax
191
192	adcl	%edx, %edi
193	movl	SAVE_ESI, %esi
194
195	movl	%edi, 8(%ebx)
196
197	adcl	$0, %eax
198	movl	SAVE_EDI, %edi
199
200	movl	%eax, 12(%ebx)
201	movl	SAVE_EBX, %ebx
202
203	ret
204
205
206C -----------------------------------------------------------------------------
207	ALIGN(16)
208L(xsize_more_than_two):
209
210C The first limb of yp is processed with a simple mpn_mul_1 loop running at
211C about 6.2 c/l.  Unrolling this doesn't seem worthwhile since it's only run
212C once (whereas the addmul_1 below is run ysize-1 many times).  A call to
213C mpn_mul_1 would be slowed down by the parameter pushing and popping etc,
214C and doesn't seem likely to be worthwhile on the typical sizes reaching
215C here from the Karatsuba code.
216
217	C eax	yp[0]
218	C ebx
219	C ecx	xsize
220	C edx	xp
221	C esi
222	C edi
223	C ebp
224
225defframe(`SAVE_EBX',    -4)
226defframe(`SAVE_ESI',    -8)
227defframe(`SAVE_EDI',   -12)
228defframe(`SAVE_EBP',   -16)
229defframe(VAR_COUNTER,  -20)  dnl for use in the unroll case
230defframe(VAR_ADJUST,   -24)
231defframe(VAR_JMP,      -28)
232defframe(VAR_SWAP,     -32)
233defframe(VAR_XP_LOW,   -36)
234deflit(STACK_SPACE, 36)
235
236	subl	$STACK_SPACE, %esp
237deflit(`FRAME',STACK_SPACE)
238
239	movl	%edi, SAVE_EDI
240	movl	PARAM_WP, %edi
241
242	movl	%ebx, SAVE_EBX
243
244	movl	%ebp, SAVE_EBP
245	movl	%eax, %ebp
246
247	movl	%esi, SAVE_ESI
248	xorl	%ebx, %ebx
249	leal	(%edx,%ecx,4), %esi	C xp end
250
251	leal	(%edi,%ecx,4), %edi	C wp end of mul1
252	negl	%ecx
253
254
255L(mul1):
256	C eax	scratch
257	C ebx	carry
258	C ecx	counter, negative
259	C edx	scratch
260	C esi	xp end
261	C edi	wp end of mul1
262	C ebp	multiplier
263
264	movl	(%esi,%ecx,4), %eax
265
266	mull	%ebp
267
268	addl	%ebx, %eax
269	movl	%eax, (%edi,%ecx,4)
270	movl	$0, %ebx
271
272	adcl	%edx, %ebx
273	incl	%ecx
274	jnz	L(mul1)
275
276
277	movl	PARAM_YSIZE, %edx
278
279	movl	%ebx, (%edi)		C final carry
280	movl	PARAM_XSIZE, %ecx
281	decl	%edx
282
283	jz	L(done)			C if ysize==1
284
285	cmpl	$UNROLL_THRESHOLD, %ecx
286	movl	PARAM_YP, %eax
287	jae	L(unroll)
288
289
290C -----------------------------------------------------------------------------
291	C simple addmul looping
292	C
293	C eax	yp
294	C ebx
295	C ecx	xsize
296	C edx	ysize-1
297	C esi	xp end
298	C edi	wp end of mul1
299	C ebp
300
301	leal	4(%eax,%edx,4), %ebp	C yp end
302	negl	%ecx
303	negl	%edx
304
305	movl	%edx, PARAM_YSIZE	C -(ysize-1)
306	movl	(%esi,%ecx,4), %eax	C xp low limb
307	incl	%ecx
308
309	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
310	xorl	%ebx, %ebx		C initial carry
311
312	movl	%ebp, PARAM_YP
313	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
314	jmp	L(simple_outer_entry)
315
316
317L(simple_outer_top):
318	C ebp	ysize counter, negative
319
320	movl	PARAM_YP, %edx
321
322	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
323	xorl	%ebx, %ebx		C carry
324
325	movl	%ebp, PARAM_YSIZE
326	addl	$4, %edi		C next position in wp
327
328	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
329
330	movl	-4(%esi,%ecx,4), %eax	C xp low limb
331
332
333L(simple_outer_entry):
334
335L(simple_inner_top):
336	C eax	xp limb
337	C ebx	carry limb
338	C ecx	loop counter (negative)
339	C edx	scratch
340	C esi	xp end
341	C edi	wp end
342	C ebp	multiplier
343
344	mull	%ebp
345
346	addl	%eax, %ebx
347	adcl	$0, %edx
348
349	addl	%ebx, (%edi,%ecx,4)
350	movl	(%esi,%ecx,4), %eax
351	adcl	$0, %edx
352
353	incl	%ecx
354	movl	%edx, %ebx
355	jnz	L(simple_inner_top)
356
357
358	C separate code for last limb so outer loop counter handling can be
359	C interleaved
360
361	mull	%ebp
362
363	movl	PARAM_YSIZE, %ebp
364	addl	%eax, %ebx
365
366	adcl	$0, %edx
367
368	addl	%ebx, (%edi)
369
370	adcl	$0, %edx
371	incl	%ebp
372
373	movl	%edx, 4(%edi)
374	jnz	L(simple_outer_top)
375
376
377L(done):
378	movl	SAVE_EBX, %ebx
379
380	movl	SAVE_ESI, %esi
381
382	movl	SAVE_EDI, %edi
383
384	movl	SAVE_EBP, %ebp
385	addl	$FRAME, %esp
386
387	ret
388
389
390
391C -----------------------------------------------------------------------------
392C
393C The unrolled loop is the same as in mpn_addmul_1, see that code for some
394C comments.
395C
396C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
397C increment xp and wp.  This is used to adjust xp and wp, and is rshifted to
398C given an initial VAR_COUNTER at the top of the outer loop.
399C
400C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
401C up to -1, inclusive.
402C
403C VAR_JMP is the computed jump into the unrolled loop.
404C
405C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
406C initial ebx and ecx on entry to the unrolling.
407C
408C VAR_XP_LOW is the least significant limb of xp, which is needed at the
409C start of the unrolled loop.
410C
411C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
412C inclusive.
413C
414C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
415C added to give the location of the next limb of yp, which is the multiplier
416C in the unrolled loop.
417C
418C The trick with the VAR_ADJUST value means it's only necessary to do one
419C fetch in the outer loop to take care of xp, wp and the inner loop counter.
420
421
422L(unroll):
423	C eax	yp
424	C ebx
425	C ecx	xsize
426	C edx	ysize-1
427	C esi	xp end
428	C edi	wp end of mul1
429	C ebp
430
431	movl	PARAM_XP, %esi
432
433	movl	4(%eax), %ebp		C multiplier (yp second limb)
434	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
435
436	movl	%eax, PARAM_YP
437	movl	PARAM_WP, %edi
438	negl	%edx
439
440	movl	%edx, PARAM_YSIZE
441	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
442	decl	%ecx				C xsize-1
443
444	movl	(%esi), %eax		C xp low limb
445	andl	$-UNROLL_MASK-1, %ebx
446	negl	%ecx			C -(xsize-1)
447
448	negl	%ebx
449	andl	$UNROLL_MASK, %ecx
450
451	movl	%ebx, VAR_ADJUST
452	movl	%ecx, %edx
453	shll	$4, %ecx
454
455	movl	%eax, VAR_XP_LOW
456	sarl	$UNROLL_LOG2, %ebx
457	negl	%edx
458
459	C 15 code bytes per limb
460ifdef(`PIC',`
461	call	L(pic_calc)
462L(unroll_here):
463',`
464	leal	L(unroll_inner_entry) (%ecx,%edx,1), %ecx
465')
466
467	movl	%ecx, VAR_JMP
468	movl	%edx, %ecx
469	shll	$31, %edx
470
471	sarl	$31, %edx		C 0 or -1 as xsize odd or even
472	leal	4(%edi,%ecx,4), %edi	C wp and xp, adjust for unrolling,
473	leal	4(%esi,%ecx,4), %esi	C  and start at second limb
474
475	movl	%edx, VAR_SWAP
476	jmp	L(unroll_outer_entry)
477
478
479ifdef(`PIC',`
480L(pic_calc):
481	C See mpn/x86/README about old gas bugs
482	leal	(%ecx,%edx,1), %ecx
483	addl	$L(unroll_inner_entry)-L(unroll_here), %ecx
484	addl	(%esp), %ecx
485	ret_internal
486')
487
488
489C --------------------------------------------------------------------------
490	ALIGN(16)
491L(unroll_outer_top):
492	C eax
493	C ebx
494	C ecx
495	C edx
496	C esi	xp + offset
497	C edi	wp + offset
498	C ebp	ysize counter, negative
499
500	movl	VAR_ADJUST, %ebx
501	movl	PARAM_YP, %edx
502
503	movl	VAR_XP_LOW, %eax
504	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
505
506	leal	eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
507	leal	(%esi,%ebx,4), %esi
508	sarl	$UNROLL_LOG2, %ebx
509
510	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
511
512L(unroll_outer_entry):
513	mull	%ebp
514
515	movl	%ebx, VAR_COUNTER
516	movl	%edx, %ebx		C carry high
517	movl	%eax, %ecx		C carry low
518
519	xorl	%edx, %eax
520	movl	VAR_JMP, %edx
521
522	andl	VAR_SWAP, %eax
523
524	xorl	%eax, %ebx		C carries other way for odd index
525	xorl	%eax, %ecx
526
527	jmp	*%edx
528
529
530C -----------------------------------------------------------------------------
531
532L(unroll_inner_top):
533	C eax	xp limb
534	C ebx	carry high
535	C ecx	carry low
536	C edx	scratch
537	C esi	xp+8
538	C edi	wp
539	C ebp	yp multiplier limb
540	C
541	C VAR_COUNTER  loop counter, negative
542	C
543	C 15 bytes each limb
544
545	addl	$UNROLL_BYTES, %edi
546
547L(unroll_inner_entry):
548
549deflit(CHUNK_COUNT,2)
550forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
551	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
552	deflit(`disp1', eval(disp0 + 4))
553
554Zdisp(	movl,	disp0,(%esi), %eax)
555	mull	%ebp
556Zdisp(	addl,	%ecx, disp0,(%edi))
557	adcl	%eax, %ebx		C new carry low
558	movl	%edx, %ecx
559	adcl	$0, %ecx		C new carry high
560
561	movl	disp1(%esi), %eax
562	mull	%ebp
563	addl	%ebx, disp1(%edi)
564	adcl	%eax, %ecx		C new carry low
565	movl	%edx, %ebx
566	adcl	$0, %ebx		C new carry high
567')
568
569
570	incl	VAR_COUNTER
571	leal	UNROLL_BYTES(%esi), %esi
572	jnz	L(unroll_inner_top)
573
574
575	C eax
576	C ebx	carry high
577	C ecx	carry low
578	C edx
579	C esi
580	C edi	wp, pointing at second last limb)
581	C ebp
582
583deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
584deflit(`disp1', eval(disp0 + 4))
585
586	movl	PARAM_YSIZE, %ebp
587	addl	%ecx, disp0(%edi)	C carry low
588
589	adcl	$0, %ebx
590	incl	%ebp
591
592	movl	%ebx, disp1(%edi)	C carry high
593	jnz	L(unroll_outer_top)
594
595
596	movl	SAVE_ESI, %esi
597
598	movl	SAVE_EBP, %ebp
599
600	movl	SAVE_EDI, %edi
601
602	movl	SAVE_EBX, %ebx
603	addl	$FRAME, %esp
604
605	ret
606
607EPILOGUE()
608