xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/p6/mul_basecase.asm (revision 288bb96063654ec504ca8732afc683d3ebc514b5)
1dnl  Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
2
3dnl  Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
24
25
26dnl  P6 UNROLL_COUNT cycles/product (approx)
27dnl           8           7
28dnl          16           6.5
29dnl          32           6.4
30dnl  Maximum possible with the current code is 32.
31
32deflit(UNROLL_COUNT, 16)
33
34
35C void mpn_mul_basecase (mp_ptr wp,
36C                        mp_srcptr xp, mp_size_t xsize,
37C                        mp_srcptr yp, mp_size_t ysize);
38C
39C This routine is essentially the same as mpn/generic/mul_basecase.c, but
40C it's faster because it does most of the mpn_addmul_1() startup
41C calculations only once.
42
43ifdef(`PIC',`
44deflit(UNROLL_THRESHOLD, 5)
45',`
46deflit(UNROLL_THRESHOLD, 5)
47')
48
49defframe(PARAM_YSIZE,20)
50defframe(PARAM_YP,   16)
51defframe(PARAM_XSIZE,12)
52defframe(PARAM_XP,   8)
53defframe(PARAM_WP,   4)
54
55	TEXT
56	ALIGN(16)
57
58PROLOGUE(mpn_mul_basecase)
59deflit(`FRAME',0)
60
61	movl	PARAM_XSIZE, %ecx
62
63	movl	PARAM_YP, %eax
64
65	movl	PARAM_XP, %edx
66
67	movl	(%eax), %eax		C yp[0]
68	cmpl	$2, %ecx
69	ja	L(xsize_more_than_two)
70	je	L(two_by_something)
71
72
73	C one limb by one limb
74
75	mull	(%edx)
76
77	movl	PARAM_WP, %ecx
78	movl	%eax, (%ecx)
79	movl	%edx, 4(%ecx)
80	ret
81
82
83C -----------------------------------------------------------------------------
84L(two_by_something):
85deflit(`FRAME',0)
86
87dnl  re-use parameter space
88define(SAVE_EBX, `PARAM_XSIZE')
89define(SAVE_ESI, `PARAM_YSIZE')
90
91	movl	%ebx, SAVE_EBX
92	cmpl	$1, PARAM_YSIZE
93	movl	%eax, %ecx		C yp[0]
94
95	movl	%esi, SAVE_ESI		C save esi
96	movl	PARAM_WP, %ebx
97	movl	%edx, %esi		C xp
98
99	movl	(%edx), %eax		C xp[0]
100	jne	L(two_by_two)
101
102
103	C two limbs by one limb
104	C
105	C eax	xp[0]
106	C ebx	wp
107	C ecx	yp[0]
108	C edx
109	C esi	xp
110
111	mull	%ecx
112
113	movl	%eax, (%ebx)
114	movl	4(%esi), %eax
115	movl	%edx, %esi		C carry
116
117	mull	%ecx
118
119	addl	%eax, %esi
120
121	movl	%esi, 4(%ebx)
122	movl	SAVE_ESI, %esi
123
124	adcl	$0, %edx
125
126	movl	%edx, 8(%ebx)
127	movl	SAVE_EBX, %ebx
128
129	ret
130
131
132
133C -----------------------------------------------------------------------------
134
135	ALIGN(16)
136L(two_by_two):
137	C eax	xp[0]
138	C ebx	wp
139	C ecx	yp[0]
140	C edx
141	C esi	xp
142	C edi
143	C ebp
144
145dnl  more parameter space re-use
146define(SAVE_EDI, `PARAM_WP')
147
148	mull	%ecx		C xp[0] * yp[0]
149
150	movl	%edi, SAVE_EDI
151	movl	%edx, %edi	C carry, for wp[1]
152
153	movl	%eax, (%ebx)
154	movl	4(%esi), %eax
155
156	mull	%ecx		C xp[1] * yp[0]
157
158	addl	%eax, %edi
159	movl	PARAM_YP, %ecx
160
161	adcl	$0, %edx
162	movl	4(%ecx), %ecx	C yp[1]
163
164	movl	%edi, 4(%ebx)
165	movl	4(%esi), %eax	C xp[1]
166	movl	%edx, %edi	C carry, for wp[2]
167
168	mull	%ecx		C xp[1] * yp[1]
169
170	addl	%eax, %edi
171	movl	(%esi), %eax	C xp[0]
172
173	adcl	$0, %edx
174	movl	%edx, %esi	C carry, for wp[3]
175
176	mull	%ecx		C xp[0] * yp[1]
177
178	addl	%eax, 4(%ebx)
179	movl	%esi, %eax
180
181	adcl	%edx, %edi
182	movl	SAVE_ESI, %esi
183
184	movl	%edi, 8(%ebx)
185
186	adcl	$0, %eax
187	movl	SAVE_EDI, %edi
188
189	movl	%eax, 12(%ebx)
190	movl	SAVE_EBX, %ebx
191
192	ret
193
194
195C -----------------------------------------------------------------------------
196	ALIGN(16)
197L(xsize_more_than_two):
198
199C The first limb of yp is processed with a simple mpn_mul_1 loop running at
200C about 6.2 c/l.  Unrolling this doesn't seem worthwhile since it's only run
201C once (whereas the addmul_1 below is run ysize-1 many times).  A call to
202C mpn_mul_1 would be slowed down by the parameter pushing and popping etc,
203C and doesn't seem likely to be worthwhile on the typical sizes reaching
204C here from the Karatsuba code.
205
206	C eax	yp[0]
207	C ebx
208	C ecx	xsize
209	C edx	xp
210	C esi
211	C edi
212	C ebp
213
214defframe(`SAVE_EBX',    -4)
215defframe(`SAVE_ESI',    -8)
216defframe(`SAVE_EDI',   -12)
217defframe(`SAVE_EBP',   -16)
218defframe(VAR_COUNTER,  -20)  dnl for use in the unroll case
219defframe(VAR_ADJUST,   -24)
220defframe(VAR_JMP,      -28)
221defframe(VAR_SWAP,     -32)
222defframe(VAR_XP_LOW,   -36)
223deflit(STACK_SPACE, 36)
224
225	subl	$STACK_SPACE, %esp
226deflit(`FRAME',STACK_SPACE)
227
228	movl	%edi, SAVE_EDI
229	movl	PARAM_WP, %edi
230
231	movl	%ebx, SAVE_EBX
232
233	movl	%ebp, SAVE_EBP
234	movl	%eax, %ebp
235
236	movl	%esi, SAVE_ESI
237	xorl	%ebx, %ebx
238	leal	(%edx,%ecx,4), %esi	C xp end
239
240	leal	(%edi,%ecx,4), %edi	C wp end of mul1
241	negl	%ecx
242
243
244L(mul1):
245	C eax	scratch
246	C ebx	carry
247	C ecx	counter, negative
248	C edx	scratch
249	C esi	xp end
250	C edi	wp end of mul1
251	C ebp	multiplier
252
253	movl	(%esi,%ecx,4), %eax
254
255	mull	%ebp
256
257	addl	%ebx, %eax
258	movl	%eax, (%edi,%ecx,4)
259	movl	$0, %ebx
260
261	adcl	%edx, %ebx
262	incl	%ecx
263	jnz	L(mul1)
264
265
266	movl	PARAM_YSIZE, %edx
267
268	movl	%ebx, (%edi)		C final carry
269	movl	PARAM_XSIZE, %ecx
270	decl	%edx
271
272	jz	L(done)			C if ysize==1
273
274	cmpl	$UNROLL_THRESHOLD, %ecx
275	movl	PARAM_YP, %eax
276	jae	L(unroll)
277
278
279C -----------------------------------------------------------------------------
280	C simple addmul looping
281	C
282	C eax	yp
283	C ebx
284	C ecx	xsize
285	C edx	ysize-1
286	C esi	xp end
287	C edi	wp end of mul1
288	C ebp
289
290	leal	4(%eax,%edx,4), %ebp	C yp end
291	negl	%ecx
292	negl	%edx
293
294	movl	%edx, PARAM_YSIZE	C -(ysize-1)
295	movl	(%esi,%ecx,4), %eax	C xp low limb
296	incl	%ecx
297
298	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
299	xorl	%ebx, %ebx		C initial carry
300
301	movl	%ebp, PARAM_YP
302	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
303	jmp	L(simple_outer_entry)
304
305
306L(simple_outer_top):
307	C ebp	ysize counter, negative
308
309	movl	PARAM_YP, %edx
310
311	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
312	xorl	%ebx, %ebx		C carry
313
314	movl	%ebp, PARAM_YSIZE
315	addl	$4, %edi		C next position in wp
316
317	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
318
319	movl	-4(%esi,%ecx,4), %eax	C xp low limb
320
321
322L(simple_outer_entry):
323
324L(simple_inner_top):
325	C eax	xp limb
326	C ebx	carry limb
327	C ecx	loop counter (negative)
328	C edx	scratch
329	C esi	xp end
330	C edi	wp end
331	C ebp	multiplier
332
333	mull	%ebp
334
335	addl	%eax, %ebx
336	adcl	$0, %edx
337
338	addl	%ebx, (%edi,%ecx,4)
339	movl	(%esi,%ecx,4), %eax
340	adcl	$0, %edx
341
342	incl	%ecx
343	movl	%edx, %ebx
344	jnz	L(simple_inner_top)
345
346
347	C separate code for last limb so outer loop counter handling can be
348	C interleaved
349
350	mull	%ebp
351
352	movl	PARAM_YSIZE, %ebp
353	addl	%eax, %ebx
354
355	adcl	$0, %edx
356
357	addl	%ebx, (%edi)
358
359	adcl	$0, %edx
360	incl	%ebp
361
362	movl	%edx, 4(%edi)
363	jnz	L(simple_outer_top)
364
365
366L(done):
367	movl	SAVE_EBX, %ebx
368
369	movl	SAVE_ESI, %esi
370
371	movl	SAVE_EDI, %edi
372
373	movl	SAVE_EBP, %ebp
374	addl	$FRAME, %esp
375
376	ret
377
378
379
380C -----------------------------------------------------------------------------
381C
382C The unrolled loop is the same as in mpn_addmul_1, see that code for some
383C comments.
384C
385C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
386C increment xp and wp.  This is used to adjust xp and wp, and is rshifted to
387C given an initial VAR_COUNTER at the top of the outer loop.
388C
389C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
390C up to -1, inclusive.
391C
392C VAR_JMP is the computed jump into the unrolled loop.
393C
394C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
395C initial ebx and ecx on entry to the unrolling.
396C
397C VAR_XP_LOW is the least significant limb of xp, which is needed at the
398C start of the unrolled loop.
399C
400C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
401C inclusive.
402C
403C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
404C added to give the location of the next limb of yp, which is the multiplier
405C in the unrolled loop.
406C
407C The trick with the VAR_ADJUST value means it's only necessary to do one
408C fetch in the outer loop to take care of xp, wp and the inner loop counter.
409
410
411L(unroll):
412	C eax	yp
413	C ebx
414	C ecx	xsize
415	C edx	ysize-1
416	C esi	xp end
417	C edi	wp end of mul1
418	C ebp
419
420	movl	PARAM_XP, %esi
421
422	movl	4(%eax), %ebp		C multiplier (yp second limb)
423	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
424
425	movl	%eax, PARAM_YP
426	movl	PARAM_WP, %edi
427	negl	%edx
428
429	movl	%edx, PARAM_YSIZE
430	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
431	decl	%ecx				C xsize-1
432
433	movl	(%esi), %eax		C xp low limb
434	andl	$-UNROLL_MASK-1, %ebx
435	negl	%ecx			C -(xsize-1)
436
437	negl	%ebx
438	andl	$UNROLL_MASK, %ecx
439
440	movl	%ebx, VAR_ADJUST
441	movl	%ecx, %edx
442	shll	$4, %ecx
443
444	movl	%eax, VAR_XP_LOW
445	sarl	$UNROLL_LOG2, %ebx
446	negl	%edx
447
448	C 15 code bytes per limb
449ifdef(`PIC',`
450	call	L(pic_calc)
451L(unroll_here):
452',`
453	leal	L(unroll_inner_entry) (%ecx,%edx,1), %ecx
454')
455
456	movl	%ecx, VAR_JMP
457	movl	%edx, %ecx
458	shll	$31, %edx
459
460	sarl	$31, %edx		C 0 or -1 as xsize odd or even
461	leal	4(%edi,%ecx,4), %edi	C wp and xp, adjust for unrolling,
462	leal	4(%esi,%ecx,4), %esi	C  and start at second limb
463
464	movl	%edx, VAR_SWAP
465	jmp	L(unroll_outer_entry)
466
467
468ifdef(`PIC',`
469L(pic_calc):
470	C See mpn/x86/README about old gas bugs
471	leal	(%ecx,%edx,1), %ecx
472	addl	$L(unroll_inner_entry)-L(unroll_here), %ecx
473	addl	(%esp), %ecx
474	ret_internal
475')
476
477
478C --------------------------------------------------------------------------
479	ALIGN(16)
480L(unroll_outer_top):
481	C eax
482	C ebx
483	C ecx
484	C edx
485	C esi	xp + offset
486	C edi	wp + offset
487	C ebp	ysize counter, negative
488
489	movl	VAR_ADJUST, %ebx
490	movl	PARAM_YP, %edx
491
492	movl	VAR_XP_LOW, %eax
493	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
494
495	leal	eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
496	leal	(%esi,%ebx,4), %esi
497	sarl	$UNROLL_LOG2, %ebx
498
499	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
500
501L(unroll_outer_entry):
502	mull	%ebp
503
504	movl	%ebx, VAR_COUNTER
505	movl	%edx, %ebx		C carry high
506	movl	%eax, %ecx		C carry low
507
508	xorl	%edx, %eax
509	movl	VAR_JMP, %edx
510
511	andl	VAR_SWAP, %eax
512
513	xorl	%eax, %ebx		C carries other way for odd index
514	xorl	%eax, %ecx
515
516	jmp	*%edx
517
518
519C -----------------------------------------------------------------------------
520
521L(unroll_inner_top):
522	C eax	xp limb
523	C ebx	carry high
524	C ecx	carry low
525	C edx	scratch
526	C esi	xp+8
527	C edi	wp
528	C ebp	yp multiplier limb
529	C
530	C VAR_COUNTER  loop counter, negative
531	C
532	C 15 bytes each limb
533
534	addl	$UNROLL_BYTES, %edi
535
536L(unroll_inner_entry):
537
538deflit(CHUNK_COUNT,2)
539forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
540	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
541	deflit(`disp1', eval(disp0 + 4))
542
543Zdisp(	movl,	disp0,(%esi), %eax)
544	mull	%ebp
545Zdisp(	addl,	%ecx, disp0,(%edi))
546	adcl	%eax, %ebx		C new carry low
547	movl	%edx, %ecx
548	adcl	$0, %ecx		C new carry high
549
550	movl	disp1(%esi), %eax
551	mull	%ebp
552	addl	%ebx, disp1(%edi)
553	adcl	%eax, %ecx		C new carry low
554	movl	%edx, %ebx
555	adcl	$0, %ebx		C new carry high
556')
557
558
559	incl	VAR_COUNTER
560	leal	UNROLL_BYTES(%esi), %esi
561	jnz	L(unroll_inner_top)
562
563
564	C eax
565	C ebx	carry high
566	C ecx	carry low
567	C edx
568	C esi
569	C edi	wp, pointing at second last limb)
570	C ebp
571
572deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
573deflit(`disp1', eval(disp0 + 4))
574
575	movl	PARAM_YSIZE, %ebp
576	addl	%ecx, disp0(%edi)	C carry low
577
578	adcl	$0, %ebx
579	incl	%ebp
580
581	movl	%ebx, disp1(%edi)	C carry high
582	jnz	L(unroll_outer_top)
583
584
585	movl	SAVE_ESI, %esi
586
587	movl	SAVE_EBP, %ebp
588
589	movl	SAVE_EDI, %edi
590
591	movl	SAVE_EBX, %ebx
592	addl	$FRAME, %esp
593
594	ret
595
596EPILOGUE()
597