xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/mul_basecase.asm (revision b45fa494daa2ba02187711d31a4144faf0993066)
1dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
2dnl  in a third limb vector.
3
4dnl  Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free Software
5dnl  Foundation, Inc.
6dnl
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or
10dnl  modify it under the terms of the GNU Lesser General Public License as
11dnl  published by the Free Software Foundation; either version 3 of the
12dnl  License, or (at your option) any later version.
13dnl
14dnl  The GNU MP Library is distributed in the hope that it will be useful,
15dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
16dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17dnl  Lesser General Public License for more details.
18dnl
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24
25C     cycles/crossproduct
26C P5:     15
27C P6:      7.5
28C K6:     12.5
29C K7:      5.5
30C P4:     24
31
32
33C void mpn_mul_basecase (mp_ptr wp,
34C                        mp_srcptr xp, mp_size_t xsize,
35C                        mp_srcptr yp, mp_size_t ysize);
36C
C This was written in haste since the Pentium optimized code that was used
C for all x86 machines was slow for the Pentium II.  This code would benefit
C from some cleanup.
C
C To shave off some percentage of the run-time, one should make 4 variants
C of the L(outer) loop, for the four different outcomes of xsize mod 4.  That
C would avoid L(oop0) altogether.  Code expansion would be > 4-fold for that
C part of the function, but since it is not very large, that would be
C acceptable.
C
C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
C unknown.
49
C Stack frame layout.
C
C The PARAM_ names are the five arguments, in prototype order, at offsets
C 4..20.  The VAR_ names are two local slots inside the VAR_STACK_SPACE
C bytes that the prologue reserves.
C
C NOTE(review): defframe, deflit and FRAME_pushl are defined in config.m4,
C which is not visible here.  Presumably each name becomes an esp-relative
C operand corrected by the current FRAME value -- confirm against config.m4.

defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

defframe(VAR_MULTIPLIER, -4)
defframe(VAR_COUNTER,    -8)
deflit(VAR_STACK_SPACE,  8)

	TEXT
	ALIGN(8)

C mpn_mul_basecase: wp[0 .. xsize+ysize-1] = xp[0 .. xsize-1] * yp[0 .. ysize-1]
C
C Structure:
C   1. First row: wp = xp * yp[0], stored (not accumulated), via L(oopM).
C   2. For each further y limb, L(outer) adds xp * yp[i] into wp starting
C      at wp[i].  The xsize mod 4 leading limbs go through L(oop0), the
C      rest through the four-way unrolled L(oopX).
C
C Register use:
C   esi   running pointer into xp
C   edi   running pointer into wp
C   ebp   yp pointer in the first row; second carry limb inside L(oopX)
C   ebx   carry limb (high half carried into the next position)
C   ecx   inner loop counter
C   eax, edx   operands and result of mull
C
C The code assumes xsize >= ysize >= 1: xsize = 1 is treated as a 1x1
C product without looking at ysize.

PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	subl	$VAR_STACK_SPACE,%esp
	pushl	%esi
	pushl	%ebp
	pushl	%edi
deflit(`FRAME',eval(VAR_STACK_SPACE+12))

	movl	PARAM_XP,%esi
	movl	PARAM_WP,%edi
	movl	PARAM_YP,%ebp

	movl	(%esi),%eax		C load xp[0]
	mull	(%ebp)			C multiply by yp[0]
	movl	%eax,(%edi)		C store to wp[0]
	movl	PARAM_XSIZE,%ecx	C xsize
	decl	%ecx			C If xsize = 1, ysize = 1 too
	jz	L(done)

	C ebx is only saved on this path; L(done) above never pushes it.
	pushl	%ebx
FRAME_pushl()
	movl	%edx,%ebx		C carry limb = high half of xp[0]*yp[0]

	leal	4(%esi),%esi
	leal	4(%edi),%edi

	C First row, remaining xsize-1 limbs: wp[j] = low(xp[j]*yp[0]) + carry.
	C movl and leal leave the flags alone, so the carry from addl is still
	C valid when adcl consumes it.
L(oopM):
	movl	(%esi),%eax		C load next limb at xp[j]
	leal	4(%esi),%esi
	mull	(%ebp)
	addl	%ebx,%eax
	movl	%edx,%ebx
	adcl	$0,%ebx
	movl	%eax,(%edi)
	leal	4(%edi),%edi
	decl	%ecx
	jnz	L(oopM)

	movl	%ebx,(%edi)		C most significant limb of product
	addl	$4,%edi			C increment wp
	C Rewind by xsize limbs: esi is back at xp[0], edi is at wp[1].
	movl	PARAM_XSIZE,%eax
	shll	$2,%eax
	subl	%eax,%edi
	subl	%eax,%esi

	movl	PARAM_YSIZE,%eax	C ysize
	decl	%eax
	jz	L(skip)
	movl	%eax,VAR_COUNTER	C rows still to do = ysize-1

	C One pass per remaining y limb.  The PARAM_YP argument slot is reused
	C as the running yp pointer, because ebp is needed inside L(oopX).
L(outer):
	movl	PARAM_YP,%ebp		C yp
	addl	$4,%ebp			C make ebp point to next y limb
	movl	%ebp,PARAM_YP
	movl	(%ebp),%eax		C copy y limb ...
	movl	%eax,VAR_MULTIPLIER	C ... to stack slot
	movl	PARAM_XSIZE,%ecx

	xorl	%ebx,%ebx		C carry limb starts at zero for each row
	andl	$3,%ecx			C xsize mod 4 limbs go through L(oop0)
	jz	L(end0)

L(oop0):
	movl	(%esi),%eax
	mull	VAR_MULTIPLIER
	leal	4(%esi),%esi
	addl	%ebx,%eax
	movl	$0,%ebx			C movl, not xorl: the carry flag must survive
	adcl	%ebx,%edx
	addl	%eax,(%edi)
	adcl	%edx,%ebx		C propagate carry into cylimb

	leal	4(%edi),%edi
	decl	%ecx
	jnz	L(oop0)

L(end0):
	movl	PARAM_XSIZE,%ecx
	shrl	$2,%ecx			C number of four-limb groups
	jz	L(endX)

	C Four limbs per iteration.  ebx and ebp alternate as the carry limb;
	C each is cleared with movl so that the pending carry flag is kept.
	ALIGN(8)
L(oopX):
	movl	(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%eax,%ebx
	movl	$0,%ebp
	adcl	%edx,%ebp

	movl	4(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebx,(%edi)
	adcl	%eax,%ebp	C new lo + cylimb
	movl	$0,%ebx
	adcl	%edx,%ebx

	movl	8(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebp,4(%edi)
	adcl	%eax,%ebx	C new lo + cylimb
	movl	$0,%ebp
	adcl	%edx,%ebp

	movl	12(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebx,8(%edi)
	adcl	%eax,%ebp	C new lo + cylimb
	movl	$0,%ebx
	adcl	%edx,%ebx

	addl	%ebp,12(%edi)
	adcl	$0,%ebx		C propagate carry into cylimb

	leal	16(%esi),%esi
	leal	16(%edi),%edi
	decl	%ecx
	jnz	L(oopX)

L(endX):
	movl	%ebx,(%edi)		C top limb of this row is stored, not added
	addl	$4,%edi

	C we incremented wp and xp in the loop above; compensate
	C (edi ends up one limb further on than at the start of this row)
	movl	PARAM_XSIZE,%eax
	shll	$2,%eax
	subl	%eax,%edi
	subl	%eax,%esi

	movl	VAR_COUNTER,%eax
	decl	%eax
	movl	%eax,VAR_COUNTER
	jnz	L(outer)

	C Normal exit: four saved registers, then the local slots.
L(skip):
	popl	%ebx
	popl	%edi
	popl	%ebp
	popl	%esi
	addl	$VAR_STACK_SPACE,%esp	C must mirror the subl in the prologue
	ret

	C 1x1 exit: ebx was never pushed on this path.
L(done):
	movl	%edx,4(%edi)	   C store to wp[1]
	popl	%edi
	popl	%ebp
	popl	%esi
	addl	$VAR_STACK_SPACE,%esp	C must mirror the subl in the prologue
	ret

EPILOGUE()
214