dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mul_1.asm (revision 49d8c9ecf4abd21261269266ef64939f71b3cd09)
dnl  AMD K7 mpn_mul_1.

dnl  Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C			    cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C AMD K6
C AMD K7			 3.25
C AMD K8

C TODO
C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
C    but we might be able to do even better.
C  * The feed-in code for mul_1c is crude.

ASM_START()
	TEXT
	ALIGN(16)
C mp_limb_t mpn_mul_1c (mp_ptr rp, mp_srcptr up, mp_size_t n,
C                       mp_limb_t vl, mp_limb_t cl)
C Like mpn_mul_1 below but with a carry-in limb cl.  Stack offsets
C after the 16-byte save area: 20=rp 24=up 28=n 32=vl 36=cl.
C Register roles: edi=rp  esi=up  ecx=vl  ebp/ebx=alternating
C accumulators  eax:edx=mul result.
C NOTE(review): all feed-in paths below jump into the shared main
C loop defined inside mpn_mul_1 at L(0)..L(3), L(cj2) and L(rt).
PROLOGUE(mpn_mul_1c)
	add	$-16, %esp		C room to save the callee-saved regs
	mov	%ebp, (%esp)
	mov	%ebx, 4(%esp)
	mov	%esi, 8(%esp)
	mov	%edi, 12(%esp)

	mov	20(%esp), %edi		C rp
	mov	24(%esp), %esi		C up
	mov	28(%esp), %ebp		C n
	mov	32(%esp), %ecx		C vl
	mov	%ebp, %ebx
	shr	$2, %ebp		C n/4 = full 4-limb loop rounds
	mov	%ebp, 28(%esp)		C loop counter reuses the n slot
	mov	(%esi), %eax		C up[0]
	and	$3, %ebx		C n mod 4 selects the feed-in path
	jz	L(c0)
	cmp	$2, %ebx
	mov	36(%esp), %ebx		C ebx = cl, the carry-in limb
	jz	L(c2)
	jg	L(c3)

C n = 4k+1: fold cl into the first product, then enter at L(1).
L(c1):	lea	-4(%edi), %edi
	mul	%ecx
	test	%ebp, %ebp
	jnz	1f
	add	%ebx, %eax		C n = 1: rp[0] = up[0]*vl + cl
	mov	%eax, 4(%edi)
	mov	%edx, %eax		C return high limb ...
	adc	%ebp, %eax		C ... plus carry out (ebp is 0 here)
	jmp	L(rt)
1:	add	%eax, %ebx		C ebx = low product + cl
	mov	$0, %ebp
	adc	%edx, %ebp		C ebp = high product + carry
	mov	4(%esi), %eax
	jmp	L(1)

C n = 4k+2: fold cl in, enter at L(2), or at L(cj2) when n = 2.
L(c2):	lea	4(%esi), %esi
	mul	%ecx
	test	%ebp, %ebp
	mov	%ebx, %ebp		C ebp = cl
	jnz	2f
	add	%eax, %ebp		C n = 2: go straight to wind-down
	mov	$0, %ebx
	adc	%edx, %ebx
	mov	(%esi), %eax
	jmp	L(cj2)
2:	add	%eax, %ebp
	mov	$0, %ebx
	adc	%edx, %ebx
	mov	(%esi), %eax
	jmp	L(2)

C n = 4k+3: the counter is pre-incremented because the L(3) entry
C point sits past the point where the loop decrements it.
L(c3):	lea	8(%esi), %esi
	lea	-12(%edi), %edi
	mul	%ecx
	add	%eax, %ebx		C ebx = low product + cl
	mov	$0, %ebp
	adc	%edx, %ebp
	mov	-4(%esi), %eax
	incl	28(%esp)
	jmp	L(3)

C n = 4k (k >= 1): fold cl in and enter at L(0).
L(c0):	mov	36(%esp), %ebx		C cl
	lea	-4(%esi), %esi
	lea	-8(%edi), %edi
	mul	%ecx
	mov	%ebx, %ebp
	add	%eax, %ebp		C ebp = low product + cl
	mov	$0, %ebx
	adc	%edx, %ebx
	mov	8(%esi), %eax
	jmp	L(0)

EPILOGUE()
	ALIGN(16)
C mp_limb_t mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
C rp[0..n-1] = up[0..n-1] * vl; returns the high carry-out limb.
C Stack offsets after the 16-byte save area: 20=rp 24=up 28=n 32=vl.
C The 4-way unrolled loop and wind-down below are shared with
C mpn_mul_1c above, which jumps in at L(0)..L(3), L(cj2), L(rt).
PROLOGUE(mpn_mul_1)
	add	$-16, %esp		C room to save the callee-saved regs
	mov	%ebp, (%esp)
	mov	%ebx, 4(%esp)
	mov	%esi, 8(%esp)
	mov	%edi, 12(%esp)

	mov	20(%esp), %edi		C rp
	mov	24(%esp), %esi		C up
	mov	28(%esp), %ebp		C n
	mov	32(%esp), %ecx		C vl
	mov	%ebp, %ebx
	shr	$2, %ebp		C n/4 = full 4-limb loop rounds
	mov	%ebp, 28(%esp)		C loop counter reuses the n slot
	mov	(%esi), %eax		C up[0]
	and	$3, %ebx		C n mod 4 selects the feed-in path
	jz	L(b0)
	cmp	$2, %ebx
	jz	L(b2)
	jg	L(b3)

C n = 4k+1 feed-in.
L(b1):	lea	-4(%edi), %edi
	mul	%ecx
	test	%ebp, %ebp
	jnz	L(gt1)
	mov	%eax, 4(%edi)		C n = 1: store the single product limb
	mov	%edx, %eax		C return the high limb
	jmp	L(rt)
L(gt1):	mov	%eax, %ebx
	mov	%edx, %ebp
	mov	4(%esi), %eax
	jmp	L(1)

C n = 4k+2 feed-in; n = 2 goes straight to wind-down.
L(b2):	lea	4(%esi), %esi
	mul	%ecx
	test	%ebp, %ebp
	mov	%eax, %ebp
	mov	%edx, %ebx
	mov	(%esi), %eax
	jnz	L(2)
	jmp	L(cj2)

C n = 4k+3 feed-in; counter pre-incremented because the L(3) entry
C point sits past the point where the loop decrements it.
L(b3):	lea	8(%esi), %esi
	lea	-12(%edi), %edi
	mul	%ecx
	mov	%eax, %ebx
	mov	%edx, %ebp
	mov	-4(%esi), %eax
	incl	28(%esp)
	jmp	L(3)

C n = 4k feed-in (k >= 1).
L(b0):	lea	-4(%esi), %esi
	lea	-8(%edi), %edi
	mul	%ecx
	mov	%eax, %ebp
	mov	%edx, %ebx
	mov	8(%esi), %eax
	jmp	L(0)

C Main loop, 4 limbs per round.  ebp and ebx alternate between
C holding the limb being stored and accumulating the next product;
C the carry flag links each add to the following adc.
	ALIGN(16)
L(top):	mov	$0, %ebx
	adc	%edx, %ebx		C fold carry + previous high limb
L(2):	mul	%ecx
	add	%eax, %ebx
	mov	%ebp, 0(%edi)
	mov	4(%esi), %eax
	mov	$0, %ebp
	adc	%edx, %ebp
L(1):	mul	%ecx
	add	%eax, %ebp
	mov	8(%esi), %eax
	mov	%ebx, 4(%edi)
	mov	$0, %ebx
	adc	%edx, %ebx
L(0):	mov	%ebp, 8(%edi)
	mul	%ecx
	add	%eax, %ebx
	mov	12(%esi), %eax
	lea	16(%esi), %esi
	mov	$0, %ebp
	adc	%edx, %ebp
L(3):	mov	%ebx, 12(%edi)
	mul	%ecx
	lea	16(%edi), %edi
	add	%eax, %ebp
	decl	28(%esp)		C decl leaves CF intact for L(top)
	mov	0(%esi), %eax
	jnz	L(top)

C Wind-down: finish the last two limbs and return the carry-out.
L(end):	mov	$0, %ebx
	adc	%edx, %ebx
L(cj2):	mul	%ecx
	add	%eax, %ebx
	mov	%ebp, (%edi)
L(cj1):	mov	%ebx, 4(%edi)
	adc	$0, %edx		C fold the final carry into the high limb
	mov	%edx, %eax		C return value

L(rt):	mov	(%esp), %ebp		C restore callee-saved regs and return
	mov	4(%esp), %ebx
	mov	8(%esp), %esi
	mov	12(%esp), %edi
	add	$16, %esp
	ret
EPILOGUE()
ASM_END()