xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/mul_1.asm (revision 1897181a7231d5fc7ab48994d1447fcbc4e13a49)
1dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3dnl  Copyright 2005, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20
21include(`../config.m4')
22
23C TODO:
24C  * Tweak eax/edx offsets in the loop so as to save some lea instructions
25C  * Perhaps software pipeline small-case code
26
27C                           cycles/limb
28C P6 model 0-8,10-12            -
29C P6 model 9   (Banias)		?
30C P6 model 13  (Dothan)         4.17
31C P4 model 0-1 (Willamette):	4
32C P4 model 2   (Northwood):     4
33C P4 model 3-4 (Prescott):      4.55
34
35C INPUT PARAMETERS
36C rp		sp + 4
37C up		sp + 8
38C n		sp + 12
39C v0		sp + 16
C carry-in	sp + 20   (read by mpn_mul_1c only)
40
41	TEXT
42	ALIGN(16)
C mpn_mul_1c: variant entry point that takes a fifth stack argument, a
C carry-in word at 20(%esp).  It loads the same four arguments as mpn_mul_1,
C preloads the carry accumulator mm6 from the fifth argument instead of
C zeroing it, and then joins the shared code at L(ent) inside mpn_mul_1.
43PROLOGUE(mpn_mul_1c)
44	mov	4(%esp), %edx		C edx = rp
45	mov	8(%esp), %eax		C eax = up
46	mov	12(%esp), %ecx		C ecx = n
47	movd	16(%esp), %mm7		C mm7 = v0, the multiplier
48	movd	20(%esp), %mm6		C mm6 = carry-in, added into the first product
49	jmp	L(ent)			C continue in the common body of mpn_mul_1
50EPILOGUE()
51	ALIGN(16)
C mpn_mul_1: multiply the n 32-bit words at up by the 32-bit word v0, store
C the n low result words at rp, and return the final carry word in eax.
C
C Register use, as set up by the loads below:
C   edx		rp, destination pointer
C   eax		up, source pointer; receives the carry-out just before ret
C   ecx		n, later a negative count that steps up to zero
C   mm7		v0, the multiplier
C   mm6		running 64-bit sum; its low 32 bits are stored, then it is
C		shifted right by 32 so the high part carries into the next word
C   mm0-mm3	source words and their products in flight
C
C Counts below 4 use the simple loop at L(lp0).  Larger counts dispatch on
C n mod 4 to one of four feed-in sequences, each of which enters a 4-way
C unrolled, software-pipelined loop at the matching stage.
52PROLOGUE(mpn_mul_1)
53	mov	4(%esp), %edx		C edx = rp
54	mov	8(%esp), %eax		C eax = up
55	mov	12(%esp), %ecx		C ecx = n
56	movd	16(%esp), %mm7		C mm7 = v0
57	pxor	%mm6, %mm6		C no carry-in: start with mm6 = 0
C Shared entry: mpn_mul_1c jumps here with mm6 holding its carry-in.
58L(ent):	cmp	$4, %ecx
59	jnc	L(big)			C n >= 4 (unsigned): use the unrolled loop
60
C Small case, one word per iteration.
C NOTE(review): the count is tested only after the body has run, so n = 0 is
C not handled here; presumably callers guarantee n >= 1 -- confirm.
61L(lp0):	movd	(%eax), %mm0		C next source word
62	lea	4(%eax), %eax		C advance pointers; lea leaves flags alone
63	lea	4(%edx), %edx
64	pmuludq	%mm7, %mm0		C 32x32 -> 64 bit product
65	paddq	%mm0, %mm6		C add to carry; product + 32-bit carry < 2^64
66	movd	%mm6, -4(%edx)		C store low word (edx already advanced)
67	psrlq	$32, %mm6		C keep high word as the next carry
68	dec	%ecx
69	jnz	L(lp0)
70	movd	%mm6, %eax		C return value = final carry word
71	emms				C clear MMX state before returning
72	ret
73
C Dispatch on n mod 4.  After the and, ecx holds n mod 4; each feed-in block
C then subtracts n (re-read from the stack) to form the loop counter.
74L(big):	and	$3, %ecx
75	je	L(0)			C n mod 4 == 0
76	cmp	$2, %ecx
77	jc	L(1)			C n mod 4 == 1
78	je	L(2)			C n mod 4 == 2
79	jmp	L(3)			C FIXME: one case should fall through
80
C Each feed-in block below: loads the first three source words, multiplies
C the first two, biases eax/edx so the fixed offsets used by its loop entry
C stage address the right words, and sets ecx = (n mod 4) - n, a negative
C multiple of 4.
C
C n mod 4 == 0: first word in mm3, enter at stage L(00).
81L(0):	movd	(%eax), %mm3		C first source word
82	sub	12(%esp), %ecx		C loop count: ecx = (n mod 4) - n
83	lea	-16(%eax), %eax		C bias so 20(%eax) is the second word
84	lea	-12(%edx), %edx		C bias so 12(%edx) is the first result word
85	pmuludq	%mm7, %mm3
86	movd	20(%eax), %mm0		C second source word
87	pmuludq	%mm7, %mm0
88	movd	24(%eax), %mm1		C third source word, multiplied at L(00)
89	jmp	L(00)
90
C n mod 4 == 1: first word in mm2, enter at stage L(01).
91L(1):	movd	(%eax), %mm2		C first source word
92	sub	12(%esp), %ecx		C ecx = (n mod 4) - n
93	lea	-12(%eax), %eax		C bias so 16(%eax) is the second word
94	lea	-8(%edx), %edx		C bias so 8(%edx) is the first result word
95	pmuludq	%mm7, %mm2
96	movd	16(%eax), %mm3		C second source word
97	pmuludq	%mm7, %mm3
98	movd	20(%eax), %mm0		C third source word, multiplied at L(01)
99	jmp	L(01)
100
C n mod 4 == 2: first word in mm1, enter at stage L(10).
101L(2):	movd	(%eax), %mm1		C first source word
102	sub	12(%esp), %ecx		C ecx = (n mod 4) - n
103	lea	-8(%eax), %eax		C bias so 12(%eax) is the second word
104	lea	-4(%edx), %edx		C bias so 4(%edx) is the first result word
105	pmuludq	%mm7, %mm1
106	movd	12(%eax), %mm2		C second source word
107	pmuludq	%mm7, %mm2
108	movd	16(%eax), %mm3		C third source word, multiplied at L(10)
109	jmp	L(10)
110
C n mod 4 == 3: first word in mm0, falls through into L(top); edx needs no
C bias because the first stage stores at 0(%edx).
111L(3):	movd	(%eax), %mm0		C first source word
112	sub	12(%esp), %ecx		C ecx = (n mod 4) - n
113	lea	-4(%eax), %eax		C bias so 8(%eax) is the second word
114	pmuludq	%mm7, %mm0
115	movd	8(%eax), %mm1		C second source word
116	pmuludq	%mm7, %mm1
117	movd	12(%eax), %mm2		C third source word, multiplied at L(top)
118
C Main loop: four stages per iteration, one result word per stage.  Each
C stage multiplies the word loaded by the previous stage, adds the product
C formed two stages earlier into mm6, loads the next source word, stores the
C low word of mm6 and shifts the high word down as the carry.
119	ALIGN(16)
120L(top):	pmuludq	%mm7, %mm2
121	paddq	%mm0, %mm6
122	movd	16(%eax), %mm3
123	movd	%mm6, 0(%edx)
124	psrlq	$32, %mm6
125L(10):	pmuludq	%mm7, %mm3
126	paddq	%mm1, %mm6
127	movd	20(%eax), %mm0
128	movd	%mm6, 4(%edx)
129	psrlq	$32, %mm6
130L(01):	pmuludq	%mm7, %mm0
131	paddq	%mm2, %mm6
132	movd	24(%eax), %mm1
133	movd	%mm6, 8(%edx)
134	psrlq	$32, %mm6
135L(00):	pmuludq	%mm7, %mm1
136	paddq	%mm3, %mm6
137	movd	28(%eax), %mm2
138	movd	%mm6, 12(%edx)
139	psrlq	$32, %mm6
140	lea	16(%eax), %eax		C advance both pointers by four words
141	lea	16(%edx), %edx
142	add	$4, %ecx		C count up toward zero
143	ja	L(top)			C loop until the add reaches zero (ZF set)
144
C Wind-down: three words are still in flight.  mm0 and mm1 already hold
C products; mm2 holds a loaded word that still has to be multiplied.
145L(end):	pmuludq	%mm7, %mm2
146	paddq	%mm0, %mm6
147	movd	%mm6, 0(%edx)
148	psrlq	$32, %mm6
149	paddq	%mm1, %mm6
150	movd	%mm6, 4(%edx)
151	psrlq	$32, %mm6
152	paddq	%mm2, %mm6
153	movd	%mm6, 8(%edx)		C last result word
154	psrlq	$32, %mm6
155	movd	%mm6, %eax		C return value = final carry word
156	emms				C clear MMX state before returning
157	ret
158EPILOGUE()
159