xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/mul_1.asm (revision 7c192b2a5e1093666e67801684f930ef49b3b363)
dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')

C                           cycles/limb
C P6 model 0-8,10-12		-
C P6 model 9   (Banias)		4.17
C P6 model 13  (Dothan)		4.17
C P4 model 0-1 (Willamette)	4
C P4 model 2   (Northwood)	4
C P4 model 3-4 (Prescott)	4.55

C TODO:
C  * Tweak eax/edx offsets in loop as to save some lea's
C  * Perhaps software pipeline small-case code

C INPUT PARAMETERS (cdecl stack slots)
C rp		sp + 4
C up		sp + 8
C n		sp + 12
C v0		sp + 16
TEXT
	ALIGN(16)
C mp_limb_t mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
C Compute rp[0..n-1] = up[0..n-1] * v0, returning the high carry limb.
C Register roles: edx = rp, eax = up, ecx = counter, mm7 = v0,
C mm6 = running 64-bit accumulator whose low 32 bits are stored and
C whose high 32 bits are the carry into the next limb.
PROLOGUE(mpn_mul_1)
	pxor	%mm6, %mm6		C carry limb = 0
L(ent):	mov	4(%esp), %edx		C edx = rp
	mov	8(%esp), %eax		C eax = up
	mov	12(%esp), %ecx		C ecx = n
	movd	16(%esp), %mm7		C mm7 = v0
	cmp	$4, %ecx
	jnc	L(big)			C n >= 4: pipelined loop

C Small case, n < 4: plain one-limb-per-iteration loop.
L(lp0):	movd	(%eax), %mm0
	lea	4(%eax), %eax
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0		C mm0 = up[i] * v0 (full 64 bits)
	paddq	%mm0, %mm6		C add incoming carry
	movd	%mm6, -4(%edx)		C rp[i] = low 32 bits
	psrlq	$32, %mm6		C carry = high 32 bits
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return final carry
	emms
	ret

C n >= 4: enter the 4-way unrolled loop at the point matching n mod 4.
L(big):	and	$3, %ecx
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)
	je	L(2)
	jmp	L(3)			C FIXME: one case should fall through

C Each entry point starts the first product(s), biases eax/edx so the
C loop body's fixed offsets line up, and sets ecx = (n mod 4) - n,
C a negative multiple of 4 counted up to zero by the loop.
L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	jmp	L(00)

L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	jmp	L(01)

L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	jmp	L(10)

L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2

	ALIGN(16)
C Main loop: 4 limbs per iteration, software pipelined.  Products are
C staged in mm0..mm3; each stage multiplies one limb ahead while adding
C an earlier product into mm6, storing its low word and shifting down
C the carry.
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	16(%eax), %mm3
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm1, %mm6
	movd	20(%eax), %mm0
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm2, %mm6
	movd	24(%eax), %mm1
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm3, %mm6
	movd	28(%eax), %mm2
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax
	lea	16(%edx), %edx
	add	$4, %ecx
	ja	L(top)			C counter counts up toward zero

C Wind down: three products (mm0, mm1, mm2) are still in flight.
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm1, %mm6
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm2, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return final carry
	emms
	ret
EPILOGUE()
C mpn_mul_1c: like mpn_mul_1 but with a fifth stack argument at sp + 20,
C preloaded into mm6 before jumping to the shared body (presumably the
C initial carry limb, per the usual GMP mul_1c convention — confirm
C against gmp.h).
PROLOGUE(mpn_mul_1c)
	movd	20(%esp), %mm6		C seed accumulator instead of zeroing it
	jmp	L(ent)			C tail-merge into mpn_mul_1
EPILOGUE()
154