xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/addmul_1.asm (revision 413d532bcc3f62d122e56d92e13ac64825a40baf)
1dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20
21include(`../config.m4')
22
23C			    cycles/limb
24C P6 model 0-8,10-12		-
25C P6 model 9   (Banias)		5.24
26C P6 model 13  (Dothan)		5.24
27C P4 model 0-1 (Willamette)	5
28C P4 model 2   (Northwood)	5
29C P4 model 3-4 (Prescott)	5
30
31C TODO:
32C  * Tweak eax/edx offsets in loop as to save some lea's
33C  * Perhaps software pipeline small-case code
34
35C INPUT PARAMETERS
36C rp		sp + 4
37C up		sp + 8
38C n		sp + 12
39C v0		sp + 16
40
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_1)

C mp_limb_t mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
C
C Compute rp[i] += up[i] * v0 for i = 0 .. n-1 and return the final
C carry-out limb.  Each 32x32->64-bit product is formed with pmuludq;
C mm6 is a 64-bit accumulator whose low 32 bits are the limb to store
C and whose high 32 bits propagate as the carry into the next position.
C
C Register roles:
C   edx  rp          eax  up          ecx  n / negated loop counter
C   mm7  v0          mm6  carry accumulator (see above)
C   mm0-mm5  software-pipeline staging for products and rp limbs

	pxor	%mm6, %mm6		C carry-in = 0 (mpn_addmul_1c seeds this instead)
L(ent):	mov	4(%esp), %edx		C rp
	mov	8(%esp), %eax		C up
	mov	12(%esp), %ecx		C n
	movd	16(%esp), %mm7		C v0
	cmp	$4, %ecx
	jnc	L(big)			C n >= 4: use the unrolled pipeline

C Small case, n < 4: plain one-limb-per-iteration loop.
L(lp0):	movd	(%eax), %mm0		C mm0 = up[i]
	lea	4(%eax), %eax
	movd	(%edx), %mm4		C mm4 = rp[i]
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0		C mm0 = up[i] * v0  (64 bits)
	paddq	%mm0, %mm4		C + rp[i]
	paddq	%mm4, %mm6		C + carry
	movd	%mm6, -4(%edx)		C rp[i] = low 32 bits
	psrlq	$32, %mm6		C carry = high 32 bits
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return carry-out limb
	emms
	ret

C n >= 4: dispatch on n mod 4 to one of four feed-in stubs.  Each stub
C primes the multiply/add pipeline, biases the pointers so the main loop
C can use fixed displacements, and jumps to the matching entry point
C inside the 4-way unrolled loop.
L(big):	and	$3, %ecx		C ecx = n mod 4
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)
	je	L(2)
	jmp	L(3)			C FIXME: one case should fall through

C Feed-in for n mod 4 == 0.
L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count: (n mod 4) - n = -4*floor(n/4)
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	movd	12(%edx), %mm5
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	jmp	L(00)

C Feed-in for n mod 4 == 1.
L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx		C loop count (negated multiple of 4, as above)
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	movd	8(%edx), %mm4
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	jmp	L(01)

C Feed-in for n mod 4 == 2.
L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx		C loop count (negated multiple of 4, as above)
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	jmp	L(10)

C Feed-in for n mod 4 == 3 (falls straight into L(top)).
L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx		C loop count (negated multiple of 4, as above)
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	paddq	%mm0, %mm4
	movd	4(%edx), %mm5

	ALIGN(16)
C Main 4-way unrolled, software-pipelined loop.  Each 7-instruction
C group starts one new product, finishes one limb two stages later
C (add rp limb, add carry, store low half, shift carry down), and
C prefetches the up/rp limbs needed further on.
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)		C store result limb
	psrlq	$32, %mm6		C keep carry
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm5, %mm6
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm4, %mm6
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm5, %mm6
	movd	28(%eax), %mm2
	paddq	%mm0, %mm4
	movd	20(%edx), %mm5
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax
	lea	16(%edx), %edx
	add	$4, %ecx		C counter is negative; hits zero when done
	jnz	L(top)

C Wind-down: drain the three limbs still in flight, then return the
C final carry.
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm5, %mm6
	paddq	%mm2, %mm4
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm4, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return carry-out limb
	emms
	ret
EPILOGUE()
PROLOGUE(mpn_addmul_1c)
C mp_limb_t mpn_addmul_1c (mp_ptr rp, mp_srcptr up, mp_size_t n,
C                          mp_limb_t v0, mp_limb_t carry)
C Same as mpn_addmul_1 but with an initial carry-in limb: seed the mm6
C carry accumulator from the 5th stack argument, then share all of
C mpn_addmul_1's code via its L(ent) entry point (which does not touch
C mm6 before the first accumulation).
	movd	20(%esp), %mm6		C mm6 = carry-in
	jmp	L(ent)
EPILOGUE()
179