xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/addmul_1.asm (revision d90047b5d07facf36e6c01dcc0bded8997ce9cc2)
1dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
34C			    cycles/limb
35C P6 model 0-8,10-12		-
36C P6 model 9   (Banias)		5.24
37C P6 model 13  (Dothan)		5.24
38C P4 model 0-1 (Willamette)	5
39C P4 model 2   (Northwood)	5
40C P4 model 3-4 (Prescott)	5
41
42C TODO:
43C  * Tweak eax/edx offsets in loop as to save some lea's
44C  * Perhaps software pipeline small-case code
45
46C INPUT PARAMETERS
47C rp		sp + 4
48C up		sp + 8
49C n		sp + 12
50C v0		sp + 16
51
52	TEXT
53	ALIGN(16)
54PROLOGUE(mpn_addmul_1)
55	pxor	%mm6, %mm6
56L(ent):	mov	4(%esp), %edx
57	mov	8(%esp), %eax
58	mov	12(%esp), %ecx
59	movd	16(%esp), %mm7
60	cmp	$4, %ecx
61	jnc	L(big)
62
63L(lp0):	movd	(%eax), %mm0
64	lea	4(%eax), %eax
65	movd	(%edx), %mm4
66	lea	4(%edx), %edx
67	pmuludq	%mm7, %mm0
68	paddq	%mm0, %mm4
69	paddq	%mm4, %mm6
70	movd	%mm6, -4(%edx)
71	psrlq	$32, %mm6
72	dec	%ecx
73	jnz	L(lp0)
74	movd	%mm6, %eax
75	emms
76	ret
77
78L(big):	and	$3, %ecx
79	je	L(0)
80	cmp	$2, %ecx
81	jc	L(1)
82	je	L(2)
83	jmp	L(3)			C FIXME: one case should fall through
84
85L(0):	movd	(%eax), %mm3
86	sub	12(%esp), %ecx		C loop count
87	lea	-16(%eax), %eax
88	lea	-12(%edx), %edx
89	pmuludq	%mm7, %mm3
90	movd	20(%eax), %mm0
91	movd	12(%edx), %mm5
92	pmuludq	%mm7, %mm0
93	movd	24(%eax), %mm1
94	paddq	%mm3, %mm5
95	movd	16(%edx), %mm4
96	jmp	L(00)
97
98L(1):	movd	(%eax), %mm2
99	sub	12(%esp), %ecx
100	lea	-12(%eax), %eax
101	lea	-8(%edx), %edx
102	movd	8(%edx), %mm4
103	pmuludq	%mm7, %mm2
104	movd	16(%eax), %mm3
105	pmuludq	%mm7, %mm3
106	movd	20(%eax), %mm0
107	paddq	%mm2, %mm4
108	movd	12(%edx), %mm5
109	jmp	L(01)
110
111L(2):	movd	(%eax), %mm1
112	sub	12(%esp), %ecx
113	lea	-8(%eax), %eax
114	lea	-4(%edx), %edx
115	pmuludq	%mm7, %mm1
116	movd	12(%eax), %mm2
117	movd	4(%edx), %mm5
118	pmuludq	%mm7, %mm2
119	movd	16(%eax), %mm3
120	paddq	%mm1, %mm5
121	movd	8(%edx), %mm4
122	jmp	L(10)
123
124L(3):	movd	(%eax), %mm0
125	sub	12(%esp), %ecx
126	lea	-4(%eax), %eax
127	pmuludq	%mm7, %mm0
128	movd	8(%eax), %mm1
129	movd	(%edx), %mm4
130	pmuludq	%mm7, %mm1
131	movd	12(%eax), %mm2
132	paddq	%mm0, %mm4
133	movd	4(%edx), %mm5
134
135	ALIGN(16)
136L(top):	pmuludq	%mm7, %mm2
137	paddq	%mm4, %mm6
138	movd	16(%eax), %mm3
139	paddq	%mm1, %mm5
140	movd	8(%edx), %mm4
141	movd	%mm6, 0(%edx)
142	psrlq	$32, %mm6
143L(10):	pmuludq	%mm7, %mm3
144	paddq	%mm5, %mm6
145	movd	20(%eax), %mm0
146	paddq	%mm2, %mm4
147	movd	12(%edx), %mm5
148	movd	%mm6, 4(%edx)
149	psrlq	$32, %mm6
150L(01):	pmuludq	%mm7, %mm0
151	paddq	%mm4, %mm6
152	movd	24(%eax), %mm1
153	paddq	%mm3, %mm5
154	movd	16(%edx), %mm4
155	movd	%mm6, 8(%edx)
156	psrlq	$32, %mm6
157L(00):	pmuludq	%mm7, %mm1
158	paddq	%mm5, %mm6
159	movd	28(%eax), %mm2
160	paddq	%mm0, %mm4
161	movd	20(%edx), %mm5
162	movd	%mm6, 12(%edx)
163	psrlq	$32, %mm6
164	lea	16(%eax), %eax
165	lea	16(%edx), %edx
166	add	$4, %ecx
167	jnz	L(top)
168
169L(end):	pmuludq	%mm7, %mm2
170	paddq	%mm4, %mm6
171	paddq	%mm1, %mm5
172	movd	8(%edx), %mm4
173	movd	%mm6, 0(%edx)
174	psrlq	$32, %mm6
175	paddq	%mm5, %mm6
176	paddq	%mm2, %mm4
177	movd	%mm6, 4(%edx)
178	psrlq	$32, %mm6
179	paddq	%mm4, %mm6
180	movd	%mm6, 8(%edx)
181	psrlq	$32, %mm6
182	movd	%mm6, %eax
183	emms
184	ret
185EPILOGUE()
C -----------------------------------------------------------------------
C mp_limb_t mpn_addmul_1c (mp_ptr rp, mp_srcptr up, mp_size_t n,
C                          mp_limb_t v0, mp_limb_t carry)
C Same as mpn_addmul_1 but with an initial carry-in (5th stack arg):
C load it into the mm6 accumulator and reuse the shared body at L(ent),
C which mpn_addmul_1 reaches with mm6 cleared instead.
C -----------------------------------------------------------------------
PROLOGUE(mpn_addmul_1c)
	movd	20(%esp), %mm6		C carry-in seeds the accumulator
	jmp	L(ent)
EPILOGUE()
190