xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/mul_1.asm (revision a04395531661c5e8d314125d5ae77d4cbedd5d73)
1dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
34C                           cycles/limb
35C P6 model 0-8,10-12		-
36C P6 model 9   (Banias)		4.17
37C P6 model 13  (Dothan)		4.17
38C P4 model 0-1 (Willamette)	4
39C P4 model 2   (Northwood)	4
40C P4 model 3-4 (Prescott)	4.55
41
42C TODO:
43C  * Tweak eax/edx offsets in loop so as to save some lea instructions
44C  * Perhaps software pipeline small-case code
45
46C INPUT PARAMETERS
47C rp		sp + 4
48C up		sp + 8
49C n		sp + 12
50C v0		sp + 16
C carry-in	sp + 20		(read by the mpn_mul_1c entry point only)
51
52	TEXT
53	ALIGN(16)
C mpn_mul_1 -- multiply the n-limb operand at up by the single 32-bit limb
C v0, store the n low limbs of the product at rp and return the most
C significant limb in eax.  Arguments are read from the stack at the offsets
C listed under INPUT PARAMETERS.
C
C Register usage:
C   edx      rp, the destination pointer
C   eax      up, the source pointer; reused for the return value at the end
C   ecx      n, later n mod 4 and then a negative loop counter
C   mm7      v0
C   mm6      64-bit carry accumulator; after each store it is shifted right
C            by 32 so that only the carry into the next limb remains
C   mm0-mm3  limbs and products in flight
C
C NOTE(review): n = 0 is not special-cased.  The small-operand loop runs its
C body before testing the count, so callers presumably guarantee n >= 1.
54PROLOGUE(mpn_mul_1)
C Start with a zero carry.  mpn_mul_1c enters at the ent label below with
C mm6 already loaded, skipping this pxor.
55	pxor	%mm6, %mm6
56L(ent):	mov	4(%esp), %edx
57	mov	8(%esp), %eax
58	mov	12(%esp), %ecx
59	movd	16(%esp), %mm7
C Operands of 4 or more limbs take the unrolled loop; jnc is the unsigned
C "not below" test on n.
60	cmp	$4, %ecx
61	jnc	L(big)
62
C Small operands, n < 4: one limb per iteration.  Both pointers are advanced
C before the store, hence the -4 displacement on the destination.
63L(lp0):	movd	(%eax), %mm0
64	lea	4(%eax), %eax
65	lea	4(%edx), %edx
66	pmuludq	%mm7, %mm0
67	paddq	%mm0, %mm6
68	movd	%mm6, -4(%edx)
69	psrlq	$32, %mm6
70	dec	%ecx
71	jnz	L(lp0)
C Whatever is left in mm6 is the high limb; return it in eax.  emms resets
C the MMX state before returning.
72	movd	%mm6, %eax
73	emms
74	ret
75
C Large operands: dispatch on n mod 4.  The and leaves ZF set for a zero
C remainder; the cmp against 2 then separates 1 (below), 2 (equal) and 3.
76L(big):	and	$3, %ecx
77	je	L(0)
78	cmp	$2, %ecx
79	jc	L(1)
80	je	L(2)
81	jmp	L(3)			C FIXME: one case should fall through
82
C Entry stubs, one per value of n mod 4.  Each stub
C   - loads the first limb and starts the first two multiplies,
C   - turns ecx into (n mod 4) - n, a negative multiple of 4 that serves as
C     the loop counter,
C   - biases eax (and edx where needed) so that the fixed displacements used
C     inside the loop address the right limbs for that entry point, and
C   - loads a third limb and joins the loop at the matching stage.
C
C n mod 4 = 0: join the loop at its last stage.
83L(0):	movd	(%eax), %mm3
84	sub	12(%esp), %ecx		C loop count
85	lea	-16(%eax), %eax
86	lea	-12(%edx), %edx
87	pmuludq	%mm7, %mm3
88	movd	20(%eax), %mm0
89	pmuludq	%mm7, %mm0
90	movd	24(%eax), %mm1
91	jmp	L(00)
92
C n mod 4 = 1: join the loop at its third stage.
93L(1):	movd	(%eax), %mm2
94	sub	12(%esp), %ecx
95	lea	-12(%eax), %eax
96	lea	-8(%edx), %edx
97	pmuludq	%mm7, %mm2
98	movd	16(%eax), %mm3
99	pmuludq	%mm7, %mm3
100	movd	20(%eax), %mm0
101	jmp	L(01)
102
C n mod 4 = 2: join the loop at its second stage.
103L(2):	movd	(%eax), %mm1
104	sub	12(%esp), %ecx
105	lea	-8(%eax), %eax
106	lea	-4(%edx), %edx
107	pmuludq	%mm7, %mm1
108	movd	12(%eax), %mm2
109	pmuludq	%mm7, %mm2
110	movd	16(%eax), %mm3
111	jmp	L(10)
112
C n mod 4 = 3: falls through into the top of the loop.  The first store
C there uses displacement 0, so edx needs no adjustment in this stub.
113L(3):	movd	(%eax), %mm0
114	sub	12(%esp), %ecx
115	lea	-4(%eax), %eax
116	pmuludq	%mm7, %mm0
117	movd	8(%eax), %mm1
118	pmuludq	%mm7, %mm1
119	movd	12(%eax), %mm2
120
C Main loop, four limbs per iteration, software pipelined over four stages
C that rotate through mm0-mm3.  Each stage multiplies the limb loaded by the
C previous stage, adds the product finished two stages earlier into mm6,
C loads the next limb, stores the low 32 bits of mm6 and shifts the carry
C down.
121	ALIGN(16)
122L(top):	pmuludq	%mm7, %mm2
123	paddq	%mm0, %mm6
124	movd	16(%eax), %mm3
125	movd	%mm6, 0(%edx)
126	psrlq	$32, %mm6
127L(10):	pmuludq	%mm7, %mm3
128	paddq	%mm1, %mm6
129	movd	20(%eax), %mm0
130	movd	%mm6, 4(%edx)
131	psrlq	$32, %mm6
132L(01):	pmuludq	%mm7, %mm0
133	paddq	%mm2, %mm6
134	movd	24(%eax), %mm1
135	movd	%mm6, 8(%edx)
136	psrlq	$32, %mm6
137L(00):	pmuludq	%mm7, %mm1
138	paddq	%mm3, %mm6
139	movd	28(%eax), %mm2
140	movd	%mm6, 12(%edx)
141	psrlq	$32, %mm6
C Advance both pointers by four limbs.  ecx counts up in steps of 4 toward
C zero: while it is still negative the add produces no carry and a nonzero
C result, so ja loops; once it reaches zero the loop falls through.
142	lea	16(%eax), %eax
143	lea	16(%edx), %edx
144	add	$4, %ecx
145	ja	L(top)
146
C Drain the pipeline: mm0 and mm1 hold finished products and mm2 holds a
C limb that still has to be multiplied.  Store the last three limbs and
C return the final carry in eax.
147L(end):	pmuludq	%mm7, %mm2
148	paddq	%mm0, %mm6
149	movd	%mm6, 0(%edx)
150	psrlq	$32, %mm6
151	paddq	%mm1, %mm6
152	movd	%mm6, 4(%edx)
153	psrlq	$32, %mm6
154	paddq	%mm2, %mm6
155	movd	%mm6, 8(%edx)
156	psrlq	$32, %mm6
157	movd	%mm6, %eax
158	emms
159	ret
160EPILOGUE()
C mpn_mul_1c -- same operation as mpn_mul_1, with an additional carry-in
C limb read from sp + 20.  The limb is loaded into mm6, the carry
C accumulator, and control joins mpn_mul_1 at its ent label, just past the
C pxor that would otherwise clear mm6.
161PROLOGUE(mpn_mul_1c)
162	movd	20(%esp), %mm6
163	jmp	L(ent)
164EPILOGUE()
165