xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/fastsse/lshiftc.asm (revision 6a493d6bc668897c91594964a732d38505b70cbb)
1dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
2
3dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.
4
5dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24
25C	     cycles/limb	     cycles/limb	      good
26C          16-byte aligned         16-byte unaligned	    for cpu?
27C AMD K8,K9	 ?			 ?
28C AMD K10	 1.85  (1.635)		 1.9   (1.67)		Y
29C AMD bd1	 1.82  (1.75)		 1.82  (1.75)		Y
30C AMD bobcat	 4.5			 4.5
31C Intel P4	 3.6   (3.125)		 3.6   (3.125)		Y
32C Intel core2	 2.05  (1.67)		 2.55  (1.75)
33C Intel NHM	 2.05  (1.875)		 2.6   (2.25)
34C Intel SBR	 1.55  (1.44)		 2     (1.57)		Y
35C Intel atom	 ?			 ?
36C VIA nano	 2.5   (2.5)		 2.5   (2.5)		Y
37
38C We try to do as many 16-byte operations as possible.  The top-most and
39C bottom-most writes might need 8-byte operations.  We always write using
40C 16-byte operations, we read with both 8-byte and 16-byte operations.
41
42C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
43C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
44C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
45
46C This is not yet great code:
47C   (1) The unaligned case makes too many reads.
48C   (2) We should do some unrolling, at least 2-way.
49C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
50C Nano.
51
C INPUT PARAMETERS
C rp    destination limb pointer			(1st C argument)
C ap    source limb pointer				(2nd C argument)
C n     limb count; code reads ap[n-1] so n >= 1	(3rd C argument)
C cnt   shift count; assumes 1 <= cnt <= 63 per GMP	(4th C argument)
C       convention -- cnt = 0 would make the shr below
C       degenerate (shift count taken mod 64)
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

C mp_limb_t mpn_lshiftc (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned cnt)
C
C Shift {ap,n} left by cnt bits, complement the result, and store it at
C {rp,n}.  The return value (%rax) is the high bits shifted out of ap[n-1],
C NOT complemented (no pxor is applied to it below).
C
C Register roles throughout:
C   xmm4 = cnt           (left-shift count)
C   xmm5 = 64-cnt        (complementary right-shift count)
C   xmm7 = all ones      (pxor with it performs the complement)
C   rax  = return value, computed up front

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshiftc)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)	C eax = 64-cnt
	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt

	neg	R32(%rcx)		C low 6 bits of %cl are now 64-cnt
	mov	-8(ap,n,8), %rax	C rax = ap[n-1]
	shr	R8(%rcx), %rax		C rax = ap[n-1] >> (64-cnt) = retval

	pcmpeqb	%xmm7, %xmm7		C set to 111...111

	cmp	$2, n
	jle	L(le2)			C n = 1 and n = 2 handled specially

	lea	(rp,n,8), R32(%rcx)	C address just past rp[n-1]
	test	$8, R8(%rcx)		C is the 16-byte store target aligned?
	je	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0	C xmm0 = ap[n-1]
	movq	-16(ap,n,8), %xmm1	C xmm1 = ap[n-2]
	psllq	%xmm4, %xmm0		C ap[n-1] << cnt
	psrlq	%xmm5, %xmm1		C ap[n-2] >> (64-cnt)
	por	%xmm1, %xmm0		C combine into shifted limb
	pxor	%xmm7, %xmm0		C complement
	movq	%xmm0, -8(rp,n,8)	C rp[n-1] done with an 8-byte store
	dec	n

L(rp_aligned):
	lea	(ap,n,8), R32(%rcx)
	test	$8, R8(%rcx)		C pick loop by ap alignment (rp is
	je	L(aent)			C 16-aligned here, so this tests
	jmp	L(uent)			C whether ap = rp (mod 16))
C *****************************************************************************

C Handle the case when ap != rp (mod 16).

	ALIGN(16)
L(utop):movq	(ap,n,8), %xmm1		C assemble the two source qwords
	punpcklqdq  8(ap,n,8), %xmm1	C with an extra (unaligned-safe) read
	movdqa	-8(ap,n,8), %xmm0	C 16-byte read, aligned since ap != rp (mod 16)
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0		C complement
	movdqa	%xmm0, (rp,n,8)		C aligned 16-byte store of two limbs
L(uent):sub	$2, n
	ja	L(utop)

	jne	L(end8)			C n was odd: one low limb remains

C Even-n wind-down: produce rp[1],rp[0] with one aligned 16-byte store.
	movq	(ap), %xmm1		C xmm1 = ap[0]
	pxor	%xmm0, %xmm0		C zeros shift in below ap[0]
	punpcklqdq  %xmm1, %xmm0	C xmm0 = B*ap[0] + 0
	punpcklqdq  8(ap), %xmm1	C xmm1 = B*ap[1] + ap[0]
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0		C complement
	movdqa	%xmm0, (rp)
	ret
C *****************************************************************************

C Handle the case when ap = rp (mod 16).

	ALIGN(16)
L(atop):movdqa	(ap,n,8), %xmm0		C xmm0 = B*ap[n-1] + ap[n-2]
	movq	-8(ap,n,8), %xmm1	C xmm1 = ap[n-3]
	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[n-2] + ap[n-3]
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0		C complement
	movdqa	%xmm0, (rp,n,8)		C aligned 16-byte store of two limbs
L(aent):sub	$2, n
	ja	L(atop)

	jne	L(end8)			C n was odd: one low limb remains

C Even-n wind-down, aligned variant.
	movdqa	(ap), %xmm0		C xmm0 = B*ap[1] + ap[0]
	pxor	%xmm1, %xmm1		C zeros shift in below ap[0]
	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[0] + 0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0		C complement
	movdqa	%xmm0, (rp)
	ret
C *****************************************************************************

	ALIGN(16)
L(le2):	jne	L(end8)			C ZF clear here means n = 1

C n = 2: rp[1] = ~((ap[1] << cnt) | (ap[0] >> (64-cnt))), then fall
C through to L(end8) for rp[0].
	movq	8(ap), %xmm0
	movq	(ap), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0		C complement
	movq	%xmm0, 8(rp)

L(end8):movq	(ap), %xmm0		C lowest limb: zeros shift in from below
	psllq	%xmm4, %xmm0
	pxor	%xmm7, %xmm0		C rp[0] = ~(ap[0] << cnt)
	movq	%xmm0, (rp)
	ret
EPILOGUE()
169