dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 ?	  no, use shl/shr
C AMD K10	 1.8-2.0	 1.8-2.0	 ?	  yes
C AMD bd1	 1.9		 1.9		 ?	  yes
C AMD bobcat	 3.67		 3.67		 ?	  yes, bad for n < 20
C Intel P4	 4.75		 4.75		 ?	  no, slow movdqu
C Intel core2	 2.27		 2.27		 ?	  no, use shld/shrd
C Intel NHM	 2.15		 2.15		 ?	  no, use shld/shrd
C Intel SBR	 1.45		 1.45		 ?	  yes, bad for n = 4-6
C Intel atom	12.9		12.9		 ?	  no
C VIA nano	 6.18		 6.44		 ?	  no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses them even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve the basecase code, using shld/shrd for SBR, and discrete integer
C    shifts for the other affected CPUs.

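C For reference only (not assembled), the operation performed here is roughly
C the C sketch below.  The name ref_lshiftc is made up for illustration; a
C 64-bit limb and 0 < cnt < 64 are assumed, matching this file:
C
C	mp_limb_t
C	ref_lshiftc (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
C	{
C	  mp_limb_t retval = ap[n - 1] >> (64 - cnt);	/* bits shifted out at the top */
C	  mp_size_t i;
C	  for (i = n - 1; i > 0; i--)			/* high-to-low, as the code below */
C	    rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> (64 - cnt)));
C	  rp[0] = ~(ap[0] << cnt);			/* zeros shifted into the low limb */
C	  return retval;
C	}
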
C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshiftc)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5
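C xmm4 = cnt and xmm5 = 64-cnt; these are the shift counts used by the
C psllq/psrlq pairs throughout.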

	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax
	shr	R8(%rcx), %rax
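C rax now holds the return value, i.e. the cnt bits shifted out of the top
C limb: after the neg, the count in %cl is congruent to 64-cnt modulo 64,
C which is all shr looks at.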

	pcmpeqb	%xmm3, %xmm3		C set to 111...111

	cmp	$3, n
	jle	L(bc)

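C The main loop stores aligned 16-byte chunks, working downwards from the high
C end of rp, so it is rp+8n that needs to be 16-byte aligned.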
	lea	(rp,n,8), R32(%rcx)
	bt	$3, R32(%rcx)
	jnc	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
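C Dispatch on (n+1) mod 8 into the 8-limbs-per-iteration loop below, entering
C it part way through the first pass so that, once the loop falls through,
C only the last one or two low limbs are left for the wind-down at L(end).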
	lea	1(n), %r8d

	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)

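C Main loop: each iteration processes 8 limbs.  Every aligned 16-byte store is
C built from two unaligned 16-byte loads offset by one limb, combined with
C psllq/psrlq/por and complemented with pxor against the all-ones mask in xmm3.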
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

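C Wind-down: n is now negative, and its parity says whether one limb (odd) or
C two limbs (even) remain at the bottom of the operands.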
L(end):	bt	$0, R32(n)
	jc	L(end8)

	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0
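C xmm0 is now { 0, ap[0] } (low, high): pairing a zero below ap[0] makes the
C low result qword come out as just ap[0]<<cnt after the shifts and por, while
C the high qword gets the usual two-limb combination.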
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase
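C Operands of n <= 3 limbs are handled here with plain 8-byte operations,
C one limb at a time from the top down, ending with the low limb at L(end8).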
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, 8(rp)

L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()