xref: /llvm-project/llvm/test/CodeGen/X86/vector-fshr-sub128.ll (revision dd8e1adbf22f9b84e9fc5ed65530df55a3c3b693)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512BW
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VLBW
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
14
15; Just one 32-bit run to make sure we do reasonable things for i64 cases.
16; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2
17
18declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
19
20;
21; Variable Shifts
22;
23
24define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind {
; Per-lane variable-amount funnel shift right, widened from v2i32 to v4i32.
; Effective op per lane: or(shl(x, 1+((~amt)&31)... via paddd doubling), lshr(y, amt&31)).
; NOTE(review): all CHECK lines below are autogenerated (update_llc_test_checks.py);
; regenerate with that script instead of hand-editing. Lowering strategies seen:
;  - SSE2/X86-SSE2: split lshr of %y into four scalar-count psrld ops, and build
;    the variable shl of %x as a pmuludq multiply by 2^k (pslld $23 + cvttps2dq trick).
;  - SSE41/AVX1: same split-shift idea but recombined with pblendw, shl via pmulld.
;  - AVX2/AVX512F/VL/BW/VLBW/XOPAVX2: native per-lane vpsrlvd/vpsllvd.
;  - AVX512VBMI2: zmm-widened vpshrdvd; AVX512VLVBMI2: direct xmm vpshrdvd.
;  - XOPAVX1: XOP vpshld with negated amounts for the right shift.
25; SSE2-LABEL: var_funnnel_v2i32:
26; SSE2:       # %bb.0:
27; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [31,31,31,31]
28; SSE2-NEXT:    movdqa %xmm2, %xmm5
29; SSE2-NEXT:    pand %xmm4, %xmm5
30; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
31; SSE2-NEXT:    movdqa %xmm1, %xmm6
32; SSE2-NEXT:    psrld %xmm3, %xmm6
33; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
34; SSE2-NEXT:    movdqa %xmm1, %xmm3
35; SSE2-NEXT:    psrld %xmm7, %xmm3
36; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
37; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
38; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
39; SSE2-NEXT:    movdqa %xmm1, %xmm7
40; SSE2-NEXT:    psrld %xmm6, %xmm7
41; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
42; SSE2-NEXT:    psrld %xmm5, %xmm1
43; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
44; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
45; SSE2-NEXT:    pandn %xmm4, %xmm2
46; SSE2-NEXT:    pslld $23, %xmm2
47; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
48; SSE2-NEXT:    cvttps2dq %xmm2, %xmm1
49; SSE2-NEXT:    paddd %xmm0, %xmm0
50; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
51; SSE2-NEXT:    pmuludq %xmm1, %xmm0
52; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
53; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
54; SSE2-NEXT:    pmuludq %xmm2, %xmm1
55; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
56; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
57; SSE2-NEXT:    por %xmm3, %xmm0
58; SSE2-NEXT:    retq
59;
60; SSE41-LABEL: var_funnnel_v2i32:
61; SSE41:       # %bb.0:
62; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm3 = [31,31,31,31]
63; SSE41-NEXT:    movdqa %xmm2, %xmm4
64; SSE41-NEXT:    pand %xmm3, %xmm4
65; SSE41-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
66; SSE41-NEXT:    movdqa %xmm1, %xmm6
67; SSE41-NEXT:    psrld %xmm5, %xmm6
68; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
69; SSE41-NEXT:    pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
70; SSE41-NEXT:    movdqa %xmm1, %xmm8
71; SSE41-NEXT:    psrld %xmm7, %xmm8
72; SSE41-NEXT:    pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
73; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
74; SSE41-NEXT:    movdqa %xmm1, %xmm6
75; SSE41-NEXT:    psrld %xmm4, %xmm6
76; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
77; SSE41-NEXT:    psrld %xmm4, %xmm1
78; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
79; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
80; SSE41-NEXT:    pandn %xmm3, %xmm2
81; SSE41-NEXT:    pslld $23, %xmm2
82; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
83; SSE41-NEXT:    cvttps2dq %xmm2, %xmm1
84; SSE41-NEXT:    paddd %xmm0, %xmm0
85; SSE41-NEXT:    pmulld %xmm1, %xmm0
86; SSE41-NEXT:    por %xmm6, %xmm0
87; SSE41-NEXT:    retq
88;
89; AVX1-LABEL: var_funnnel_v2i32:
90; AVX1:       # %bb.0:
91; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
92; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
93; AVX1-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
94; AVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
95; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm6
96; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
97; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
98; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
99; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
100; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
101; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
102; AVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
103; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
104; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
105; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
106; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
107; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
108; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
109; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
110; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
111; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
112; AVX1-NEXT:    retq
113;
114; AVX2-LABEL: var_funnnel_v2i32:
115; AVX2:       # %bb.0:
116; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
117; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
118; AVX2-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
119; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
120; AVX2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
121; AVX2-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
122; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
123; AVX2-NEXT:    retq
124;
125; AVX512F-LABEL: var_funnnel_v2i32:
126; AVX512F:       # %bb.0:
127; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
128; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
129; AVX512F-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
130; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
131; AVX512F-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
132; AVX512F-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
133; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
134; AVX512F-NEXT:    retq
135;
136; AVX512VL-LABEL: var_funnnel_v2i32:
137; AVX512VL:       # %bb.0:
138; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
139; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
140; AVX512VL-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
141; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
142; AVX512VL-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
143; AVX512VL-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
144; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
145; AVX512VL-NEXT:    retq
146;
147; AVX512BW-LABEL: var_funnnel_v2i32:
148; AVX512BW:       # %bb.0:
149; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
150; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
151; AVX512BW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
152; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
153; AVX512BW-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
154; AVX512BW-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
155; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
156; AVX512BW-NEXT:    retq
157;
158; AVX512VLBW-LABEL: var_funnnel_v2i32:
159; AVX512VLBW:       # %bb.0:
160; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
161; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
162; AVX512VLBW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
163; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
164; AVX512VLBW-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
165; AVX512VLBW-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
166; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
167; AVX512VLBW-NEXT:    retq
168;
169; AVX512VBMI2-LABEL: var_funnnel_v2i32:
170; AVX512VBMI2:       # %bb.0:
171; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
172; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
173; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
174; AVX512VBMI2-NEXT:    vpshrdvd %zmm2, %zmm0, %zmm1
175; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
176; AVX512VBMI2-NEXT:    vzeroupper
177; AVX512VBMI2-NEXT:    retq
178;
179; AVX512VLVBMI2-LABEL: var_funnnel_v2i32:
180; AVX512VLVBMI2:       # %bb.0:
181; AVX512VLVBMI2-NEXT:    vpshrdvd %xmm2, %xmm0, %xmm1
182; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
183; AVX512VLVBMI2-NEXT:    retq
184;
185; XOPAVX1-LABEL: var_funnnel_v2i32:
186; XOPAVX1:       # %bb.0:
187; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
188; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
189; XOPAVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
190; XOPAVX1-NEXT:    vpshld %xmm4, %xmm0, %xmm0
191; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
192; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
193; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
194; XOPAVX1-NEXT:    vpshld %xmm2, %xmm1, %xmm1
195; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
196; XOPAVX1-NEXT:    retq
197;
198; XOPAVX2-LABEL: var_funnnel_v2i32:
199; XOPAVX2:       # %bb.0:
200; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
201; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
202; XOPAVX2-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
203; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
204; XOPAVX2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
205; XOPAVX2-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
206; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
207; XOPAVX2-NEXT:    retq
208;
209; X86-SSE2-LABEL: var_funnnel_v2i32:
210; X86-SSE2:       # %bb.0:
211; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [31,31,31,31]
212; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
213; X86-SSE2-NEXT:    pand %xmm4, %xmm5
214; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
215; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
216; X86-SSE2-NEXT:    psrld %xmm3, %xmm6
217; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
218; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
219; X86-SSE2-NEXT:    psrld %xmm7, %xmm3
220; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
221; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
222; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
223; X86-SSE2-NEXT:    movdqa %xmm1, %xmm7
224; X86-SSE2-NEXT:    psrld %xmm6, %xmm7
225; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
226; X86-SSE2-NEXT:    psrld %xmm5, %xmm1
227; X86-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
228; X86-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
229; X86-SSE2-NEXT:    pandn %xmm4, %xmm2
230; X86-SSE2-NEXT:    pslld $23, %xmm2
231; X86-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
232; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm1
233; X86-SSE2-NEXT:    paddd %xmm0, %xmm0
234; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
235; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
236; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
237; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
238; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm1
239; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
240; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
241; X86-SSE2-NEXT:    por %xmm3, %xmm0
242; X86-SSE2-NEXT:    retl
243  %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt)
244  ret <2 x i32> %res
245}
246
247;
248; Uniform Variable Shifts
249;
250
251define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind {
; Funnel shift right with a single amount splatted across all lanes.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py;
; do not hand-edit. Because the amount is uniform, most targets interleave
; x/y into i64 lanes (punpckhdq/punpckldq) and do one scalar-count psrlq,
; then shufps the dword results back; AVX512VL/VLBW widen to ymm + vpmovqd,
; and the VBMI2 targets broadcast the amount and use vpshrdvd.
252; SSE-LABEL: splatvar_funnnel_v2i32:
253; SSE:       # %bb.0:
254; SSE-NEXT:    movdqa %xmm1, %xmm3
255; SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
256; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
257; SSE-NEXT:    psrlq %xmm2, %xmm3
258; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
259; SSE-NEXT:    psrlq %xmm2, %xmm1
260; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
261; SSE-NEXT:    movaps %xmm1, %xmm0
262; SSE-NEXT:    retq
263;
264; AVX1-LABEL: splatvar_funnnel_v2i32:
265; AVX1:       # %bb.0:
266; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
267; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
268; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
269; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
270; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
271; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
272; AVX1-NEXT:    retq
273;
274; AVX2-LABEL: splatvar_funnnel_v2i32:
275; AVX2:       # %bb.0:
276; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
277; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
278; AVX2-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
279; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
280; AVX2-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
281; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
282; AVX2-NEXT:    retq
283;
284; AVX512F-LABEL: splatvar_funnnel_v2i32:
285; AVX512F:       # %bb.0:
286; AVX512F-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
287; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
288; AVX512F-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
289; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
290; AVX512F-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
291; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
292; AVX512F-NEXT:    retq
293;
294; AVX512VL-LABEL: splatvar_funnnel_v2i32:
295; AVX512VL:       # %bb.0:
296; AVX512VL-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
297; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
298; AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
299; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
300; AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
301; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
302; AVX512VL-NEXT:    vzeroupper
303; AVX512VL-NEXT:    retq
304;
305; AVX512BW-LABEL: splatvar_funnnel_v2i32:
306; AVX512BW:       # %bb.0:
307; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
308; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
309; AVX512BW-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
310; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
311; AVX512BW-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
312; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
313; AVX512BW-NEXT:    retq
314;
315; AVX512VLBW-LABEL: splatvar_funnnel_v2i32:
316; AVX512VLBW:       # %bb.0:
317; AVX512VLBW-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
318; AVX512VLBW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
319; AVX512VLBW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
320; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
321; AVX512VLBW-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
322; AVX512VLBW-NEXT:    vpmovqd %ymm0, %xmm0
323; AVX512VLBW-NEXT:    vzeroupper
324; AVX512VLBW-NEXT:    retq
325;
326; AVX512VBMI2-LABEL: splatvar_funnnel_v2i32:
327; AVX512VBMI2:       # %bb.0:
328; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
329; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
330; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
331; AVX512VBMI2-NEXT:    vpshrdvd %zmm2, %zmm0, %zmm1
332; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
333; AVX512VBMI2-NEXT:    vzeroupper
334; AVX512VBMI2-NEXT:    retq
335;
336; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i32:
337; AVX512VLVBMI2:       # %bb.0:
338; AVX512VLVBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
339; AVX512VLVBMI2-NEXT:    vpshrdvd %xmm2, %xmm0, %xmm1
340; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
341; AVX512VLVBMI2-NEXT:    retq
342;
343; XOP-LABEL: splatvar_funnnel_v2i32:
344; XOP:       # %bb.0:
345; XOP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
346; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
347; XOP-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
348; XOP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
349; XOP-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
350; XOP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
351; XOP-NEXT:    retq
352;
353; X86-SSE2-LABEL: splatvar_funnnel_v2i32:
354; X86-SSE2:       # %bb.0:
355; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
356; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
357; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
358; X86-SSE2-NEXT:    psrlq %xmm2, %xmm3
359; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
360; X86-SSE2-NEXT:    psrlq %xmm2, %xmm1
361; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
362; X86-SSE2-NEXT:    movaps %xmm1, %xmm0
363; X86-SSE2-NEXT:    retl
364  %splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer
365  %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %splat)
366  ret <2 x i32> %res
367}
368
369;
370; Constant Shifts
371;
372
373define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; Funnel shift right by distinct compile-time-constant amounts <4, 5>.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py;
; do not hand-edit. The constant amounts let targets use immediate shifts:
; SSE2/SSE41/AVX1 combine psrld $4 / psrld $5 and pslld $28 / pslld $27
; results with shuffles or blends; AVX2+ fold the constants into memory
; operands of vpsrlvd/vpsllvd; XOPAVX1 uses vpshld with constant vectors;
; VBMI2 targets use vpshrdvd with the amount vector [4,5,0,0].
374; SSE2-LABEL: constant_funnnel_v2i32:
375; SSE2:       # %bb.0:
376; SSE2-NEXT:    movdqa %xmm1, %xmm2
377; SSE2-NEXT:    psrld $5, %xmm2
378; SSE2-NEXT:    movdqa %xmm1, %xmm3
379; SSE2-NEXT:    psrld $4, %xmm3
380; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
381; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
382; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
383; SSE2-NEXT:    pslld $28, %xmm0
384; SSE2-NEXT:    pslld $27, %xmm1
385; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
386; SSE2-NEXT:    por %xmm3, %xmm0
387; SSE2-NEXT:    retq
388;
389; SSE41-LABEL: constant_funnnel_v2i32:
390; SSE41:       # %bb.0:
391; SSE41-NEXT:    movdqa %xmm1, %xmm2
392; SSE41-NEXT:    psrld $5, %xmm2
393; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
394; SSE41-NEXT:    movdqa %xmm1, %xmm3
395; SSE41-NEXT:    psrld $4, %xmm3
396; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
397; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
398; SSE41-NEXT:    movdqa %xmm0, %xmm1
399; SSE41-NEXT:    pslld $27, %xmm1
400; SSE41-NEXT:    pslld $28, %xmm0
401; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
402; SSE41-NEXT:    por %xmm3, %xmm0
403; SSE41-NEXT:    retq
404;
405; AVX1-LABEL: constant_funnnel_v2i32:
406; AVX1:       # %bb.0:
407; AVX1-NEXT:    vpsrld $5, %xmm1, %xmm2
408; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
409; AVX1-NEXT:    vpsrld $4, %xmm1, %xmm3
410; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
411; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
412; AVX1-NEXT:    vpslld $27, %xmm0, %xmm2
413; AVX1-NEXT:    vpslld $28, %xmm0, %xmm0
414; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
415; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
416; AVX1-NEXT:    retq
417;
418; AVX2-LABEL: constant_funnnel_v2i32:
419; AVX2:       # %bb.0:
420; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
421; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
422; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
423; AVX2-NEXT:    retq
424;
425; AVX512F-LABEL: constant_funnnel_v2i32:
426; AVX512F:       # %bb.0:
427; AVX512F-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
428; AVX512F-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
429; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
430; AVX512F-NEXT:    retq
431;
432; AVX512VL-LABEL: constant_funnnel_v2i32:
433; AVX512VL:       # %bb.0:
434; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
435; AVX512VL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
436; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
437; AVX512VL-NEXT:    retq
438;
439; AVX512BW-LABEL: constant_funnnel_v2i32:
440; AVX512BW:       # %bb.0:
441; AVX512BW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
442; AVX512BW-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
443; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
444; AVX512BW-NEXT:    retq
445;
446; AVX512VLBW-LABEL: constant_funnnel_v2i32:
447; AVX512VLBW:       # %bb.0:
448; AVX512VLBW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
449; AVX512VLBW-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
450; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
451; AVX512VLBW-NEXT:    retq
452;
453; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
454; AVX512VBMI2:       # %bb.0:
455; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
456; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
457; AVX512VBMI2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,5,0,0]
458; AVX512VBMI2-NEXT:    vpshrdvd %zmm2, %zmm0, %zmm1
459; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
460; AVX512VBMI2-NEXT:    vzeroupper
461; AVX512VBMI2-NEXT:    retq
462;
463; AVX512VLVBMI2-LABEL: constant_funnnel_v2i32:
464; AVX512VLVBMI2:       # %bb.0:
465; AVX512VLVBMI2-NEXT:    vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
466; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
467; AVX512VLVBMI2-NEXT:    retq
468;
469; XOPAVX1-LABEL: constant_funnnel_v2i32:
470; XOPAVX1:       # %bb.0:
471; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
472; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
473; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
474; XOPAVX1-NEXT:    retq
475;
476; XOPAVX2-LABEL: constant_funnnel_v2i32:
477; XOPAVX2:       # %bb.0:
478; XOPAVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
479; XOPAVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
480; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
481; XOPAVX2-NEXT:    retq
482;
483; X86-SSE2-LABEL: constant_funnnel_v2i32:
484; X86-SSE2:       # %bb.0:
485; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
486; X86-SSE2-NEXT:    psrld $5, %xmm2
487; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
488; X86-SSE2-NEXT:    psrld $4, %xmm3
489; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
490; X86-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
491; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
492; X86-SSE2-NEXT:    pslld $28, %xmm0
493; X86-SSE2-NEXT:    pslld $27, %xmm1
494; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
495; X86-SSE2-NEXT:    por %xmm3, %xmm0
496; X86-SSE2-NEXT:    retl
497  %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>)
498  ret <2 x i32> %res
499}
500
501;
502; Uniform Constant Shifts
503;
504
505define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; Funnel shift right with a uniform constant amount of 4 in every lane.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py;
; do not hand-edit. Best case for lowering: SSE/AVX/XOP reduce it to
; (y >>u 4) | (x << 28) with immediate psrld/pslld, while the VBMI2
; targets match the double-shift instruction vpshrdd $4 directly.
506; SSE-LABEL: splatconstant_funnnel_v2i32:
507; SSE:       # %bb.0:
508; SSE-NEXT:    psrld $4, %xmm1
509; SSE-NEXT:    pslld $28, %xmm0
510; SSE-NEXT:    por %xmm1, %xmm0
511; SSE-NEXT:    retq
512;
513; AVX-LABEL: splatconstant_funnnel_v2i32:
514; AVX:       # %bb.0:
515; AVX-NEXT:    vpsrld $4, %xmm1, %xmm1
516; AVX-NEXT:    vpslld $28, %xmm0, %xmm0
517; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
518; AVX-NEXT:    retq
519;
520; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i32:
521; AVX512VBMI2:       # %bb.0:
522; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
523; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
524; AVX512VBMI2-NEXT:    vpshrdd $4, %zmm0, %zmm1, %zmm0
525; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
526; AVX512VBMI2-NEXT:    vzeroupper
527; AVX512VBMI2-NEXT:    retq
528;
529; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i32:
530; AVX512VLVBMI2:       # %bb.0:
531; AVX512VLVBMI2-NEXT:    vpshrdd $4, %xmm0, %xmm1, %xmm0
532; AVX512VLVBMI2-NEXT:    retq
533;
534; XOP-LABEL: splatconstant_funnnel_v2i32:
535; XOP:       # %bb.0:
536; XOP-NEXT:    vpsrld $4, %xmm1, %xmm1
537; XOP-NEXT:    vpslld $28, %xmm0, %xmm0
538; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
539; XOP-NEXT:    retq
540;
541; X86-SSE2-LABEL: splatconstant_funnnel_v2i32:
542; X86-SSE2:       # %bb.0:
543; X86-SSE2-NEXT:    psrld $4, %xmm1
544; X86-SSE2-NEXT:    pslld $28, %xmm0
545; X86-SSE2-NEXT:    por %xmm1, %xmm0
546; X86-SSE2-NEXT:    retl
547  %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 4>)
548  ret <2 x i32> %res
549}
550