xref: /llvm-project/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll (revision 1715549373ab774bd73de0c982f7f01f30f94720)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2

declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)

;
; Variable Shifts
;
; fshr with both value operands equal is a rotate-right; with variable
; per-lane amounts the v8i64 case lowers to a single vprorvq on all
; AVX512 configurations.
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %amt)
  ret <8 x i64> %res
}

; Variable rotate-right of v16i32: lowers to a single vprorvd on all
; AVX512 configurations.
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %amt)
  ret <16 x i32> %res
}

; Variable rotate-right of v32i16.  Without BW the amounts are masked to
; [0,15] and each 256-bit half is widened to i32 for vpsrlvd; with BW the
; srl/shl halves combine via vpsrlvw/vpsllvw; VBMI2 folds to vpshrdvw.
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm6[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512F-NEXT:    vpsrlvd %ymm5, %ymm7, %ymm5
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512F-NEXT:    vpsrlvd %ymm2, %ymm6, %ymm2
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7],ymm2[8],ymm4[9],ymm2[10],ymm4[11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
; AVX512F-NEXT:    vpackusdw %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512F-NEXT:    vpsrlvd %ymm3, %ymm5, %ymm3
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7],ymm3[8],ymm4[9],ymm3[10],ymm4[11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512F-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
; AVX512F-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm6[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT:    vpsrlvd %ymm5, %ymm7, %ymm5
; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT:    vpsrlvd %ymm2, %ymm6, %ymm2
; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7],ymm2[8],ymm4[9],ymm2[10],ymm4[11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
; AVX512VL-NEXT:    vpackusdw %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT:    vpsrlvd %ymm3, %ymm5, %ymm3
; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7],ymm3[8],ymm4[9],ymm3[10],ymm4[11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
; AVX512VL-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT:    vpsubw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm2
; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT:    vpsubw %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpshrdvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshrdvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT:    retq
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
  ret <32 x i16> %res
}

; Variable rotate-right of v64i8.  No byte shifts exist, so pre-BW each
; 256-bit half uses the vpblendvb shift-by-bit ladder (vpsllw/vpsrlw +
; vpternlogd masking); BW widens bytes to words for vpsrlvw; VBMI2 packs
; the halves back with a single vpermi2b.
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT:    vpsllw $4, %ymm2, %ymm3
; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm4
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $6, %ymm2, %ymm4
; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm6
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4))
; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $7, %ymm2, %ymm4
; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm6
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm8 & (zmm6 ^ zmm4))
; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpsllw $6, %ymm0, %ymm3
; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm4
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3))
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm3
; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm4
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm8 & (zmm4 ^ zmm3))
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm3
; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm4
; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $6, %ymm2, %ymm4
; AVX512VL-NEXT:    vpsrlw $2, %ymm2, %ymm6
; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm7 & (ymm6 ^ ymm4))
; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $7, %ymm2, %ymm4
; AVX512VL-NEXT:    vpsrlw $1, %ymm2, %ymm6
; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm8 & (ymm6 ^ ymm4))
; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm3
; AVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm4
; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm3
; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm4
; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm8 & (ymm4 ^ ymm3))
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT:    vpsrlvw %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VBMI2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI2-NEXT:    vpsrlvw %zmm3, %zmm4, %zmm3
; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VBMI2-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
; AVX512VBMI2-NEXT:    vpermi2b %zmm3, %zmm1, %zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm3, %zmm4, %zmm3
; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
; AVX512VLVBMI2-NEXT:    vpermi2b %zmm3, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT:    retq
  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
  ret <64 x i8> %res
}

;
; Uniform Variable Shifts
;

; Splatted variable rotate-right of v8i64: the splat is broadcast and the
; rotate still lowers to a single vprorvq on all AVX512 configurations.
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %xmm1, %zmm1
; AVX512-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %splat)
  ret <8 x i64> %res
}

; Splatted variable rotate-right of v16i32: broadcast plus a single vprorvd.
define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %xmm1, %zmm1
; AVX512-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %splat)
  ret <16 x i32> %res
}

; Splatted variable rotate-right of v32i16: the amount is masked to [0,15]
; and combined from a vpsrlw of x plus a vpsllw of 2*x (pre-BW this is done
; per 256-bit half); VBMI2 broadcasts the word and folds to vpshrdvw.
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT:    vpsrlw %xmm3, %ymm4, %ymm5
; AVX512F-NEXT:    vpsrlw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
; AVX512F-NEXT:    vpandn %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpaddw %ymm4, %ymm4, %ymm2
; AVX512F-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm5
; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpaddw %ymm4, %ymm4, %ymm2
; AVX512VL-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpandn %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT:    vpandn %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpbroadcastw %xmm1, %zmm1
; AVX512VBMI2-NEXT:    vpshrdvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm1, %zmm1
; AVX512VLVBMI2-NEXT:    vpshrdvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT:    retq
  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
  ret <32 x i16> %res
}

371define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
372; AVX512F-LABEL: splatvar_funnnel_v64i8:
373; AVX512F:       # %bb.0:
374; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
375; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
376; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
377; AVX512F-NEXT:    vpsrlw %xmm1, %ymm3, %ymm3
378; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
379; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
380; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
381; AVX512F-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
382; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
383; AVX512F-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
384; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
385; AVX512F-NEXT:    vpsrlw %xmm1, %ymm3, %ymm3
386; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
387; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
388; AVX512F-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
389; AVX512F-NEXT:    vpand %ymm4, %ymm0, %ymm0
390; AVX512F-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
391; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
392; AVX512F-NEXT:    retq
393;
394; AVX512VL-LABEL: splatvar_funnnel_v64i8:
395; AVX512VL:       # %bb.0:
396; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
397; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
398; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
399; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm3, %ymm3
400; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
401; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
402; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
403; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
404; AVX512VL-NEXT:    vpand %ymm4, %ymm2, %ymm2
405; AVX512VL-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
406; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
407; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm3, %ymm3
408; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
409; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
410; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
411; AVX512VL-NEXT:    vpand %ymm4, %ymm0, %ymm0
412; AVX512VL-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
413; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
414; AVX512VL-NEXT:    retq
415;
416; AVX512BW-LABEL: splatvar_funnnel_v64i8:
417; AVX512BW:       # %bb.0:
418; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
419; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
420; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
421; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
422; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
423; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
424; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
425; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
426; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
427; AVX512BW-NEXT:    retq
428;
429; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
430; AVX512VLBW:       # %bb.0:
431; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
432; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
433; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
434; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
435; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
436; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
437; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
438; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
439; AVX512VLBW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
440; AVX512VLBW-NEXT:    retq
441;
442; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
443; AVX512VBMI2:       # %bb.0:
444; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
445; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
446; AVX512VBMI2-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
447; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
448; AVX512VBMI2-NEXT:    vpsrlw %xmm1, %zmm0, %zmm1
449; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
450; AVX512VBMI2-NEXT:    vpermi2b %zmm2, %zmm1, %zmm0
451; AVX512VBMI2-NEXT:    retq
452;
453; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
454; AVX512VLVBMI2:       # %bb.0:
455; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
456; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
457; AVX512VLVBMI2-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
458; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
459; AVX512VLVBMI2-NEXT:    vpsrlw %xmm1, %zmm0, %zmm1
460; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
461; AVX512VLVBMI2-NEXT:    vpermi2b %zmm2, %zmm1, %zmm0
462; AVX512VLVBMI2-NEXT:    retq
463  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
464  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
465  ret <64 x i8> %res
466}
467
468;
469; Constant Shifts
470;
471
define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x) nounwind {
; fshr(x, x, amt) is a rotate-right; with per-lane constant amounts this
; should fold to a single AVX512F variable rotate (vprorvq) on all subtargets.
; AVX512-LABEL: constant_funnnel_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprorvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
  ret <8 x i64> %res
}
480
define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind {
; Per-lane constant i32 rotate-right; expected to select the AVX512F
; variable rotate (vprorvd) for every tested feature combination.
; AVX512-LABEL: constant_funnnel_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
  ret <16 x i32> %res
}
489
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
; Per-lane constant i16 rotate-right. AVX512F/VL (no 512-bit i16 shifts)
; split into 256-bit halves and emit the two shift halves as multiplies by
; powers of two (vpmulhuw for the logical-right part, vpmullw for the left
; part), then OR them. AVX512BW uses vpsrlvw/vpsllvw directly; VBMI2 folds
; the whole rotate into a single vpshrdvw.
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm4
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm4
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512VL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_funnnel_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v32i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLVBMI2-NEXT:    retq
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
  ret <32 x i16> %res
}
543
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
; Per-lane constant i8 rotate-right. x86 has no byte shifts/rotates, so the
; lowering widens bytes to words via vpunpcklbw/vpunpckhbw, shifts the words
; (AVX512F/VL: vpmullw by a power-of-two constant per 256-bit half;
; AVX512BW and later: vpsllvw on 512-bit words), takes the high byte with
; vpsrlw $8, and re-packs with vpackuswb.
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX512VL-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX512VL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
; AVX512VL-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT:    retq
  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
  ret <64 x i8> %res
}
639
640;
641; Uniform Constant Shifts
642;
643
define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x) nounwind {
; Uniform constant rotate-right by 14; should fold to the AVX512F
; immediate rotate (vprorq $14) on all subtargets.
; AVX512-LABEL: splatconstant_funnnel_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprorq $14, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
  ret <8 x i64> %res
}
652
define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind {
; Uniform constant rotate-right by 4; should fold to the AVX512F
; immediate rotate (vprord $4) on all subtargets.
; AVX512-LABEL: splatconstant_funnnel_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprord $4, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
  ret <16 x i32> %res
}
661
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; Uniform i16 rotate-right by 7, expanded as (x >> 7) | (x << 9).
; AVX512F/VL do the shifts per 256-bit half (no 512-bit i16 shifts without
; BW); AVX512BW shifts the full zmm; VBMI2 uses a single vpshrdw $7.
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT:    vpsrlw $7, %ymm2, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllw $9, %ymm0, %ymm0
; AVX512F-NEXT:    vpsllw $9, %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm1
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT:    vpsrlw $7, %ymm2, %ymm3
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT:    vpsllw $9, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsllw $9, %ymm2, %ymm2
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsllw $9, %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlw $7, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpsllw $9, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpshrdw $7, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshrdw $7, %zmm0, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT:    retq
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <32 x i16> %res
}
713
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; Uniform i8 rotate-right by 4: the i16 shifts (shl $4 / srl $4) produce
; cross-byte garbage, so the two halves are blended with a masked select
; done in one vpternlogd. AVX512F/VL shift per 256-bit half; BW and later
; shift the full zmm directly.
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT:    vpsllw $4, %ymm2, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm3
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VLVBMI2-NEXT:    retq
  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <64 x i8> %res
}
769