xref: /llvm-project/llvm/test/CodeGen/X86/vector-fshr-512.ll (revision 1715549373ab774bd73de0c982f7f01f30f94720)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
8
9declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
10declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
11declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
12declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
13
14;
15; Variable Shifts
16;
17
; Variable funnel-shift-right of <8 x i64> lanes: result = fshr(x, y, amt).
; Non-VBMI2 targets lower to: mask amt to [0,63] (vpandq with splat 63),
; vpsrlvq of y by the masked amount, vpandnq for the complementary left
; amount, pre-shift x left by one (vpaddq x,x), vpsllvq, then vporq merges
; the halves. VBMI2 targets emit the single concat-shift vpshrdvq instead.
; NOTE(review): CHECK lines are autogenerated (see file header); regenerate
; with update_llc_test_checks.py rather than editing them by hand.
18define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
19; AVX512F-LABEL: var_funnnel_v8i64:
20; AVX512F:       # %bb.0:
21; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
22; AVX512F-NEXT:    vpandq %zmm3, %zmm2, %zmm4
23; AVX512F-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm1
24; AVX512F-NEXT:    vpandnq %zmm3, %zmm2, %zmm2
25; AVX512F-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
26; AVX512F-NEXT:    vpsllvq %zmm2, %zmm0, %zmm0
27; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
28; AVX512F-NEXT:    retq
29;
30; AVX512VL-LABEL: var_funnnel_v8i64:
31; AVX512VL:       # %bb.0:
32; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
33; AVX512VL-NEXT:    vpandq %zmm3, %zmm2, %zmm4
34; AVX512VL-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm1
35; AVX512VL-NEXT:    vpandnq %zmm3, %zmm2, %zmm2
36; AVX512VL-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
37; AVX512VL-NEXT:    vpsllvq %zmm2, %zmm0, %zmm0
38; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
39; AVX512VL-NEXT:    retq
40;
41; AVX512BW-LABEL: var_funnnel_v8i64:
42; AVX512BW:       # %bb.0:
43; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
44; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
45; AVX512BW-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm1
46; AVX512BW-NEXT:    vpandnq %zmm3, %zmm2, %zmm2
47; AVX512BW-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
48; AVX512BW-NEXT:    vpsllvq %zmm2, %zmm0, %zmm0
49; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
50; AVX512BW-NEXT:    retq
51;
52; AVX512VBMI2-LABEL: var_funnnel_v8i64:
53; AVX512VBMI2:       # %bb.0:
54; AVX512VBMI2-NEXT:    vpshrdvq %zmm2, %zmm0, %zmm1
55; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
56; AVX512VBMI2-NEXT:    retq
57;
58; AVX512VLBW-LABEL: var_funnnel_v8i64:
59; AVX512VLBW:       # %bb.0:
60; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
61; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
62; AVX512VLBW-NEXT:    vpsrlvq %zmm4, %zmm1, %zmm1
63; AVX512VLBW-NEXT:    vpandnq %zmm3, %zmm2, %zmm2
64; AVX512VLBW-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
65; AVX512VLBW-NEXT:    vpsllvq %zmm2, %zmm0, %zmm0
66; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
67; AVX512VLBW-NEXT:    retq
68;
69; AVX512VLVBMI2-LABEL: var_funnnel_v8i64:
70; AVX512VLVBMI2:       # %bb.0:
71; AVX512VLVBMI2-NEXT:    vpshrdvq %zmm2, %zmm0, %zmm1
72; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
73; AVX512VLVBMI2-NEXT:    retq
74  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
75  ret <8 x i64> %res
76}
77
; Variable funnel-shift-right of <16 x i32> lanes: result = fshr(x, y, amt).
; Same shape as the v8i64 case but with dword ops: amt is masked with splat
; 31, y is vpsrlvd'd, x is pre-shifted left by one (vpaddd x,x) and vpsllvd'd
; by the andn'd complement, then vpord combines. VBMI2 targets use vpshrdvd.
; NOTE(review): CHECK lines are autogenerated (see file header); regenerate
; with update_llc_test_checks.py rather than editing them by hand.
78define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
79; AVX512F-LABEL: var_funnnel_v16i32:
80; AVX512F:       # %bb.0:
81; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
82; AVX512F-NEXT:    vpandd %zmm3, %zmm2, %zmm4
83; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
84; AVX512F-NEXT:    vpandnd %zmm3, %zmm2, %zmm2
85; AVX512F-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
86; AVX512F-NEXT:    vpsllvd %zmm2, %zmm0, %zmm0
87; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
88; AVX512F-NEXT:    retq
89;
90; AVX512VL-LABEL: var_funnnel_v16i32:
91; AVX512VL:       # %bb.0:
92; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
93; AVX512VL-NEXT:    vpandd %zmm3, %zmm2, %zmm4
94; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
95; AVX512VL-NEXT:    vpandnd %zmm3, %zmm2, %zmm2
96; AVX512VL-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
97; AVX512VL-NEXT:    vpsllvd %zmm2, %zmm0, %zmm0
98; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
99; AVX512VL-NEXT:    retq
100;
101; AVX512BW-LABEL: var_funnnel_v16i32:
102; AVX512BW:       # %bb.0:
103; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
104; AVX512BW-NEXT:    vpandd %zmm3, %zmm2, %zmm4
105; AVX512BW-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
106; AVX512BW-NEXT:    vpandnd %zmm3, %zmm2, %zmm2
107; AVX512BW-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
108; AVX512BW-NEXT:    vpsllvd %zmm2, %zmm0, %zmm0
109; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
110; AVX512BW-NEXT:    retq
111;
112; AVX512VBMI2-LABEL: var_funnnel_v16i32:
113; AVX512VBMI2:       # %bb.0:
114; AVX512VBMI2-NEXT:    vpshrdvd %zmm2, %zmm0, %zmm1
115; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
116; AVX512VBMI2-NEXT:    retq
117;
118; AVX512VLBW-LABEL: var_funnnel_v16i32:
119; AVX512VLBW:       # %bb.0:
120; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
121; AVX512VLBW-NEXT:    vpandd %zmm3, %zmm2, %zmm4
122; AVX512VLBW-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
123; AVX512VLBW-NEXT:    vpandnd %zmm3, %zmm2, %zmm2
124; AVX512VLBW-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
125; AVX512VLBW-NEXT:    vpsllvd %zmm2, %zmm0, %zmm0
126; AVX512VLBW-NEXT:    vpord %zmm1, %zmm0, %zmm0
127; AVX512VLBW-NEXT:    retq
128;
129; AVX512VLVBMI2-LABEL: var_funnnel_v16i32:
130; AVX512VLVBMI2:       # %bb.0:
131; AVX512VLVBMI2-NEXT:    vpshrdvd %zmm2, %zmm0, %zmm1
132; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
133; AVX512VLVBMI2-NEXT:    retq
134  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt)
135  ret <16 x i32> %res
136}
137
; Variable funnel-shift-right of <32 x i16> lanes: result = fshr(x, y, amt).
; AVX512F/AVX512VL lack variable 16-bit shifts, so each 256-bit half is
; widened: x and y are zero-extended to dwords (vpmovzxwd), packed as
; (x << 16) | y per 32-bit lane (vpslld $16 + vpord), shifted with vpsrlvd
; by the zero-extended masked amount, then truncated back with vpmovdw and
; reassembled with vinserti64x4. BW targets use the native vpsrlvw/vpsllvw
; mask-and-or sequence; VBMI2 targets emit the single vpshrdvw.
; NOTE(review): CHECK lines are autogenerated (see file header); regenerate
; with update_llc_test_checks.py rather than editing them by hand.
138define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
139; AVX512F-LABEL: var_funnnel_v32i16:
140; AVX512F:       # %bb.0:
141; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
142; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
143; AVX512F-NEXT:    vpslld $16, %zmm4, %zmm4
144; AVX512F-NEXT:    vpord %zmm3, %zmm4, %zmm3
145; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
146; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
147; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm3, %zmm3
148; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
149; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
150; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
151; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
152; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
153; AVX512F-NEXT:    vpslld $16, %zmm0, %zmm0
154; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
155; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
156; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
157; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
158; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
159; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
160; AVX512F-NEXT:    retq
161;
162; AVX512VL-LABEL: var_funnnel_v32i16:
163; AVX512VL:       # %bb.0:
164; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
165; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
166; AVX512VL-NEXT:    vpslld $16, %zmm4, %zmm4
167; AVX512VL-NEXT:    vpord %zmm3, %zmm4, %zmm3
168; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
169; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
170; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm3, %zmm3
171; AVX512VL-NEXT:    vpmovdw %zmm3, %ymm3
172; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
173; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
174; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
175; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
176; AVX512VL-NEXT:    vpslld $16, %zmm0, %zmm0
177; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
178; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
179; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
180; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
181; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
182; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
183; AVX512VL-NEXT:    retq
184;
185; AVX512BW-LABEL: var_funnnel_v32i16:
186; AVX512BW:       # %bb.0:
187; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
188; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
189; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
190; AVX512BW-NEXT:    vpandnq %zmm3, %zmm2, %zmm2
191; AVX512BW-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
192; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
193; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
194; AVX512BW-NEXT:    retq
195;
196; AVX512VBMI2-LABEL: var_funnnel_v32i16:
197; AVX512VBMI2:       # %bb.0:
198; AVX512VBMI2-NEXT:    vpshrdvw %zmm2, %zmm0, %zmm1
199; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
200; AVX512VBMI2-NEXT:    retq
201;
202; AVX512VLBW-LABEL: var_funnnel_v32i16:
203; AVX512VLBW:       # %bb.0:
204; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
205; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
206; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
207; AVX512VLBW-NEXT:    vpandnq %zmm3, %zmm2, %zmm2
208; AVX512VLBW-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
209; AVX512VLBW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
210; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
211; AVX512VLBW-NEXT:    retq
212;
213; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
214; AVX512VLVBMI2:       # %bb.0:
215; AVX512VLVBMI2-NEXT:    vpshrdvw %zmm2, %zmm0, %zmm1
216; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
217; AVX512VLVBMI2-NEXT:    retq
218  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt)
219  ret <32 x i16> %res
220}
221
; Variable funnel-shift-right of <64 x i8> lanes: result = fshr(x, y, amt).
; x86 has no per-byte shifts, so lowering differs sharply by subtarget:
; - AVX512F/AVX512VL: per-256-bit-half vpblendvb shift ladders. y is shifted
;   right by 4/2/1 (with byte-mask vpand cleanups of 15/63/127), selected by
;   control bits moved to each byte's MSB via vpsllw $5; x is pre-shifted
;   left by one (vpaddb x,x), then left-shifted the same way by the xor'd
;   (complemented) amount, masks 240/252; halves are reassembled with
;   vinserti64x4 and merged with vporq.
; - AVX512BW/AVX512VLBW: interleave y/x bytes into words (vpunpckhbw /
;   vpunpcklbw), vpsrlvw per word, mask to 255, repack with vpackuswb.
; - VBMI2 targets: same widen-and-shift but repack the two word vectors with
;   a single vpermi2b byte permute.
; NOTE(review): CHECK lines are autogenerated (see file header); regenerate
; with update_llc_test_checks.py rather than editing them by hand.
222define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
223; AVX512F-LABEL: var_funnnel_v64i8:
224; AVX512F:       # %bb.0:
225; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
226; AVX512F-NEXT:    vpsrlw $4, %ymm5, %ymm3
227; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
228; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm7
229; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
230; AVX512F-NEXT:    vpandq %zmm3, %zmm2, %zmm2
231; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
232; AVX512F-NEXT:    vpsllw $5, %ymm4, %ymm8
233; AVX512F-NEXT:    vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
234; AVX512F-NEXT:    vpsrlw $2, %ymm5, %ymm7
235; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
236; AVX512F-NEXT:    vpand %ymm7, %ymm9, %ymm7
237; AVX512F-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
238; AVX512F-NEXT:    vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
239; AVX512F-NEXT:    vpsrlw $1, %ymm5, %ymm7
240; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
241; AVX512F-NEXT:    vpand %ymm7, %ymm10, %ymm7
242; AVX512F-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
243; AVX512F-NEXT:    vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
244; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm7
245; AVX512F-NEXT:    vpand %ymm6, %ymm7, %ymm6
246; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm7
247; AVX512F-NEXT:    vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
248; AVX512F-NEXT:    vpsrlw $2, %ymm1, %ymm6
249; AVX512F-NEXT:    vpand %ymm6, %ymm9, %ymm6
250; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
251; AVX512F-NEXT:    vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
252; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm6
253; AVX512F-NEXT:    vpand %ymm6, %ymm10, %ymm6
254; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
255; AVX512F-NEXT:    vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
256; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
257; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
258; AVX512F-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
259; AVX512F-NEXT:    vpsllw $4, %ymm5, %ymm6
260; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
261; AVX512F-NEXT:    vpand %ymm7, %ymm6, %ymm6
262; AVX512F-NEXT:    vpxor %ymm3, %ymm4, %ymm4
263; AVX512F-NEXT:    vpsllw $5, %ymm4, %ymm4
264; AVX512F-NEXT:    vpblendvb %ymm4, %ymm6, %ymm5, %ymm5
265; AVX512F-NEXT:    vpsllw $2, %ymm5, %ymm6
266; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
267; AVX512F-NEXT:    vpand %ymm6, %ymm8, %ymm6
268; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
269; AVX512F-NEXT:    vpblendvb %ymm4, %ymm6, %ymm5, %ymm5
270; AVX512F-NEXT:    vpaddb %ymm5, %ymm5, %ymm6
271; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
272; AVX512F-NEXT:    vpblendvb %ymm4, %ymm6, %ymm5, %ymm4
273; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
274; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm5
275; AVX512F-NEXT:    vpand %ymm7, %ymm5, %ymm5
276; AVX512F-NEXT:    vpxor %ymm3, %ymm2, %ymm2
277; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm2
278; AVX512F-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
279; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm3
280; AVX512F-NEXT:    vpand %ymm3, %ymm8, %ymm3
281; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
282; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
283; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
284; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
285; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
286; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
287; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
288; AVX512F-NEXT:    retq
289;
290; AVX512VL-LABEL: var_funnnel_v64i8:
291; AVX512VL:       # %bb.0:
292; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
293; AVX512VL-NEXT:    vpsrlw $4, %ymm4, %ymm3
294; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
295; AVX512VL-NEXT:    vpand %ymm5, %ymm3, %ymm6
296; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
297; AVX512VL-NEXT:    vpandq %zmm7, %zmm2, %zmm2
298; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
299; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm8
300; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
301; AVX512VL-NEXT:    vpsrlw $2, %ymm4, %ymm6
302; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
303; AVX512VL-NEXT:    vpand %ymm6, %ymm9, %ymm6
304; AVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
305; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
306; AVX512VL-NEXT:    vpsrlw $1, %ymm4, %ymm6
307; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
308; AVX512VL-NEXT:    vpand %ymm6, %ymm10, %ymm6
309; AVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
310; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
311; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
312; AVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm5
313; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm6
314; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
315; AVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm5
316; AVX512VL-NEXT:    vpand %ymm5, %ymm9, %ymm5
317; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
318; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
319; AVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm5
320; AVX512VL-NEXT:    vpand %ymm5, %ymm10, %ymm5
321; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
322; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
323; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
324; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
325; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
326; AVX512VL-NEXT:    vpsllw $4, %ymm4, %ymm5
327; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
328; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
329; AVX512VL-NEXT:    vpxor %ymm7, %ymm3, %ymm3
330; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
331; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
332; AVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm5
333; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
334; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm5
335; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
336; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
337; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
338; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
339; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
340; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
341; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
342; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
343; AVX512VL-NEXT:    vpxor %ymm7, %ymm2, %ymm2
344; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
345; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
346; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm4
347; AVX512VL-NEXT:    vpand %ymm4, %ymm8, %ymm4
348; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
349; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
350; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
351; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
352; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
353; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
354; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
355; AVX512VL-NEXT:    retq
356;
357; AVX512BW-LABEL: var_funnnel_v64i8:
358; AVX512BW:       # %bb.0:
359; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
360; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
361; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
362; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
363; AVX512BW-NEXT:    vpsrlvw %zmm5, %zmm3, %zmm3
364; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
365; AVX512BW-NEXT:    vpandq %zmm5, %zmm3, %zmm3
366; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
367; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
368; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
369; AVX512BW-NEXT:    vpandq %zmm5, %zmm0, %zmm0
370; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
371; AVX512BW-NEXT:    retq
372;
373; AVX512VBMI2-LABEL: var_funnnel_v64i8:
374; AVX512VBMI2:       # %bb.0:
375; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
376; AVX512VBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
377; AVX512VBMI2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
378; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
379; AVX512VBMI2-NEXT:    vpsrlvw %zmm5, %zmm3, %zmm3
380; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
381; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
382; AVX512VBMI2-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
383; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
384; AVX512VBMI2-NEXT:    vpermi2b %zmm3, %zmm1, %zmm0
385; AVX512VBMI2-NEXT:    retq
386;
387; AVX512VLBW-LABEL: var_funnnel_v64i8:
388; AVX512VLBW:       # %bb.0:
389; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
390; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
391; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
392; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
393; AVX512VLBW-NEXT:    vpsrlvw %zmm5, %zmm3, %zmm3
394; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
395; AVX512VLBW-NEXT:    vpandq %zmm5, %zmm3, %zmm3
396; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
397; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
398; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
399; AVX512VLBW-NEXT:    vpandq %zmm5, %zmm0, %zmm0
400; AVX512VLBW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
401; AVX512VLBW-NEXT:    retq
402;
403; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
404; AVX512VLVBMI2:       # %bb.0:
405; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
406; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
407; AVX512VLVBMI2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
408; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
409; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm5, %zmm3, %zmm3
410; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
411; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
412; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
413; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
414; AVX512VLVBMI2-NEXT:    vpermi2b %zmm3, %zmm1, %zmm0
415; AVX512VLVBMI2-NEXT:    retq
416  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
417  ret <64 x i8> %res
418}
419
420;
421; Uniform Variable Shifts
422;
423
; Funnel-shift-right of v8i64 by a single shift amount splatted to every lane.
; VBMI2 targets lower directly to the native double-shift (vpshrdvq); all other
; targets mask the amount (amt & 63 and ~amt & 63 via vpand/vpandn), shift the
; two operands separately (vpsrlq / vpaddq+vpsllq), and combine with vporq.
424define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
425; AVX512F-LABEL: splatvar_funnnel_v8i64:
426; AVX512F:       # %bb.0:
427; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
428; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
429; AVX512F-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
430; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
431; AVX512F-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
432; AVX512F-NEXT:    vpsllq %xmm2, %zmm0, %zmm0
433; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
434; AVX512F-NEXT:    retq
435;
436; AVX512VL-LABEL: splatvar_funnnel_v8i64:
437; AVX512VL:       # %bb.0:
438; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
439; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
440; AVX512VL-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
441; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
442; AVX512VL-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
443; AVX512VL-NEXT:    vpsllq %xmm2, %zmm0, %zmm0
444; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
445; AVX512VL-NEXT:    retq
446;
447; AVX512BW-LABEL: splatvar_funnnel_v8i64:
448; AVX512BW:       # %bb.0:
449; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
450; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
451; AVX512BW-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
452; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
453; AVX512BW-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
454; AVX512BW-NEXT:    vpsllq %xmm2, %zmm0, %zmm0
455; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
456; AVX512BW-NEXT:    retq
457;
458; AVX512VBMI2-LABEL: splatvar_funnnel_v8i64:
459; AVX512VBMI2:       # %bb.0:
460; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %zmm2
461; AVX512VBMI2-NEXT:    vpshrdvq %zmm2, %zmm0, %zmm1
462; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
463; AVX512VBMI2-NEXT:    retq
464;
465; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
466; AVX512VLBW:       # %bb.0:
467; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
468; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
469; AVX512VLBW-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
470; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
471; AVX512VLBW-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
472; AVX512VLBW-NEXT:    vpsllq %xmm2, %zmm0, %zmm0
473; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
474; AVX512VLBW-NEXT:    retq
475;
476; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i64:
477; AVX512VLVBMI2:       # %bb.0:
478; AVX512VLVBMI2-NEXT:    vpbroadcastq %xmm2, %zmm2
479; AVX512VLVBMI2-NEXT:    vpshrdvq %zmm2, %zmm0, %zmm1
480; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
481; AVX512VLVBMI2-NEXT:    retq
  ; Splat lane 0 of %amt across all 8 lanes, then perform the funnel shift.
482  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
483  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %splat)
484  ret <8 x i64> %res
485}
486
; Funnel-shift-right of v16i32 by a splatted amount.
; VBMI2 targets use the native vpshrdvd. Non-VBMI2 targets widen: they
; interleave the i32 lanes of %y/%x into i64 pairs (vpunpckhdq/vpunpckldq),
; do a single 64-bit logical right shift by the masked amount, and re-pack
; the low halves with vshufps.
487define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
488; AVX512F-LABEL: splatvar_funnnel_v16i32:
489; AVX512F:       # %bb.0:
490; AVX512F-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
491; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
492; AVX512F-NEXT:    vpsrlq %xmm2, %zmm3, %zmm3
493; AVX512F-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13]
494; AVX512F-NEXT:    vpsrlq %xmm2, %zmm0, %zmm0
495; AVX512F-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm3[0,2],zmm0[4,6],zmm3[4,6],zmm0[8,10],zmm3[8,10],zmm0[12,14],zmm3[12,14]
496; AVX512F-NEXT:    retq
497;
498; AVX512VL-LABEL: splatvar_funnnel_v16i32:
499; AVX512VL:       # %bb.0:
500; AVX512VL-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
501; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
502; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm3, %zmm3
503; AVX512VL-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13]
504; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm0, %zmm0
505; AVX512VL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm3[0,2],zmm0[4,6],zmm3[4,6],zmm0[8,10],zmm3[8,10],zmm0[12,14],zmm3[12,14]
506; AVX512VL-NEXT:    retq
507;
508; AVX512BW-LABEL: splatvar_funnnel_v16i32:
509; AVX512BW:       # %bb.0:
510; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
511; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
512; AVX512BW-NEXT:    vpsrlq %xmm2, %zmm3, %zmm3
513; AVX512BW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13]
514; AVX512BW-NEXT:    vpsrlq %xmm2, %zmm0, %zmm0
515; AVX512BW-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm3[0,2],zmm0[4,6],zmm3[4,6],zmm0[8,10],zmm3[8,10],zmm0[12,14],zmm3[12,14]
516; AVX512BW-NEXT:    retq
517;
518; AVX512VBMI2-LABEL: splatvar_funnnel_v16i32:
519; AVX512VBMI2:       # %bb.0:
520; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %zmm2
521; AVX512VBMI2-NEXT:    vpshrdvd %zmm2, %zmm0, %zmm1
522; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
523; AVX512VBMI2-NEXT:    retq
524;
525; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
526; AVX512VLBW:       # %bb.0:
527; AVX512VLBW-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
528; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
529; AVX512VLBW-NEXT:    vpsrlq %xmm2, %zmm3, %zmm3
530; AVX512VLBW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13]
531; AVX512VLBW-NEXT:    vpsrlq %xmm2, %zmm0, %zmm0
532; AVX512VLBW-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm3[0,2],zmm0[4,6],zmm3[4,6],zmm0[8,10],zmm3[8,10],zmm0[12,14],zmm3[12,14]
533; AVX512VLBW-NEXT:    retq
534;
535; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i32:
536; AVX512VLVBMI2:       # %bb.0:
537; AVX512VLVBMI2-NEXT:    vpbroadcastd %xmm2, %zmm2
538; AVX512VLVBMI2-NEXT:    vpshrdvd %zmm2, %zmm0, %zmm1
539; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
540; AVX512VLVBMI2-NEXT:    retq
  ; Splat lane 0 of %amt across all 16 lanes, then perform the funnel shift.
541  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
542  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %splat)
543  ret <16 x i32> %res
544}
545
; Funnel-shift-right of v32i16 by a splatted amount.
; VBMI2 targets use the native vpshrdvw. AVX512F/AVX512VL (no 512-bit i16
; shifts) split the zmm into two ymm halves and shift each; BW targets shift
; the whole zmm at once. All non-VBMI2 paths mask the amount with 15
; (vpand/vpandn) and combine the two shifted operands with vporq.
546define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
547; AVX512F-LABEL: splatvar_funnnel_v32i16:
548; AVX512F:       # %bb.0:
549; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
550; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
551; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
552; AVX512F-NEXT:    vpsrlw %xmm4, %ymm5, %ymm5
553; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
554; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
555; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
556; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
557; AVX512F-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
558; AVX512F-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
559; AVX512F-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
560; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
561; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
562; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
563; AVX512F-NEXT:    retq
564;
565; AVX512VL-LABEL: splatvar_funnnel_v32i16:
566; AVX512VL:       # %bb.0:
567; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
568; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
569; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
570; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm5, %ymm5
571; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
572; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
573; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
574; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
575; AVX512VL-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
576; AVX512VL-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
577; AVX512VL-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
578; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
579; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
580; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
581; AVX512VL-NEXT:    retq
582;
583; AVX512BW-LABEL: splatvar_funnnel_v32i16:
584; AVX512BW:       # %bb.0:
585; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
586; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
587; AVX512BW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
588; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
589; AVX512BW-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
590; AVX512BW-NEXT:    vpsllw %xmm2, %zmm0, %zmm0
591; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
592; AVX512BW-NEXT:    retq
593;
594; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
595; AVX512VBMI2:       # %bb.0:
596; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %zmm2
597; AVX512VBMI2-NEXT:    vpshrdvw %zmm2, %zmm0, %zmm1
598; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
599; AVX512VBMI2-NEXT:    retq
600;
601; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
602; AVX512VLBW:       # %bb.0:
603; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
604; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
605; AVX512VLBW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
606; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
607; AVX512VLBW-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
608; AVX512VLBW-NEXT:    vpsllw %xmm2, %zmm0, %zmm0
609; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
610; AVX512VLBW-NEXT:    retq
611;
612; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
613; AVX512VLVBMI2:       # %bb.0:
614; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm2, %zmm2
615; AVX512VLVBMI2-NEXT:    vpshrdvw %zmm2, %zmm0, %zmm1
616; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
617; AVX512VLVBMI2-NEXT:    retq
  ; Splat lane 0 of %amt across all 32 lanes, then perform the funnel shift.
618  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
619  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %splat)
620  ret <32 x i16> %res
621}
622
; Funnel-shift-right of v64i8 by a splatted amount. There are no native byte
; shifts, so every target interleaves %y/%x bytes into 16-bit lanes
; (vpunpckhbw/vpunpcklbw), does a 16-bit logical right shift by the masked
; amount, then narrows back: BW targets mask to bytes and vpackuswb; VBMI2
; targets instead select the result bytes with a single vpermi2b; AVX512F/VL
; (no 512-bit byte ops) do the whole dance per 256-bit half.
623define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
624; AVX512F-LABEL: splatvar_funnnel_v64i8:
625; AVX512F:       # %bb.0:
626; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
627; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
628; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
629; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
630; AVX512F-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
631; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
632; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
633; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
634; AVX512F-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
635; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
636; AVX512F-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
637; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
638; AVX512F-NEXT:    vpsrlw %xmm2, %ymm4, %ymm4
639; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
640; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
641; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
642; AVX512F-NEXT:    vpand %ymm6, %ymm0, %ymm0
643; AVX512F-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
644; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
645; AVX512F-NEXT:    retq
646;
647; AVX512VL-LABEL: splatvar_funnnel_v64i8:
648; AVX512VL:       # %bb.0:
649; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
650; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
651; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
652; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
653; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
654; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
655; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
656; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
657; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
658; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
659; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
660; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
661; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm4, %ymm4
662; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
663; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
664; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
665; AVX512VL-NEXT:    vpand %ymm6, %ymm0, %ymm0
666; AVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
667; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
668; AVX512VL-NEXT:    retq
669;
670; AVX512BW-LABEL: splatvar_funnnel_v64i8:
671; AVX512BW:       # %bb.0:
672; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
673; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
674; AVX512BW-NEXT:    vpsrlw %xmm2, %zmm3, %zmm3
675; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
676; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
677; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
678; AVX512BW-NEXT:    vpsrlw %xmm2, %zmm0, %zmm0
679; AVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
680; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
681; AVX512BW-NEXT:    retq
682;
683; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
684; AVX512VBMI2:       # %bb.0:
685; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
686; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
687; AVX512VBMI2-NEXT:    vpsrlw %xmm2, %zmm3, %zmm3
688; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
689; AVX512VBMI2-NEXT:    vpsrlw %xmm2, %zmm0, %zmm1
690; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
691; AVX512VBMI2-NEXT:    vpermi2b %zmm3, %zmm1, %zmm0
692; AVX512VBMI2-NEXT:    retq
693;
694; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
695; AVX512VLBW:       # %bb.0:
696; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
697; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
698; AVX512VLBW-NEXT:    vpsrlw %xmm2, %zmm3, %zmm3
699; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
700; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
701; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
702; AVX512VLBW-NEXT:    vpsrlw %xmm2, %zmm0, %zmm0
703; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
704; AVX512VLBW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
705; AVX512VLBW-NEXT:    retq
706;
707; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
708; AVX512VLVBMI2:       # %bb.0:
709; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
710; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
711; AVX512VLVBMI2-NEXT:    vpsrlw %xmm2, %zmm3, %zmm3
712; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
713; AVX512VLVBMI2-NEXT:    vpsrlw %xmm2, %zmm0, %zmm1
714; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
715; AVX512VLVBMI2-NEXT:    vpermi2b %zmm3, %zmm1, %zmm0
716; AVX512VLVBMI2-NEXT:    retq
  ; Splat lane 0 of %amt across all 64 lanes, then perform the funnel shift.
717  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
718  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %splat)
719  ret <64 x i8> %res
720}
721
722;
723; Constant Shifts
724;
725
; Funnel-shift-right of v8i64 by per-element constant amounts (4,14,50,60 x2).
; VBMI2 targets fold the constants into a memory operand of vpshrdvq; the
; rest use variable-shift pairs (vpsrlvq/vpsllvq) with constant-pool vectors
; and combine with vporq.
726define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
727; AVX512F-LABEL: constant_funnnel_v8i64:
728; AVX512F:       # %bb.0:
729; AVX512F-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
730; AVX512F-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
731; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
732; AVX512F-NEXT:    retq
733;
734; AVX512VL-LABEL: constant_funnnel_v8i64:
735; AVX512VL:       # %bb.0:
736; AVX512VL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
737; AVX512VL-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
738; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
739; AVX512VL-NEXT:    retq
740;
741; AVX512BW-LABEL: constant_funnnel_v8i64:
742; AVX512BW:       # %bb.0:
743; AVX512BW-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
744; AVX512BW-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
745; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
746; AVX512BW-NEXT:    retq
747;
748; AVX512VBMI2-LABEL: constant_funnnel_v8i64:
749; AVX512VBMI2:       # %bb.0:
750; AVX512VBMI2-NEXT:    vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
751; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
752; AVX512VBMI2-NEXT:    retq
753;
754; AVX512VLBW-LABEL: constant_funnnel_v8i64:
755; AVX512VLBW:       # %bb.0:
756; AVX512VLBW-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
757; AVX512VLBW-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
758; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
759; AVX512VLBW-NEXT:    retq
760;
761; AVX512VLVBMI2-LABEL: constant_funnnel_v8i64:
762; AVX512VLVBMI2:       # %bb.0:
763; AVX512VLVBMI2-NEXT:    vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
764; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
765; AVX512VLVBMI2-NEXT:    retq
766  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
767  ret <8 x i64> %res
768}
769
; Funnel-shift-right of v16i32 by per-element constant amounts (4..11 x2).
; VBMI2 targets fold the constants into a memory operand of vpshrdvd; the
; rest use variable-shift pairs (vpsrlvd/vpsllvd) with constant-pool vectors
; and combine with vpord.
770define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
771; AVX512F-LABEL: constant_funnnel_v16i32:
772; AVX512F:       # %bb.0:
773; AVX512F-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
774; AVX512F-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
775; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
776; AVX512F-NEXT:    retq
777;
778; AVX512VL-LABEL: constant_funnnel_v16i32:
779; AVX512VL:       # %bb.0:
780; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
781; AVX512VL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
782; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
783; AVX512VL-NEXT:    retq
784;
785; AVX512BW-LABEL: constant_funnnel_v16i32:
786; AVX512BW:       # %bb.0:
787; AVX512BW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
788; AVX512BW-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
789; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
790; AVX512BW-NEXT:    retq
791;
792; AVX512VBMI2-LABEL: constant_funnnel_v16i32:
793; AVX512VBMI2:       # %bb.0:
794; AVX512VBMI2-NEXT:    vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
795; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
796; AVX512VBMI2-NEXT:    retq
797;
798; AVX512VLBW-LABEL: constant_funnnel_v16i32:
799; AVX512VLBW:       # %bb.0:
800; AVX512VLBW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
801; AVX512VLBW-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
802; AVX512VLBW-NEXT:    vpord %zmm1, %zmm0, %zmm0
803; AVX512VLBW-NEXT:    retq
804;
805; AVX512VLVBMI2-LABEL: constant_funnnel_v16i32:
806; AVX512VLVBMI2:       # %bb.0:
807; AVX512VLVBMI2-NEXT:    vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
808; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
809; AVX512VLVBMI2-NEXT:    retq
810  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
811  ret <16 x i32> %res
812}
813
; Funnel-shift-right of v32i16 by per-element constant amounts (0..15 x2).
; VBMI2 targets fold the constants into vpshrdvw. BW targets use variable
; word shifts (vpsrlvw / vpaddw+vpsllvw). AVX512F/VL lack vpsrlvw, so the
; right shift is done per 256-bit half via vpmulhuw with power-of-two
; multipliers, with blends to fix up the shift-by-0 lane, and the left shift
; via vpmullw; results are combined with vporq.
814define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
815; AVX512F-LABEL: constant_funnnel_v32i16:
816; AVX512F:       # %bb.0:
817; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
818; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
819; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm2, %ymm4
820; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7]
821; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
822; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm3
823; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7]
824; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
825; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
826; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
827; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
828; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
829; AVX512F-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
830; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
831; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
832; AVX512F-NEXT:    retq
833;
834; AVX512VL-LABEL: constant_funnnel_v32i16:
835; AVX512VL:       # %bb.0:
836; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
837; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
838; AVX512VL-NEXT:    vpmulhuw %ymm3, %ymm2, %ymm4
839; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7]
840; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
841; AVX512VL-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm3
842; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7]
843; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
844; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
845; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
846; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
847; AVX512VL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
848; AVX512VL-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
849; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
850; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
851; AVX512VL-NEXT:    retq
852;
853; AVX512BW-LABEL: constant_funnnel_v32i16:
854; AVX512BW:       # %bb.0:
855; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
856; AVX512BW-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
857; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
858; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
859; AVX512BW-NEXT:    retq
860;
861; AVX512VBMI2-LABEL: constant_funnnel_v32i16:
862; AVX512VBMI2:       # %bb.0:
863; AVX512VBMI2-NEXT:    vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
864; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
865; AVX512VBMI2-NEXT:    retq
866;
867; AVX512VLBW-LABEL: constant_funnnel_v32i16:
868; AVX512VLBW:       # %bb.0:
869; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
870; AVX512VLBW-NEXT:    vpaddw %zmm0, %zmm0, %zmm0
871; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
872; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
873; AVX512VLBW-NEXT:    retq
874;
875; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16:
876; AVX512VLVBMI2:       # %bb.0:
877; AVX512VLVBMI2-NEXT:    vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
878; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
879; AVX512VLVBMI2-NEXT:    retq
880  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
881  ret <32 x i16> %res
882}
883
; fshr of <64 x i8> with non-uniform constant per-byte amounts (0..8 then
; mirrored back down, repeated per 16-byte group -- see the shift-amount
; vector in the call below).  x86 has no vector i8 shift instructions, so the
; AVX512F/AVX512VL paths widen bytes to words (vpunpck{l,h}bw with zero) and
; shift via word multiplies (vpmullw / vpmaddubsw) before repacking with
; vpackuswb; BW-capable targets use vpsrlvw on the interleaved words; the
; VBMI2 targets additionally repack the result bytes with a single vpermi2b.
884define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
885; AVX512F-LABEL: constant_funnnel_v64i8:
886; AVX512F:       # %bb.0:
887; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
888; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
889; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
890; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
891; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
892; AVX512F-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
893; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
894; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
895; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
896; AVX512F-NEXT:    # ymm6 = mem[0,1,0,1]
897; AVX512F-NEXT:    vpmullw %ymm6, %ymm2, %ymm2
898; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
899; AVX512F-NEXT:    vpackuswb %ymm4, %ymm2, %ymm2
900; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
901; AVX512F-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
902; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
903; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
904; AVX512F-NEXT:    vpmullw %ymm6, %ymm1, %ymm1
905; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
906; AVX512F-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
907; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
908; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
909; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
910; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
911; AVX512F-NEXT:    vpmaddubsw %ymm3, %ymm2, %ymm4
912; AVX512F-NEXT:    vpsllw $8, %ymm4, %ymm4
913; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
914; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
915; AVX512F-NEXT:    vpmaddubsw %ymm3, %ymm0, %ymm3
916; AVX512F-NEXT:    vpsllw $8, %ymm3, %ymm3
917; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
918; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
919; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
920; AVX512F-NEXT:    vpmaddubsw %ymm4, %ymm0, %ymm0
921; AVX512F-NEXT:    vpmaddubsw %ymm4, %ymm2, %ymm2
922; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
923; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
924; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3
925; AVX512F-NEXT:    retq
926;
927; AVX512VL-LABEL: constant_funnnel_v64i8:
928; AVX512VL:       # %bb.0:
929; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
930; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
931; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
932; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
933; AVX512VL-NEXT:    # ymm5 = mem[0,1,0,1]
934; AVX512VL-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
935; AVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
936; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
937; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
938; AVX512VL-NEXT:    # ymm6 = mem[0,1,0,1]
939; AVX512VL-NEXT:    vpmullw %ymm6, %ymm2, %ymm2
940; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
941; AVX512VL-NEXT:    vpackuswb %ymm4, %ymm2, %ymm2
942; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
943; AVX512VL-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
944; AVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
945; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
946; AVX512VL-NEXT:    vpmullw %ymm6, %ymm1, %ymm1
947; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
948; AVX512VL-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
949; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
950; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
951; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
952; AVX512VL-NEXT:    # ymm3 = mem[0,1,0,1]
953; AVX512VL-NEXT:    vpmaddubsw %ymm3, %ymm2, %ymm4
954; AVX512VL-NEXT:    vpsllw $8, %ymm4, %ymm4
955; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
956; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
957; AVX512VL-NEXT:    vpmaddubsw %ymm3, %ymm0, %ymm3
958; AVX512VL-NEXT:    vpsllw $8, %ymm3, %ymm3
959; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
960; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
961; AVX512VL-NEXT:    # ymm4 = mem[0,1,0,1]
962; AVX512VL-NEXT:    vpmaddubsw %ymm4, %ymm0, %ymm0
963; AVX512VL-NEXT:    vpmaddubsw %ymm4, %ymm2, %ymm2
964; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
965; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
966; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3
967; AVX512VL-NEXT:    retq
968;
969; AVX512BW-LABEL: constant_funnnel_v64i8:
970; AVX512BW:       # %bb.0:
971; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
972; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
973; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
974; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
975; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
976; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
977; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
978; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
979; AVX512BW-NEXT:    retq
980;
981; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
982; AVX512VBMI2:       # %bb.0:
983; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
984; AVX512VBMI2-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
985; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
986; AVX512VBMI2-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
987; AVX512VBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
988; AVX512VBMI2-NEXT:    vpermi2b %zmm2, %zmm1, %zmm0
989; AVX512VBMI2-NEXT:    retq
990;
991; AVX512VLBW-LABEL: constant_funnnel_v64i8:
992; AVX512VLBW:       # %bb.0:
993; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
994; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
995; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
996; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
997; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
998; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
999; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
1000; AVX512VLBW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
1001; AVX512VLBW-NEXT:    retq
1002;
1003; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
1004; AVX512VLVBMI2:       # %bb.0:
1005; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
1006; AVX512VLVBMI2-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
1007; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
1008; AVX512VLVBMI2-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
1009; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
1010; AVX512VLVBMI2-NEXT:    vpermi2b %zmm2, %zmm1, %zmm0
1011; AVX512VLVBMI2-NEXT:    retq
1012  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
1013  ret <64 x i8> %res
1014}
1015
1016;
1017; Uniform Constant Shifts
1018;
1019
; fshr of <8 x i64> with a uniform (splat) constant amount of 14.  All non-VBMI2
; targets lower to shift-right-14 / shift-left-50 / or; VBMI2-capable targets
; fold the whole operation into a single vpshrdq $14.
1020define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
1021; AVX512F-LABEL: splatconstant_funnnel_v8i64:
1022; AVX512F:       # %bb.0:
1023; AVX512F-NEXT:    vpsrlq $14, %zmm1, %zmm1
1024; AVX512F-NEXT:    vpsllq $50, %zmm0, %zmm0
1025; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
1026; AVX512F-NEXT:    retq
1027;
1028; AVX512VL-LABEL: splatconstant_funnnel_v8i64:
1029; AVX512VL:       # %bb.0:
1030; AVX512VL-NEXT:    vpsrlq $14, %zmm1, %zmm1
1031; AVX512VL-NEXT:    vpsllq $50, %zmm0, %zmm0
1032; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
1033; AVX512VL-NEXT:    retq
1034;
1035; AVX512BW-LABEL: splatconstant_funnnel_v8i64:
1036; AVX512BW:       # %bb.0:
1037; AVX512BW-NEXT:    vpsrlq $14, %zmm1, %zmm1
1038; AVX512BW-NEXT:    vpsllq $50, %zmm0, %zmm0
1039; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
1040; AVX512BW-NEXT:    retq
1041;
1042; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i64:
1043; AVX512VBMI2:       # %bb.0:
1044; AVX512VBMI2-NEXT:    vpshrdq $14, %zmm0, %zmm1, %zmm0
1045; AVX512VBMI2-NEXT:    retq
1046;
1047; AVX512VLBW-LABEL: splatconstant_funnnel_v8i64:
1048; AVX512VLBW:       # %bb.0:
1049; AVX512VLBW-NEXT:    vpsrlq $14, %zmm1, %zmm1
1050; AVX512VLBW-NEXT:    vpsllq $50, %zmm0, %zmm0
1051; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
1052; AVX512VLBW-NEXT:    retq
1053;
1054; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i64:
1055; AVX512VLVBMI2:       # %bb.0:
1056; AVX512VLVBMI2-NEXT:    vpshrdq $14, %zmm0, %zmm1, %zmm0
1057; AVX512VLVBMI2-NEXT:    retq
1058  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
1059  ret <8 x i64> %res
1060}
1061
; fshr of <16 x i32> with a uniform (splat) constant amount of 4.  Non-VBMI2
; targets lower to shift-right-4 / shift-left-28 / or; VBMI2-capable targets
; use a single vpshrdd $4.
1062define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
1063; AVX512F-LABEL: splatconstant_funnnel_v16i32:
1064; AVX512F:       # %bb.0:
1065; AVX512F-NEXT:    vpsrld $4, %zmm1, %zmm1
1066; AVX512F-NEXT:    vpslld $28, %zmm0, %zmm0
1067; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
1068; AVX512F-NEXT:    retq
1069;
1070; AVX512VL-LABEL: splatconstant_funnnel_v16i32:
1071; AVX512VL:       # %bb.0:
1072; AVX512VL-NEXT:    vpsrld $4, %zmm1, %zmm1
1073; AVX512VL-NEXT:    vpslld $28, %zmm0, %zmm0
1074; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
1075; AVX512VL-NEXT:    retq
1076;
1077; AVX512BW-LABEL: splatconstant_funnnel_v16i32:
1078; AVX512BW:       # %bb.0:
1079; AVX512BW-NEXT:    vpsrld $4, %zmm1, %zmm1
1080; AVX512BW-NEXT:    vpslld $28, %zmm0, %zmm0
1081; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
1082; AVX512BW-NEXT:    retq
1083;
1084; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i32:
1085; AVX512VBMI2:       # %bb.0:
1086; AVX512VBMI2-NEXT:    vpshrdd $4, %zmm0, %zmm1, %zmm0
1087; AVX512VBMI2-NEXT:    retq
1088;
1089; AVX512VLBW-LABEL: splatconstant_funnnel_v16i32:
1090; AVX512VLBW:       # %bb.0:
1091; AVX512VLBW-NEXT:    vpsrld $4, %zmm1, %zmm1
1092; AVX512VLBW-NEXT:    vpslld $28, %zmm0, %zmm0
1093; AVX512VLBW-NEXT:    vpord %zmm1, %zmm0, %zmm0
1094; AVX512VLBW-NEXT:    retq
1095;
1096; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i32:
1097; AVX512VLVBMI2:       # %bb.0:
1098; AVX512VLVBMI2-NEXT:    vpshrdd $4, %zmm0, %zmm1, %zmm0
1099; AVX512VLVBMI2-NEXT:    retq
1100  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
1101  ret <16 x i32> %res
1102}
1103
; fshr of <32 x i16> with a uniform (splat) constant amount of 7.  Plain
; AVX512F/AVX512VL have no 512-bit i16 shifts, so those paths split the zmm
; into two ymm halves (vextracti64x4 / vinserti64x4) around the srl-7 / shl-9
; / or sequence; BW targets shift the full zmm directly; VBMI2 targets use a
; single vpshrdw $7.
1104define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
1105; AVX512F-LABEL: splatconstant_funnnel_v32i16:
1106; AVX512F:       # %bb.0:
1107; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
1108; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1109; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm1
1110; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
1111; AVX512F-NEXT:    vpsllw $9, %ymm0, %ymm2
1112; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1113; AVX512F-NEXT:    vpsllw $9, %ymm0, %ymm0
1114; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
1115; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
1116; AVX512F-NEXT:    retq
1117;
1118; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
1119; AVX512VL:       # %bb.0:
1120; AVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm2
1121; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1122; AVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm1
1123; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
1124; AVX512VL-NEXT:    vpsllw $9, %ymm0, %ymm2
1125; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1126; AVX512VL-NEXT:    vpsllw $9, %ymm0, %ymm0
1127; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
1128; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
1129; AVX512VL-NEXT:    retq
1130;
1131; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
1132; AVX512BW:       # %bb.0:
1133; AVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm1
1134; AVX512BW-NEXT:    vpsllw $9, %zmm0, %zmm0
1135; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
1136; AVX512BW-NEXT:    retq
1137;
1138; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16:
1139; AVX512VBMI2:       # %bb.0:
1140; AVX512VBMI2-NEXT:    vpshrdw $7, %zmm0, %zmm1, %zmm0
1141; AVX512VBMI2-NEXT:    retq
1142;
1143; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
1144; AVX512VLBW:       # %bb.0:
1145; AVX512VLBW-NEXT:    vpsrlw $7, %zmm1, %zmm1
1146; AVX512VLBW-NEXT:    vpsllw $9, %zmm0, %zmm0
1147; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
1148; AVX512VLBW-NEXT:    retq
1149;
1150; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16:
1151; AVX512VLVBMI2:       # %bb.0:
1152; AVX512VLVBMI2-NEXT:    vpshrdw $7, %zmm0, %zmm1, %zmm0
1153; AVX512VLVBMI2-NEXT:    retq
1154  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1155  ret <32 x i16> %res
1156}
1157
; fshr of <64 x i8> with a uniform (splat) constant amount of 4.  With no
; vector i8 shifts, all targets emit word shifts (vpsllw $4 / vpsrlw $4) and
; merge the two halves under a constant byte mask with vpternlogd; plain
; AVX512F/AVX512VL additionally split the zmm into two ymm halves since the
; 512-bit word shifts require BW.
1158define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
1159; AVX512F-LABEL: splatconstant_funnnel_v64i8:
1160; AVX512F:       # %bb.0:
1161; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
1162; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1163; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
1164; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
1165; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm0
1166; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1167; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
1168; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1169; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
1170; AVX512F-NEXT:    retq
1171;
1172; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
1173; AVX512VL:       # %bb.0:
1174; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
1175; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1176; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
1177; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
1178; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm0
1179; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1180; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm1
1181; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1182; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
1183; AVX512VL-NEXT:    retq
1184;
1185; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
1186; AVX512BW:       # %bb.0:
1187; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
1188; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm0
1189; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
1190; AVX512BW-NEXT:    retq
1191;
1192; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
1193; AVX512VBMI2:       # %bb.0:
1194; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm2
1195; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm0
1196; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
1197; AVX512VBMI2-NEXT:    retq
1198;
1199; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
1200; AVX512VLBW:       # %bb.0:
1201; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm2
1202; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm0
1203; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
1204; AVX512VLBW-NEXT:    retq
1205;
1206; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
1207; AVX512VLVBMI2:       # %bb.0:
1208; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm2
1209; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm0
1210; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
1211; AVX512VLVBMI2-NEXT:    retq
1212  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
1213  ret <64 x i8> %res
1214}
1215