; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+avx512vl,+avx512vbmi | FileCheck %s --check-prefix=AVX512VBMI

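; f1: from a <192 x i8> load, extract the 64 bytes whose indices are congruent to 1 or 3 (mod 6).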
define <64 x i8> @f1(ptr %p0) {
; AVX2-LABEL: f1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215]
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13]
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm8
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
; AVX2-NEXT:    vpor %xmm2, %xmm8, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    vmovdqa 112(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm3
; AVX2-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: f1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13]
; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vmovdqa 160(%rdi), %xmm5
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128]
; AVX512F-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
; AVX512F-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm5
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm0
; AVX512F-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm5
; AVX512F-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX512F-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,5,7,11,13,17,19,23,25,29,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: f1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13]
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm5
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128]
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
; AVX512BW-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm5
; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13]
; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
; AVX512BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
; AVX512BW-NEXT:    movl $2047, %eax # imm = 0x7FF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm2 {%k1}
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm5
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512BW-NEXT:    movl $4192256, %eax # imm = 0x3FF800
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vpshufb %ymm7, %ymm1, %ymm0 {%k1}
; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm1
; AVX512BW-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm3
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI-LABEL: f1:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT:    vpermi2b 64(%rdi), %zmm0, %zmm1
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125]
; AVX512VBMI-NEXT:    vpermi2b 128(%rdi), %zmm1, %zmm0
; AVX512VBMI-NEXT:    retq
  %a0 = load <192 x i8>, ptr %p0
  %r = shufflevector <192 x i8> %a0, <192 x i8> poison, <64 x i32> <i32 1, i32 3, i32 7, i32 9, i32 13, i32 15, i32 19, i32 21, i32 25, i32 27, i32 31, i32 33, i32 37, i32 39, i32 43, i32 45, i32 49, i32 51, i32 55, i32 57, i32 61, i32 63, i32 67, i32 69, i32 73, i32 75, i32 79, i32 81, i32 85, i32 87, i32 91, i32 93, i32 97, i32 99, i32 103, i32 105, i32 109, i32 111, i32 115, i32 117, i32 121, i32 123, i32 127, i32 129, i32 133, i32 135, i32 139, i32 141, i32 145, i32 147, i32 151, i32 153, i32 157, i32 159, i32 163, i32 165, i32 169, i32 171, i32 175, i32 177, i32 181, i32 183, i32 187, i32 189>
  ret <64 x i8> %r
}

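; f2: extract the 64 bytes whose indices are congruent to 1 or 5 (mod 6).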
define <64 x i8> @f2(ptr %p0) {
; AVX2-LABEL: f2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15,1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215]
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15]
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm8
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
; AVX2-NEXT:    vpor %xmm2, %xmm8, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8
; AVX2-NEXT:    vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm3
; AVX2-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: f2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa 176(%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15]
; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 160(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128]
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm4
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm0
; AVX512F-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX512F-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
; AVX512F-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: f2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15]
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT:    vinserti64x4 $1, 128(%rdi), %zmm2, %zmm2
; AVX512BW-NEXT:    movabsq $8998403163813888, %rax # imm = 0x1FF800001FF800
; AVX512BW-NEXT:    kmovq %rax, %k1
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,35,37,41,43,47,49,53,55,59,61,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
; AVX512BW-NEXT:    movabsq $8796090925056, %rax # imm = 0x7FFFFE00000
; AVX512BW-NEXT:    kmovq %rax, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT:    retq
;
; AVX512VBMI-LABEL: f2:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 64(%rdi), %zmm0
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125,1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT:    vpermi2b (%rdi), %zmm0, %zmm1
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127]
; AVX512VBMI-NEXT:    vpermi2b 128(%rdi), %zmm1, %zmm0
; AVX512VBMI-NEXT:    retq
  %a0 = load <192 x i8>, ptr %p0
  %r = shufflevector <192 x i8> %a0, <192 x i8> poison, <64 x i32> <i32 1, i32 5, i32 7, i32 11, i32 13, i32 17, i32 19, i32 23, i32 25, i32 29, i32 31, i32 35, i32 37, i32 41, i32 43, i32 47, i32 49, i32 53, i32 55, i32 59, i32 61, i32 65, i32 67, i32 71, i32 73, i32 77, i32 79, i32 83, i32 85, i32 89, i32 91, i32 95, i32 97, i32 101, i32 103, i32 107, i32 109, i32 113, i32 115, i32 119, i32 121, i32 125, i32 127, i32 131, i32 133, i32 137, i32 139, i32 143, i32 145, i32 149, i32 151, i32 155, i32 157, i32 161, i32 163, i32 167, i32 169, i32 173, i32 175, i32 179, i32 181, i32 185, i32 187, i32 191>
  ret <64 x i8> %r
}

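; f3: extract the 64 bytes whose indices are congruent to 2 or 4 (mod 6).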
define <64 x i8> @f3(ptr %p0) {
; AVX2-LABEL: f3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm4
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14]
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa (%rdi), %xmm4
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm6
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
; AVX2-NEXT:    vpor %xmm6, %xmm4, %xmm4
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm0[5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpmovsxwd {{.*#+}} ymm4 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0]
; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm3
; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa 112(%rdi), %xmm3
; AVX2-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm5
; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
; AVX2-NEXT:    vpor %xmm3, %xmm5, %xmm3
; AVX2-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: f3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa 160(%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128]
; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14]
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm2
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT:    vpmovsxwd {{.*#+}} ymm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0]
; AVX512F-NEXT:    vpternlogq $216, %ymm5, %ymm2, %ymm0
; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm6
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm8
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
; AVX512F-NEXT:    vpor %xmm6, %xmm8, %xmm6
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3,4],xmm2[5,6,7]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm2
; AVX512F-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX512F-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
; AVX512F-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX512F-NEXT:    vpternlogq $226, %ymm1, %ymm5, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: f3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128]
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
; AVX512BW-NEXT:    vpor %xmm4, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm4
; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
; AVX512BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-NEXT:    movl $-2097152, %eax # imm = 0xFFE00000
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm2 {%k1}
; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512BW-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm3
; AVX512BW-NEXT:    vpshufb %ymm7, %ymm3, %ymm3
; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm3 {%k1}
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI-LABEL: f3:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,4,8,10,14,16,20,22,26,28,32,34,38,40,44,46,50,52,56,58,62,64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT:    vpermi2b 64(%rdi), %zmm0, %zmm1
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,64,66,70,72,76,78,82,84,88,90,94,96,100,102,106,108,112,114,118,120,124,126]
; AVX512VBMI-NEXT:    vpermi2b 128(%rdi), %zmm1, %zmm0
; AVX512VBMI-NEXT:    retq
  %a0 = load <192 x i8>, ptr %p0
  %r = shufflevector <192 x i8> %a0, <192 x i8> poison, <64 x i32> <i32 2, i32 4, i32 8, i32 10, i32 14, i32 16, i32 20, i32 22, i32 26, i32 28, i32 32, i32 34, i32 38, i32 40, i32 44, i32 46, i32 50, i32 52, i32 56, i32 58, i32 62, i32 64, i32 68, i32 70, i32 74, i32 76, i32 80, i32 82, i32 86, i32 88, i32 92, i32 94, i32 98, i32 100, i32 104, i32 106, i32 110, i32 112, i32 116, i32 118, i32 122, i32 124, i32 128, i32 130, i32 134, i32 136, i32 140, i32 142, i32 146, i32 148, i32 152, i32 154, i32 158, i32 160, i32 164, i32 166, i32 170, i32 172, i32 176, i32 178, i32 182, i32 184, i32 188, i32 190>
  ret <64 x i8> %r
}

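; f4: extract the 64 bytes whose indices are congruent to 0 or 4 (mod 6).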
define <64 x i8> @f4(ptr %p0) {
; AVX2-LABEL: f4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14,0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215]
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14]
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm8
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
; AVX2-NEXT:    vpor %xmm2, %xmm8, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8
; AVX2-NEXT:    vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa 176(%rdi), %xmm2
; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm3
; AVX2-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: f4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa 176(%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14]
; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 160(%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128]
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm4
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm0
; AVX512F-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vpor %xmm0, %xmm4, %xmm0
; AVX512F-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX512F-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
; AVX512F-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: f4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14]
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm2
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm4
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT:    vinserti64x4 $1, 128(%rdi), %zmm2, %zmm2
; AVX512BW-NEXT:    movabsq $8998403163813888, %rax # imm = 0x1FF800001FF800
; AVX512BW-NEXT:    kmovq %rax, %k1
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,34,36,40,42,46,48,52,54,58,60,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm2
; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm4
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
; AVX512BW-NEXT:    movabsq $8796090925056, %rax # imm = 0x7FFFFE00000
; AVX512BW-NEXT:    kmovq %rax, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT:    retq
;
; AVX512VBMI-LABEL: f4:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 64(%rdi), %zmm0
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,0,2,6,8,12,14,18,20,24,26,30,32,36,38,42,44,48,50,54,56,60,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VBMI-NEXT:    vpermi2b (%rdi), %zmm0, %zmm1
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,66,68,72,74,78,80,84,86,90,92,96,98,102,104,108,110,114,116,120,122,126]
; AVX512VBMI-NEXT:    vpermi2b 128(%rdi), %zmm1, %zmm0
; AVX512VBMI-NEXT:    retq
  %a0 = load <192 x i8>, ptr %p0
  %r = shufflevector <192 x i8> %a0, <192 x i8> poison, <64 x i32> <i32 0, i32 4, i32 6, i32 10, i32 12, i32 16, i32 18, i32 22, i32 24, i32 28, i32 30, i32 34, i32 36, i32 40, i32 42, i32 46, i32 48, i32 52, i32 54, i32 58, i32 60, i32 64, i32 66, i32 70, i32 72, i32 76, i32 78, i32 82, i32 84, i32 88, i32 90, i32 94, i32 96, i32 100, i32 102, i32 106, i32 108, i32 112, i32 114, i32 118, i32 120, i32 124, i32 126, i32 130, i32 132, i32 136, i32 138, i32 142, i32 144, i32 148, i32 150, i32 154, i32 156, i32 160, i32 162, i32 166, i32 168, i32 172, i32 174, i32 178, i32 180, i32 184, i32 186, i32 190>
  ret <64 x i8> %r
}
