xref: /llvm-project/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll (revision be6c752e157638849f1f59f7e2b7ecbe11a022fe)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,X86-AVX512,AVX512-SLOW,X86-AVX512-SLOW
3; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,X64-AVX512,AVX512-SLOW,X64-AVX512-SLOW
4; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,X86-AVX512,AVX512-FAST,X86-AVX512-FAST
5; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,X64-AVX512,AVX512-FAST,X64-AVX512-FAST
6; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X86-AVX512F
7; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X64-AVX512F
8
9; Expand 128 -> 256 bits; covers <4 x float> and <2 x double> sources.
; Shuffles %a with a zero vector so that a0 and a1 land in lanes 0 and 2 and
; every other lane is zero (mask index 5 selects an element of the zero vector).
10define <8 x float> @expand(<4 x float> %a) {
11; AVX512-LABEL: expand:
12; AVX512:       # %bb.0:
13; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
14; AVX512-NEXT:    movb $5, %al
15; AVX512-NEXT:    kmovd %eax, %k1
16; AVX512-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
17; AVX512-NEXT:    ret{{[l|q]}}
18;
19; AVX512F-LABEL: expand:
20; AVX512F:       # %bb.0:
21; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
22; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
23; AVX512F-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
24; AVX512F-NEXT:    ret{{[l|q]}}
25   %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 5, i32 1, i32 5, i32 5, i32 5, i32 5, i32 5>
26   ret <8 x float> %res
27}
28
; The zero vector is the first shuffle operand here: zeros fill the even lanes
; and a0..a3 fill the odd lanes of the result.
29define <8 x float> @expand1(<4 x float> %a ) {
30; AVX512-LABEL: expand1:
31; AVX512:       # %bb.0:
32; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
33; AVX512-NEXT:    movb $-86, %al
34; AVX512-NEXT:    kmovd %eax, %k1
35; AVX512-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
36; AVX512-NEXT:    ret{{[l|q]}}
37;
38; AVX512F-LABEL: expand1:
39; AVX512F:       # %bb.0:
40; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
41; AVX512F-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [16,0,18,1,20,2,22,3]
42; AVX512F-NEXT:    vxorps %xmm2, %xmm2, %xmm2
43; AVX512F-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
44; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
45; AVX512F-NEXT:    ret{{[l|q]}}
46   %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
47   ret <8 x float> %res
48}
49
50;Expand 128 -> 256 test <2 x double> -> <4 x double>
; Result is <a0, 0, 0, a1> (mask index 2 selects an element of the zero vector).
; All RUN configurations share one expected sequence under the common prefix.
51define <4 x double> @expand2(<2 x double> %a) {
52; CHECK-LABEL: expand2:
53; CHECK:       # %bb.0:
54; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
55; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
56; CHECK-NEXT:    vmovaps %xmm0, %xmm0
57; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
58; CHECK-NEXT:    ret{{[l|q]}}
59   %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 2, i32 1>
60   ret <4 x double> %res
61}
62
63;expand 128 -> 256 include case <4 x i32> <8 x i32>
; Result is <a0, 0, 0, 0, 0, 0, 0, a1>: mask indices 4 and 5 select %a, while
; index 0 selects an element of the zero vector (the first operand).
64define <8 x i32> @expand3(<4 x i32> %a ) {
65; AVX512-LABEL: expand3:
66; AVX512:       # %bb.0:
67; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
68; AVX512-NEXT:    movb $-127, %al
69; AVX512-NEXT:    kmovd %eax, %k1
70; AVX512-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z}
71; AVX512-NEXT:    ret{{[l|q]}}
72;
73; AVX512F-LABEL: expand3:
74; AVX512F:       # %bb.0:
75; AVX512F-NEXT:    vbroadcastsd %xmm0, %ymm0
76; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
77; AVX512F-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
78; AVX512F-NEXT:    ret{{[l|q]}}
79   %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> <i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,i32 5>
80   ret <8 x i32> %res
81}
82
83;expand 128 -> 256 include case <2 x i64> <4 x i64>
; Result is <a0, 0, 0, a1>: mask indices 2 and 3 select %a, while index 0
; selects an element of the zero vector (the first operand).
84define <4 x i64> @expand4(<2 x i64> %a ) {
85; AVX512-LABEL: expand4:
86; AVX512:       # %bb.0:
87; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
88; AVX512-NEXT:    movb $9, %al
89; AVX512-NEXT:    kmovd %eax, %k1
90; AVX512-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z}
91; AVX512-NEXT:    ret{{[l|q]}}
92;
93; AVX512F-LABEL: expand4:
94; AVX512F:       # %bb.0:
95; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
96; AVX512F-NEXT:    vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
97; AVX512F-NEXT:    vmovaps %xmm0, %xmm0
98; AVX512F-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
99; AVX512F-NEXT:    ret{{[l|q]}}
100   %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> <i32 2, i32 0, i32 0, i32 3>
101   ret <4 x i64> %res
102}
103
104; Negative test for 128 -> 256: the expected code contains no expand instruction.
; a0 is repeated in every odd lane (mask index 4 is used four times) and the
; even lanes are zero. The expected code is a broadcast + blend, or a variable
; two-source permute on the fast-shuffle runs, rather than vexpandps.
105define <8 x float> @expand5(<4 x float> %a ) {
106; AVX512-SLOW-LABEL: expand5:
107; AVX512-SLOW:       # %bb.0:
108; AVX512-SLOW-NEXT:    vbroadcastss %xmm0, %ymm0
109; AVX512-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
110; AVX512-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
111; AVX512-SLOW-NEXT:    ret{{[l|q]}}
112;
113; AVX512-FAST-LABEL: expand5:
114; AVX512-FAST:       # %bb.0:
115; AVX512-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
116; AVX512-FAST-NEXT:    vxorps %xmm1, %xmm1, %xmm1
117; AVX512-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [8,10,12,14]
118; AVX512-FAST-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
119; AVX512-FAST-NEXT:    ret{{[l|q]}}
120;
121; AVX512F-LABEL: expand5:
122; AVX512F:       # %bb.0:
123; AVX512F-NEXT:    vbroadcastss %xmm0, %ymm0
124; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
125; AVX512F-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
126; AVX512F-NEXT:    ret{{[l|q]}}
127   %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
128   ret <8 x float> %res
129}
130
131; expand6 is still a 128 -> 256 case (<4 x float> -> <8 x float>); the 256 -> 512 tests for <8 x float> -> <16 x float> start at expand7.
; Identity concatenation of a zero vector (low half) with %a (high half).
; Expected lowering for every RUN line is vxorps + vinsertf128.
132define <8 x float> @expand6(<4 x float> %a ) {
133; CHECK-LABEL: expand6:
134; CHECK:       # %bb.0:
135; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
136; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
137; CHECK-NEXT:    ret{{[l|q]}}
138   %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
139   ret <8 x float> %res
140}
141
; a0..a3 land in lanes 0, 2, 8 and 10 of the 512-bit result; all other lanes
; are zero (mask index 8 selects an element of the zero vector).
142define <16 x float> @expand7(<8 x float> %a) {
143; AVX512-LABEL: expand7:
144; AVX512:       # %bb.0:
145; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
146; AVX512-NEXT:    movw $1285, %ax # imm = 0x505
147; AVX512-NEXT:    kmovd %eax, %k1
148; AVX512-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
149; AVX512-NEXT:    ret{{[l|q]}}
150;
151; AVX512F-LABEL: expand7:
152; AVX512F:       # %bb.0:
153; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
154; AVX512F-NEXT:    movw $1285, %ax # imm = 0x505
155; AVX512F-NEXT:    kmovw %eax, %k1
156; AVX512F-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
157; AVX512F-NEXT:    ret{{[l|q]}}
158   %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 8, i32 8, i32 8, i32 8, i32 2, i32 8, i32 3, i32 8, i32 8, i32 8, i32 8, i32 8>
159   ret <16 x float> %res
160}
161
; The zero vector is the first shuffle operand: zeros fill the even lanes and
; a0..a7 fill the odd lanes of the 512-bit result.
162define <16 x float> @expand8(<8 x float> %a ) {
163; AVX512-LABEL: expand8:
164; AVX512:       # %bb.0:
165; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
166; AVX512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
167; AVX512-NEXT:    kmovd %eax, %k1
168; AVX512-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
169; AVX512-NEXT:    ret{{[l|q]}}
170;
171; AVX512F-LABEL: expand8:
172; AVX512F:       # %bb.0:
173; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
174; AVX512F-NEXT:    movw $-21846, %ax # imm = 0xAAAA
175; AVX512F-NEXT:    kmovw %eax, %k1
176; AVX512F-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
177; AVX512F-NEXT:    ret{{[l|q]}}
178   %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
179   ret <16 x float> %res
180}
181
182;expand 256 -> 512 include <4 x double> <8 x double>
; Result is <a0, 0, 0, 0, 0, 0, 0, a1> (mask index 4 selects an element of the
; zero vector).
183define <8 x double> @expand9(<4 x double> %a) {
184; AVX512-LABEL: expand9:
185; AVX512:       # %bb.0:
186; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
187; AVX512-NEXT:    movb $-127, %al
188; AVX512-NEXT:    kmovd %eax, %k1
189; AVX512-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
190; AVX512-NEXT:    ret{{[l|q]}}
191;
192; AVX512F-LABEL: expand9:
193; AVX512F:       # %bb.0:
194; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
195; AVX512F-NEXT:    movb $-127, %al
196; AVX512F-NEXT:    kmovw %eax, %k1
197; AVX512F-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
198; AVX512F-NEXT:    ret{{[l|q]}}
199   %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
200   ret <8 x double> %res
201}
202
; Integer (<8 x i32> -> <16 x i32>) variant: zeros fill the even lanes and
; a0..a7 fill the odd lanes; the expected instruction is vpexpandd.
203define <16 x i32> @expand10(<8 x i32> %a ) {
204; AVX512-LABEL: expand10:
205; AVX512:       # %bb.0:
206; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
207; AVX512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
208; AVX512-NEXT:    kmovd %eax, %k1
209; AVX512-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
210; AVX512-NEXT:    ret{{[l|q]}}
211;
212; AVX512F-LABEL: expand10:
213; AVX512F:       # %bb.0:
214; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
215; AVX512F-NEXT:    movw $-21846, %ax # imm = 0xAAAA
216; AVX512F-NEXT:    kmovw %eax, %k1
217; AVX512F-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
218; AVX512F-NEXT:    ret{{[l|q]}}
219   %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
220   ret <16 x i32> %res
221}
222
; Integer (<4 x i64> -> <8 x i64>) variant: result is
; <a0, 0, 0, 0, 0, 0, 0, a1>; the expected instruction is vpexpandq.
223define <8 x i64> @expand11(<4 x i64> %a) {
224; AVX512-LABEL: expand11:
225; AVX512:       # %bb.0:
226; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
227; AVX512-NEXT:    movb $-127, %al
228; AVX512-NEXT:    kmovd %eax, %k1
229; AVX512-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
230; AVX512-NEXT:    ret{{[l|q]}}
231;
232; AVX512F-LABEL: expand11:
233; AVX512F:       # %bb.0:
234; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
235; AVX512F-NEXT:    movb $-127, %al
236; AVX512F-NEXT:    kmovw %eax, %k1
237; AVX512F-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
238; AVX512F-NEXT:    ret{{[l|q]}}
239   %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
240   ret <8 x i64> %res
241}
242
243; Negative test for 256 -> 512: the expected code contains no expand instruction.
; a0 is repeated in every odd lane (mask index 8 is used eight times) and the
; even lanes are zero. The expected code is a two-source variable permute
; (vpermt2ps) rather than vexpandps.
244define <16 x float> @expand12(<8 x float> %a) {
245; CHECK-LABEL: expand12:
246; CHECK:       # %bb.0:
247; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
248; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
249; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
250; CHECK-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
251; CHECK-NEXT:    vmovaps %zmm1, %zmm0
252; CHECK-NEXT:    ret{{[l|q]}}
253   %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8,i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
254   ret <16 x float> %res
255}
256
; Identity concatenation of a zero vector (low half) with %a (high half).
; Expected lowering for every RUN line is vxorps + vinsertf64x4.
257define <16 x float> @expand13(<8 x float> %a ) {
258; CHECK-LABEL: expand13:
259; CHECK:       # %bb.0:
260; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
261; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
262; CHECK-NEXT:    ret{{[l|q]}}
263   %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
264   ret <16 x float> %res
265}
266
267; Checks the case where one shuffle operand is a vector of mixed zero and non-zero values, and the shuffle mask selects only the zero elements of that vector.
268
; %addV is a constant fadd whose lanes 0 and 3 are 0.0 + 0.0. The shuffle mask
; reads only those two lanes of %addV (indices 0 and 3), so the result is
; <0, 0, a0, 0, a1, 0, 0, 0>.
269define <8 x float> @expand14(<4 x float> %a) {
270; AVX512-LABEL: expand14:
271; AVX512:       # %bb.0:
272; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
273; AVX512-NEXT:    movb $20, %al
274; AVX512-NEXT:    kmovd %eax, %k1
275; AVX512-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
276; AVX512-NEXT:    ret{{[l|q]}}
277;
278; AVX512F-LABEL: expand14:
279; AVX512F:       # %bb.0:
280; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
281; AVX512F-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [16,17,0,19,1,21,22,23]
282; AVX512F-NEXT:    vxorps %xmm2, %xmm2, %xmm2
283; AVX512F-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
284; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
285; AVX512F-NEXT:    ret{{[l|q]}}
286   %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
287   %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 3, i32 3, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
288   ret <8 x float> %res
289}
290
291;Negative test.
; Same constant %addV as in expand14, but mask index 1 reads lane 1 of %addV
; (1.0 + 1.0), which is not zero. The expected code blends %a with a constant
; loaded from memory instead of using vexpandps.
292define <8 x float> @expand15(<4 x float> %a) {
293; AVX512-SLOW-LABEL: expand15:
294; AVX512-SLOW:       # %bb.0:
295; AVX512-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
296; AVX512-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
297; AVX512-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
298; AVX512-SLOW-NEXT:    ret{{[l|q]}}
299;
300; AVX512-FAST-LABEL: expand15:
301; AVX512-FAST:       # %bb.0:
302; AVX512-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
303; AVX512-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,0,1,0]
304; AVX512-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
305; AVX512-FAST-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
306; AVX512-FAST-NEXT:    ret{{[l|q]}}
307;
308; AVX512F-LABEL: expand15:
309; AVX512F:       # %bb.0:
310; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
311; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
312; AVX512F-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
313; AVX512F-NEXT:    ret{{[l|q]}}
314   %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
315   %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
316   ret <8 x float> %res
317}
318
319; Shuffle to blend test
320
; Byte blend: even bytes come from %W, odd bytes from %A. With +avx512bw the
; 64-bit mask is built per target (kmovd + kunpckdq on i386, movabsq + kmovq
; on x86_64); the avx512f-only runs expect a vpternlogq with a byte-mask
; constant instead.
321define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
322; X86-AVX512-LABEL: test_mm512_mask_blend_epi8:
323; X86-AVX512:       # %bb.0: # %entry
324; X86-AVX512-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
325; X86-AVX512-NEXT:    kmovd %eax, %k0
326; X86-AVX512-NEXT:    kunpckdq %k0, %k0, %k1
327; X86-AVX512-NEXT:    vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
328; X86-AVX512-NEXT:    retl
329;
330; X64-AVX512-LABEL: test_mm512_mask_blend_epi8:
331; X64-AVX512:       # %bb.0: # %entry
332; X64-AVX512-NEXT:    movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
333; X64-AVX512-NEXT:    kmovq %rax, %k1
334; X64-AVX512-NEXT:    vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
335; X64-AVX512-NEXT:    retq
336;
337; AVX512F-LABEL: test_mm512_mask_blend_epi8:
338; AVX512F:       # %bb.0: # %entry
339; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
340; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
341; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1))
342; AVX512F-NEXT:    ret{{[l|q]}}
343entry:
344  %0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32>  <i32 64, i32 1, i32 66, i32 3, i32 68, i32 5, i32 70, i32 7, i32 72, i32 9, i32 74, i32 11, i32 76, i32 13, i32 78, i32 15, i32 80, i32 17, i32 82, i32 19, i32 84, i32 21, i32 86, i32 23, i32 88, i32 25, i32 90, i32 27, i32 92, i32 29, i32 94, i32 31, i32 96, i32 33, i32 98, i32 35, i32 100, i32 37, i32 102, i32 39, i32 104, i32 41, i32 106, i32 43, i32 108, i32 45, i32 110, i32 47, i32 112, i32 49, i32 114, i32 51, i32 116, i32 53, i32 118, i32 55, i32 120, i32 57, i32 122, i32 59, i32 124, i32 61, i32 126, i32 63>
345  ret <64 x i8> %0
346}
347
; Word blend: even words come from %W, odd words from %A. With +avx512bw the
; expected code is a masked vpblendmw; the avx512f-only runs expect a
; vpternlogd with a memory operand.
348define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
349; AVX512-LABEL: test_mm512_mask_blend_epi16:
350; AVX512:       # %bb.0: # %entry
351; AVX512-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
352; AVX512-NEXT:    kmovd %eax, %k1
353; AVX512-NEXT:    vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
354; AVX512-NEXT:    ret{{[l|q]}}
355;
356; AVX512F-LABEL: test_mm512_mask_blend_epi16:
357; AVX512F:       # %bb.0: # %entry
358; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
359; AVX512F-NEXT:    ret{{[l|q]}}
360entry:
361  %0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32>  <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
362  ret <32 x i16> %0
363}
364
; Dword blend: even lanes come from %W, odd lanes from %A. Every run expects
; a masked vpblendmd with mask 0xAAAA; only the k-register move differs
; (kmovd vs. kmovw).
365define <16 x i32> @test_mm512_mask_blend_epi32(<16 x i32> %A, <16 x i32> %W){
366; AVX512-LABEL: test_mm512_mask_blend_epi32:
367; AVX512:       # %bb.0: # %entry
368; AVX512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
369; AVX512-NEXT:    kmovd %eax, %k1
370; AVX512-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
371; AVX512-NEXT:    ret{{[l|q]}}
372;
373; AVX512F-LABEL: test_mm512_mask_blend_epi32:
374; AVX512F:       # %bb.0: # %entry
375; AVX512F-NEXT:    movw $-21846, %ax # imm = 0xAAAA
376; AVX512F-NEXT:    kmovw %eax, %k1
377; AVX512F-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
378; AVX512F-NEXT:    ret{{[l|q]}}
379entry:
380  %0 = shufflevector <16 x i32> %A, <16 x i32> %W, <16 x i32>  <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
381  ret <16 x i32> %0
382}
383
; Qword blend: even lanes come from %W, odd lanes from %A. Every run expects
; a masked vpblendmq; only the k-register move differs (kmovd vs. kmovw).
384define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){
385; AVX512-LABEL: test_mm512_mask_blend_epi64:
386; AVX512:       # %bb.0: # %entry
387; AVX512-NEXT:    movb $-86, %al
388; AVX512-NEXT:    kmovd %eax, %k1
389; AVX512-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
390; AVX512-NEXT:    ret{{[l|q]}}
391;
392; AVX512F-LABEL: test_mm512_mask_blend_epi64:
393; AVX512F:       # %bb.0: # %entry
394; AVX512F-NEXT:    movb $-86, %al
395; AVX512F-NEXT:    kmovw %eax, %k1
396; AVX512F-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
397; AVX512F-NEXT:    ret{{[l|q]}}
398entry:
399  %0 = shufflevector <8 x i64> %A, <8 x i64> %W, <8 x i32>  <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
400  ret <8 x i64> %0
401}
402
; Single-precision blend: even lanes come from %W, odd lanes from %A. Every
; run expects a masked vblendmps with mask 0xAAAA.
403define <16 x float> @test_mm512_mask_blend_ps(<16 x float> %A, <16 x float> %W){
404; AVX512-LABEL: test_mm512_mask_blend_ps:
405; AVX512:       # %bb.0: # %entry
406; AVX512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
407; AVX512-NEXT:    kmovd %eax, %k1
408; AVX512-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
409; AVX512-NEXT:    ret{{[l|q]}}
410;
411; AVX512F-LABEL: test_mm512_mask_blend_ps:
412; AVX512F:       # %bb.0: # %entry
413; AVX512F-NEXT:    movw $-21846, %ax # imm = 0xAAAA
414; AVX512F-NEXT:    kmovw %eax, %k1
415; AVX512F-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
416; AVX512F-NEXT:    ret{{[l|q]}}
417entry:
418  %0 = shufflevector <16 x float> %A, <16 x float> %W, <16 x i32>  <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
419  ret <16 x float> %0
420}
421
; Double-precision blend with a non-alternating pattern: lanes 0, 1, 2, 4 and
; 6 come from %W, lanes 3, 5 and 7 come from %A. Every run expects a masked
; vblendmpd.
422define <8 x double> @test_mm512_mask_blend_pd(<8 x double> %A, <8 x double> %W){
423; AVX512-LABEL: test_mm512_mask_blend_pd:
424; AVX512:       # %bb.0: # %entry
425; AVX512-NEXT:    movb $-88, %al
426; AVX512-NEXT:    kmovd %eax, %k1
427; AVX512-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
428; AVX512-NEXT:    ret{{[l|q]}}
429;
430; AVX512F-LABEL: test_mm512_mask_blend_pd:
431; AVX512F:       # %bb.0: # %entry
432; AVX512F-NEXT:    movb $-88, %al
433; AVX512F-NEXT:    kmovw %eax, %k1
434; AVX512F-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
435; AVX512F-NEXT:    ret{{[l|q]}}
436entry:
437  %0 = shufflevector <8 x double> %A, <8 x double> %W, <8 x i32>  <i32 8, i32 9, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
438  ret <8 x double> %0
439}
440
441
; 256-bit byte blend: even bytes come from %W, odd bytes from %A. With
; +avx512bw,+avx512vl the expected code is a masked vpblendmb on ymm; the
; avx512f-only runs expect vpblendvb with a byte-mask constant.
442define <32 x i8> @test_mm256_mask_blend_epi8(<32 x i8> %A, <32 x i8> %W){
443; AVX512-LABEL: test_mm256_mask_blend_epi8:
444; AVX512:       # %bb.0: # %entry
445; AVX512-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
446; AVX512-NEXT:    kmovd %eax, %k1
447; AVX512-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
448; AVX512-NEXT:    ret{{[l|q]}}
449;
450; AVX512F-LABEL: test_mm256_mask_blend_epi8:
451; AVX512F:       # %bb.0: # %entry
452; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
453; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
454; AVX512F-NEXT:    ret{{[l|q]}}
455entry:
456  %0 = shufflevector <32 x i8> %A, <32 x i8> %W, <32 x i32>  <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
457  ret <32 x i8> %0
458}
459
; 128-bit byte blend: even bytes come from %W, odd bytes from %A. With
; +avx512bw,+avx512vl the expected code is a masked vpblendmb on xmm; the
; avx512f-only runs expect vpblendvb with a byte-mask constant.
460define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){
461; AVX512-LABEL: test_mm_mask_blend_epi8:
462; AVX512:       # %bb.0: # %entry
463; AVX512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
464; AVX512-NEXT:    kmovd %eax, %k1
465; AVX512-NEXT:    vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
466; AVX512-NEXT:    ret{{[l|q]}}
467;
468; AVX512F-LABEL: test_mm_mask_blend_epi8:
469; AVX512F:       # %bb.0: # %entry
470; AVX512F-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
471; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
472; AVX512F-NEXT:    ret{{[l|q]}}
473entry:
474  %0 = shufflevector <16 x i8> %A, <16 x i8> %W, <16 x i32>  <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
475  ret <16 x i8> %0
476}
477
478; PR34370
; Loads a vector, permutes it, then selects per lane between the permuted
; value and %vec2 using a constant i1 mask. The expected code folds the
; shuffle and the select into a single two-source vpermt2ps (widened to zmm
; on the avx512f-only runs).
479define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) {
480; X86-AVX512-LABEL: test_masked_permps_v8f32:
481; X86-AVX512:       # %bb.0:
482; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
483; X86-AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7]
484; X86-AVX512-NEXT:    vpermt2ps (%eax), %ymm1, %ymm0
485; X86-AVX512-NEXT:    retl
486;
487; X64-AVX512-LABEL: test_masked_permps_v8f32:
488; X64-AVX512:       # %bb.0:
489; X64-AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7]
490; X64-AVX512-NEXT:    vpermt2ps (%rdi), %ymm1, %ymm0
491; X64-AVX512-NEXT:    retq
492;
493; X86-AVX512F-LABEL: test_masked_permps_v8f32:
494; X86-AVX512F:       # %bb.0:
495; X86-AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
496; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
497; X86-AVX512F-NEXT:    vmovaps (%eax), %ymm1
498; X86-AVX512F-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7]
499; X86-AVX512F-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
500; X86-AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
501; X86-AVX512F-NEXT:    retl
502;
503; X64-AVX512F-LABEL: test_masked_permps_v8f32:
504; X64-AVX512F:       # %bb.0:
505; X64-AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
506; X64-AVX512F-NEXT:    vmovaps (%rdi), %ymm1
507; X64-AVX512F-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7]
508; X64-AVX512F-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
509; X64-AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
510; X64-AVX512F-NEXT:    retq
511  %vec = load <8 x float>, ptr %vp
512  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0>
513  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
514  ret <8 x float> %res
515}
516
; 512-bit version of the permute + constant-mask select pattern. Every run
; expects the shuffle and the select to fold into a single vpermt2ps with the
; loaded vector as a memory operand.
517define <16 x float> @test_masked_permps_v16f32(ptr %vp, <16 x float> %vec2) {
518; X86-AVX512-LABEL: test_masked_permps_v16f32:
519; X86-AVX512:       # %bb.0:
520; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
521; X86-AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
522; X86-AVX512-NEXT:    vpermt2ps (%eax), %zmm1, %zmm0
523; X86-AVX512-NEXT:    retl
524;
525; X64-AVX512-LABEL: test_masked_permps_v16f32:
526; X64-AVX512:       # %bb.0:
527; X64-AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
528; X64-AVX512-NEXT:    vpermt2ps (%rdi), %zmm1, %zmm0
529; X64-AVX512-NEXT:    retq
530;
531; X86-AVX512F-LABEL: test_masked_permps_v16f32:
532; X86-AVX512F:       # %bb.0:
533; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
534; X86-AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
535; X86-AVX512F-NEXT:    vpermt2ps (%eax), %zmm1, %zmm0
536; X86-AVX512F-NEXT:    retl
537;
538; X64-AVX512F-LABEL: test_masked_permps_v16f32:
539; X64-AVX512F:       # %bb.0:
540; X64-AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
541; X64-AVX512F-NEXT:    vpermt2ps (%rdi), %zmm1, %zmm0
542; X64-AVX512F-NEXT:    retq
543  %vec = load <16 x float>, ptr %vp
544  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 14, i32 12, i32 10, i32 8, i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0>
545  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec2
546  ret <16 x float> %res
547}
548
; Builds and stores two <8 x i32> values whose lanes 2..7 are zero:
;  - the first has element 11 of the <16 x i32> at %src in both lanes 0 and 1
;    and is stored at %dst + 21 * 32 bytes;
;  - the second has elements 5 and 4 (in that order) of the <16 x i32> loaded
;    from %src + 24 * 8 bytes and is stored at %dst + 26 * 32 bytes.
; The fast-shuffle runs expect vpshufb for the second value; the other runs
; expect vpshufd from memory.
549define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) {
550; X86-AVX512-SLOW-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
551; X86-AVX512-SLOW:       # %bb.0:
552; X86-AVX512-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
553; X86-AVX512-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
554; X86-AVX512-SLOW-NEXT:    vpbroadcastd 44(%ecx), %xmm0
555; X86-AVX512-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
556; X86-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 672(%eax)
557; X86-AVX512-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
558; X86-AVX512-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
559; X86-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 832(%eax)
560; X86-AVX512-SLOW-NEXT:    vzeroupper
561; X86-AVX512-SLOW-NEXT:    retl
562;
563; X64-AVX512-SLOW-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
564; X64-AVX512-SLOW:       # %bb.0:
565; X64-AVX512-SLOW-NEXT:    vpbroadcastd 44(%rdi), %xmm0
566; X64-AVX512-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
567; X64-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 672(%rsi)
568; X64-AVX512-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
569; X64-AVX512-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
570; X64-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 832(%rsi)
571; X64-AVX512-SLOW-NEXT:    vzeroupper
572; X64-AVX512-SLOW-NEXT:    retq
573;
574; X86-AVX512-FAST-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
575; X86-AVX512-FAST:       # %bb.0:
576; X86-AVX512-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
577; X86-AVX512-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
578; X86-AVX512-FAST-NEXT:    vpbroadcastd 44(%ecx), %xmm0
579; X86-AVX512-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
580; X86-AVX512-FAST-NEXT:    vmovdqa %ymm0, 672(%eax)
581; X86-AVX512-FAST-NEXT:    vmovdqa 208(%ecx), %xmm0
582; X86-AVX512-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
583; X86-AVX512-FAST-NEXT:    vmovdqa %ymm0, 832(%eax)
584; X86-AVX512-FAST-NEXT:    vzeroupper
585; X86-AVX512-FAST-NEXT:    retl
586;
587; X64-AVX512-FAST-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
588; X64-AVX512-FAST:       # %bb.0:
589; X64-AVX512-FAST-NEXT:    vpbroadcastd 44(%rdi), %xmm0
590; X64-AVX512-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
591; X64-AVX512-FAST-NEXT:    vmovdqa %ymm0, 672(%rsi)
592; X64-AVX512-FAST-NEXT:    vmovdqa 208(%rdi), %xmm0
593; X64-AVX512-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
594; X64-AVX512-FAST-NEXT:    vmovdqa %ymm0, 832(%rsi)
595; X64-AVX512-FAST-NEXT:    vzeroupper
596; X64-AVX512-FAST-NEXT:    retq
597;
598; X86-AVX512F-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
599; X86-AVX512F:       # %bb.0:
600; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
601; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
602; X86-AVX512F-NEXT:    vpbroadcastd 44(%ecx), %xmm0
603; X86-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
604; X86-AVX512F-NEXT:    vmovdqa %ymm0, 672(%eax)
605; X86-AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
606; X86-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
607; X86-AVX512F-NEXT:    vmovdqa %ymm0, 832(%eax)
608; X86-AVX512F-NEXT:    vzeroupper
609; X86-AVX512F-NEXT:    retl
610;
611; X64-AVX512F-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
612; X64-AVX512F:       # %bb.0:
613; X64-AVX512F-NEXT:    vpbroadcastd 44(%rdi), %xmm0
614; X64-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
615; X64-AVX512F-NEXT:    vmovdqa %ymm0, 672(%rsi)
616; X64-AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
617; X64-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
618; X64-AVX512F-NEXT:    vmovdqa %ymm0, 832(%rsi)
619; X64-AVX512F-NEXT:    vzeroupper
620; X64-AVX512F-NEXT:    retq
621  %t87 = load <16 x i32>, ptr %src, align 64
622  %t88 = extractelement <16 x i32> %t87, i64 11
623  %t89 = insertelement <8 x i32> <i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %t88, i64 0
624  %t90 = insertelement <8 x i32> %t89, i32 %t88, i64 1
625  %ptridx49.i = getelementptr inbounds <8 x i32>, ptr %dst, i64 21
626  store <8 x i32> %t90, ptr %ptridx49.i, align 32
627  %ptridx56.i = getelementptr inbounds <2 x i32>, ptr %src, i64 24
628  %t09 = load <16 x i32>, ptr %ptridx56.i, align 64
629  %t10 = extractelement <16 x i32> %t09, i64 5
630  %t11 = insertelement <8 x i32> <i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %t10, i64 0
631  %t12 = extractelement <16 x i32> %t09, i64 4
632  %t13 = insertelement <8 x i32> %t11, i32 %t12, i64 1
633  %ptridx64.i = getelementptr inbounds <8 x i32>, ptr %dst, i64 26
634  store <8 x i32> %t13, ptr %ptridx64.i, align 32
635  ret void
636}
637
638define <32 x float> @PR47534(<8 x float> %tmp) {
639; CHECK-LABEL: PR47534:
640; CHECK:       # %bb.0:
641; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
642; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
643; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [7,25,26,27,7,29,30,31,7,25,26,27,7,29,30,31]
644; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
645; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1
646; CHECK-NEXT:    ret{{[l|q]}}
647  %tmp1 = shufflevector <8 x float> %tmp, <8 x float> undef, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
648  %tmp2 = shufflevector <32 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <32 x float> undef, <32 x i32> <i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 29, i32 30, i32 31>
649  %tmp18 = shufflevector <32 x float> %tmp2, <32 x float> %tmp1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 29, i32 30, i32 31>
650  ret <32 x float> %tmp18
651}
652
; %union1 wraps a single <16 x float>; @src1 is an externally defined,
; 64-byte-aligned global of that type, used as the load source in PR43170.
653%union1= type { <16 x float> }
654@src1 = external dso_local local_unnamed_addr global %union1, align 64
655
; Regression test named after PR43170: loads <8 x float> from @src1, widens it
; to <16 x float> with zeros in the upper eight lanes (mask indices 8..15
; select the zero vector), and stores the result to %a0. Every run expects a
; 256-bit vmovaps load followed directly by a 512-bit vmovaps store.
656define void @PR43170(ptr %a0) {
657; X86-AVX512-LABEL: PR43170:
658; X86-AVX512:       # %bb.0: # %entry
659; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
660; X86-AVX512-NEXT:    vmovaps src1, %ymm0
661; X86-AVX512-NEXT:    vmovaps %zmm0, (%eax)
662; X86-AVX512-NEXT:    vzeroupper
663; X86-AVX512-NEXT:    retl
664;
665; X64-AVX512-LABEL: PR43170:
666; X64-AVX512:       # %bb.0: # %entry
667; X64-AVX512-NEXT:    vmovaps src1(%rip), %ymm0
668; X64-AVX512-NEXT:    vmovaps %zmm0, (%rdi)
669; X64-AVX512-NEXT:    vzeroupper
670; X64-AVX512-NEXT:    retq
671;
672; X86-AVX512F-LABEL: PR43170:
673; X86-AVX512F:       # %bb.0: # %entry
674; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
675; X86-AVX512F-NEXT:    vmovaps src1, %ymm0
676; X86-AVX512F-NEXT:    vmovaps %zmm0, (%eax)
677; X86-AVX512F-NEXT:    vzeroupper
678; X86-AVX512F-NEXT:    retl
679;
680; X64-AVX512F-LABEL: PR43170:
681; X64-AVX512F:       # %bb.0: # %entry
682; X64-AVX512F-NEXT:    vmovaps src1(%rip), %ymm0
683; X64-AVX512F-NEXT:    vmovaps %zmm0, (%rdi)
684; X64-AVX512F-NEXT:    vzeroupper
685; X64-AVX512F-NEXT:    retq
686entry:
687  %0 = load <8 x float>, ptr @src1, align 64
688  %1 = shufflevector <8 x float> %0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
689  store <16 x float> %1, ptr %a0, align 64
690  ret void
691}
692