xref: /llvm-project/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (revision 7457f51f6cf61b960e3e6e45e63378debd5c1d5c)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST %s
3; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST-PERLANE %s
4
5; FIXME: All cases here should be fixed by PR34380
6
; Unmasked register case: builds an <8 x i16> result from lanes
; <8,6,12,4,7,9,14,8> of the <16 x i16> argument %vec. The assertions expect a
; single ymm vpermw driven by that same index constant, then vzeroupper.
7define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
8; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
9; CHECK:       # %bb.0:
10; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8]
11; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
12; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
13; CHECK-NEXT:    vzeroupper
14; CHECK-NEXT:    retq
15  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
16  ret <8 x i16> %res
17}
; Merge-masked register case: shuffles %vec with indices <8,6,12,4,7,9,14,8>,
; then takes the shuffled lane where the matching %mask lane equals zero and
; the %vec2 lane otherwise. The assertions expect the select to be folded into
; the ymm vpermw as a {%k1} write-mask produced by vptestnmw.
18define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
19; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
20; CHECK:       # %bb.0:
21; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
22; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8]
23; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
24; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
25; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
26; CHECK-NEXT:    vzeroupper
27; CHECK-NEXT:    retq
28  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
29  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
30  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
31  ret <8 x i16> %res
32}
33
; Zero-masked register case: shuffles %vec with indices <8,6,12,4,7,9,14,8>,
; then keeps the shuffled lane where the matching %mask lane equals zero and
; produces zero otherwise. The assertions expect the select to be folded into
; the ymm vpermw as a {%k1} {z} mask produced by vptestnmw.
34define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
35; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
36; CHECK:       # %bb.0:
37; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8]
38; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
39; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
40; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
41; CHECK-NEXT:    vzeroupper
42; CHECK-NEXT:    retq
43  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
44  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
45  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
46  ret <8 x i16> %res
47}
; Merge-masked register case: shuffles %vec with indices
; <4,12,9,4,14,15,12,14>, then takes the shuffled lane where the matching %mask
; lane equals zero and the %vec2 lane otherwise. The assertions expect the
; select to be folded into the ymm vpermw as a {%k1} write-mask.
48define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
49; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
50; CHECK:       # %bb.0:
51; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
52; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14]
53; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
54; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
55; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
56; CHECK-NEXT:    vzeroupper
57; CHECK-NEXT:    retq
58  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
59  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
60  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
61  ret <8 x i16> %res
62}
63
; Zero-masked register case: shuffles %vec with indices
; <4,12,9,4,14,15,12,14>, then keeps the shuffled lane where the matching %mask
; lane equals zero and produces zero otherwise. The assertions expect the
; select to be folded into the ymm vpermw as a {%k1} {z} mask.
64define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
65; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
66; CHECK:       # %bb.0:
67; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14]
68; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
69; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
70; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
71; CHECK-NEXT:    vzeroupper
72; CHECK-NEXT:    retq
73  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
74  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
75  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
76  ret <8 x i16> %res
77}
; Merge-masked register case: shuffles %vec with indices
; <4,11,14,10,7,1,6,9>, then takes the shuffled lane where the matching %mask
; lane equals zero and the %vec2 lane otherwise. The assertions expect the
; select to be folded into the ymm vpermw as a {%k1} write-mask.
78define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
79; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
80; CHECK:       # %bb.0:
81; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
82; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9]
83; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
84; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
85; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
86; CHECK-NEXT:    vzeroupper
87; CHECK-NEXT:    retq
88  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
89  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
90  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
91  ret <8 x i16> %res
92}
93
; Zero-masked register case: shuffles %vec with indices
; <4,11,14,10,7,1,6,9>, then keeps the shuffled lane where the matching %mask
; lane equals zero and produces zero otherwise. The assertions expect the
; select to be folded into the ymm vpermw as a {%k1} {z} mask.
94define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
95; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
96; CHECK:       # %bb.0:
97; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9]
98; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
99; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
100; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
101; CHECK-NEXT:    vzeroupper
102; CHECK-NEXT:    retq
103  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
104  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
105  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
106  ret <8 x i16> %res
107}
; Unmasked register case: builds an <8 x i16> result from lanes
; <14,15,7,13,4,12,8,0> of the <16 x i16> argument %vec. The assertions expect
; a single ymm vpermw driven by that same index constant, then vzeroupper.
108define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
109; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
110; CHECK:       # %bb.0:
111; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0]
112; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
113; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
114; CHECK-NEXT:    vzeroupper
115; CHECK-NEXT:    retq
116  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
117  ret <8 x i16> %res
118}
; Merge-masked register case: shuffles %vec with indices
; <14,15,7,13,4,12,8,0>, then takes the shuffled lane where the matching %mask
; lane equals zero and the %vec2 lane otherwise. The assertions expect the
; select to be folded into the ymm vpermw as a {%k1} write-mask.
119define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
120; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
121; CHECK:       # %bb.0:
122; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
123; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0]
124; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
125; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
126; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
127; CHECK-NEXT:    vzeroupper
128; CHECK-NEXT:    retq
129  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
130  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
131  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
132  ret <8 x i16> %res
133}
134
; Zero-masked register case: shuffles %vec with indices
; <14,15,7,13,4,12,8,0>, then keeps the shuffled lane where the matching %mask
; lane equals zero and produces zero otherwise. The assertions expect the
; select to be folded into the ymm vpermw as a {%k1} {z} mask.
135define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
136; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
137; CHECK:       # %bb.0:
138; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0]
139; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
140; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
141; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
142; CHECK-NEXT:    vzeroupper
143; CHECK-NEXT:    retq
144  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
145  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
146  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
147  ret <8 x i16> %res
148}
; Unmasked memory case: loads a <16 x i16> vector from %vp and builds an
; <8 x i16> result from lanes <0,7,13,3,5,13,3,9>. The assertions expect the
; load to be folded into a ymm vpermw as a (%rdi) memory operand.
149define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
150; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
151; CHECK:       # %bb.0:
152; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
153; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
154; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
155; CHECK-NEXT:    vzeroupper
156; CHECK-NEXT:    retq
157  %vec = load <16 x i16>, ptr %vp
158  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
159  ret <8 x i16> %res
160}
; Merge-masked memory case: loads a <16 x i16> vector from %vp, shuffles it
; with indices <0,7,13,3,5,13,3,9>, then takes the shuffled lane where the
; matching %mask lane equals zero and the %vec2 lane otherwise. The assertions
; expect both the load and the select to be folded into one ymm vpermw with a
; (%rdi) operand and a {%k1} write-mask.
161define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
162; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
163; CHECK:       # %bb.0:
164; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
165; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,7,13,3,5,13,3,9]
166; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
167; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
168; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
169; CHECK-NEXT:    vzeroupper
170; CHECK-NEXT:    retq
171  %vec = load <16 x i16>, ptr %vp
172  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
173  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
174  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
175  ret <8 x i16> %res
176}
177
; Zero-masked memory case: loads a <16 x i16> vector from %vp, shuffles it
; with indices <0,7,13,3,5,13,3,9>, then keeps the shuffled lane where the
; matching %mask lane equals zero and produces zero otherwise. The assertions
; expect both the load and the select to be folded into one ymm vpermw with a
; (%rdi) operand and a {%k1} {z} mask.
178define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) {
179; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
180; CHECK:       # %bb.0:
181; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
182; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
183; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
184; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
185; CHECK-NEXT:    vzeroupper
186; CHECK-NEXT:    retq
187  %vec = load <16 x i16>, ptr %vp
188  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
189  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
190  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
191  ret <8 x i16> %res
192}
193
; Merge-masked memory case: loads a <16 x i16> vector from %vp, shuffles it
; with indices <3,15,12,7,1,5,8,14>, then takes the shuffled lane where the
; matching %mask lane equals zero and the %vec2 lane otherwise. The assertions
; expect both the load and the select to be folded into one ymm vpermw with a
; (%rdi) operand and a {%k1} write-mask.
194define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
195; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
196; CHECK:       # %bb.0:
197; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
198; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [3,15,12,7,1,5,8,14]
199; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
200; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
201; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
202; CHECK-NEXT:    vzeroupper
203; CHECK-NEXT:    retq
204  %vec = load <16 x i16>, ptr %vp
205  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
206  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
207  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
208  ret <8 x i16> %res
209}
210
; Zero-masked memory case: loads a <16 x i16> vector from %vp, shuffles it
; with indices <3,15,12,7,1,5,8,14>, then keeps the shuffled lane where the
; matching %mask lane equals zero and produces zero otherwise. The assertions
; expect both the load and the select to be folded into one ymm vpermw with a
; (%rdi) operand and a {%k1} {z} mask.
211define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
212; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
213; CHECK:       # %bb.0:
214; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
215; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
216; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
217; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
218; CHECK-NEXT:    vzeroupper
219; CHECK-NEXT:    retq
220  %vec = load <16 x i16>, ptr %vp
221  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
222  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
223  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
224  ret <8 x i16> %res
225}
226
; Merge-masked memory case: loads a <16 x i16> vector from %vp, shuffles it
; with indices <1,8,11,8,13,8,15,9>, then takes the shuffled lane where the
; matching %mask lane equals zero and the %vec2 lane otherwise. Unlike the
; other mem_mask cases for this width, the assertions expect a 128-bit
; vpermi2w over the two halves 16(%rdi) and (%rdi), with index constant
; [9,0,3,0,5,0,7,1] (each IR index XOR 8), no vzeroupper, and the select done
; by a separate {%k1}-masked vmovdqu16 rather than folded into the permute.
227define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
228; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
229; CHECK:       # %bb.0:
230; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
231; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
232; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm3
233; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
234; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
235; CHECK-NEXT:    retq
236  %vec = load <16 x i16>, ptr %vp
237  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
238  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
239  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
240  ret <8 x i16> %res
241}
242
; Zero-masked memory case: loads a <16 x i16> vector from %vp, shuffles it
; with indices <1,8,11,8,13,8,15,9>, then keeps the shuffled lane where the
; matching %mask lane equals zero and produces zero otherwise. The assertions
; expect a 128-bit vpermi2w over the two halves 16(%rdi) and (%rdi), with index
; constant [9,0,3,0,5,0,7,1] (each IR index XOR 8), the select folded in as a
; {%k1} {z} mask, and no vzeroupper.
243define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
244; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
245; CHECK:       # %bb.0:
246; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
247; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
248; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
249; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
250; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
251; CHECK-NEXT:    retq
252  %vec = load <16 x i16>, ptr %vp
253  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
254  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
255  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
256  ret <8 x i16> %res
257}
258
; Unmasked memory case: loads a <16 x i16> vector from %vp and builds an
; <8 x i16> result from lanes <9,7,9,6,9,4,3,2>. The assertions expect the
; load to be folded into a ymm vpermw as a (%rdi) memory operand.
259define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
260; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
261; CHECK:       # %bb.0:
262; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
263; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
264; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
265; CHECK-NEXT:    vzeroupper
266; CHECK-NEXT:    retq
267  %vec = load <16 x i16>, ptr %vp
268  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
269  ret <8 x i16> %res
270}
; Merge-masked memory case: loads a <16 x i16> vector from %vp, shuffles it
; with indices <9,7,9,6,9,4,3,2>, then takes the shuffled lane where the
; matching %mask lane equals zero and the %vec2 lane otherwise. The assertions
; expect both the load and the select to be folded into one ymm vpermw with a
; (%rdi) operand and a {%k1} write-mask.
271define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
272; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
273; CHECK:       # %bb.0:
274; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
275; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [9,7,9,6,9,4,3,2]
276; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
277; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
278; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
279; CHECK-NEXT:    vzeroupper
280; CHECK-NEXT:    retq
281  %vec = load <16 x i16>, ptr %vp
282  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
283  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
284  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
285  ret <8 x i16> %res
286}
287
; Zero-masked memory case: loads a <16 x i16> vector from %vp, shuffles it
; with indices <9,7,9,6,9,4,3,2>, then keeps the shuffled lane where the
; matching %mask lane equals zero and produces zero otherwise. The assertions
; expect both the load and the select to be folded into one ymm vpermw with a
; (%rdi) operand and a {%k1} {z} mask.
288define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
289; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
290; CHECK:       # %bb.0:
291; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
292; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
293; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
294; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
295; CHECK-NEXT:    vzeroupper
296; CHECK-NEXT:    retq
297  %vec = load <16 x i16>, ptr %vp
298  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
299  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
300  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
301  ret <8 x i16> %res
302}
303
; Unmasked register case: builds a <16 x i16> result from 16 lanes of the
; <32 x i16> argument %vec. The assertions expect the upper 256 bits to be
; extracted with vextracti64x4 and a ymm vpermi2w over the two halves; the
; asserted index constant is each IR shuffle index XOR 16.
304define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
305; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
306; CHECK:       # %bb.0:
307; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
308; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
309; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm1
310; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
311; CHECK-NEXT:    retq
312  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
313  ret <16 x i16> %res
314}
; Merge-masked register case: shuffles 16 lanes out of the <32 x i16> %vec,
; then takes the shuffled lane where the matching %mask lane equals zero and
; the %vec2 lane otherwise. The assertions expect vextracti64x4 plus an
; unmasked ymm vpermi2w (index constant is each IR index XOR 16), with the
; select performed by a separate {%k1}-masked vpblendmw.
315define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
316; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
317; CHECK:       # %bb.0:
318; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
319; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
320; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
321; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
322; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
323; CHECK-NEXT:    retq
324  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
325  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
326  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
327  ret <16 x i16> %res
328}
329
; Zero-masked register case: shuffles 16 lanes out of the <32 x i16> %vec,
; then keeps the shuffled lane where the matching %mask lane equals zero and
; produces zero otherwise. The assertions expect vextracti64x4 plus a ymm
; vpermi2w (index constant is each IR index XOR 16) with the select folded in
; as a {%k1} {z} mask.
330define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
331; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
332; CHECK:       # %bb.0:
333; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
334; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
335; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
336; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
337; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
338; CHECK-NEXT:    retq
339  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
340  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
341  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
342  ret <16 x i16> %res
343}
; Merge-masked register case: shuffles 16 lanes out of the <32 x i16> %vec,
; then takes the shuffled lane where the matching %mask lane equals zero and
; the %vec2 lane otherwise. The assertions expect vextracti64x4 plus an
; unmasked ymm vpermi2w (index constant is each IR index XOR 16), with the
; select performed by a separate {%k1}-masked vpblendmw.
344define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
345; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
346; CHECK:       # %bb.0:
347; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
348; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
349; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
350; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
351; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
352; CHECK-NEXT:    retq
353  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
354  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
355  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
356  ret <16 x i16> %res
357}
358
; Zero-masked register case: shuffles 16 lanes out of the <32 x i16> %vec,
; then keeps the shuffled lane where the matching %mask lane equals zero and
; produces zero otherwise. The assertions expect vextracti64x4 plus a ymm
; vpermi2w (index constant is each IR index XOR 16) with the select folded in
; as a {%k1} {z} mask.
359define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
360; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
361; CHECK:       # %bb.0:
362; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
363; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
364; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
365; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
366; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
367; CHECK-NEXT:    retq
368  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
369  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
370  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
371  ret <16 x i16> %res
372}
; Merge-masked register case: shuffles 16 lanes out of the <32 x i16> %vec,
; then takes the shuffled lane where the matching %mask lane equals zero and
; the %vec2 lane otherwise. The assertions expect vextracti64x4 plus an
; unmasked ymm vpermi2w (index constant is each IR index XOR 16), with the
; select performed by a separate {%k1}-masked vpblendmw.
373define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
374; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
375; CHECK:       # %bb.0:
376; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
377; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
378; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
379; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
380; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
381; CHECK-NEXT:    retq
382  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
383  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
384  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
385  ret <16 x i16> %res
386}
387
; Zero-masked register case: shuffles 16 lanes out of the <32 x i16> %vec,
; then keeps the shuffled lane where the matching %mask lane equals zero and
; produces zero otherwise. The assertions expect vextracti64x4 plus a ymm
; vpermi2w (index constant is each IR index XOR 16) with the select folded in
; as a {%k1} {z} mask.
388define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
389; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
390; CHECK:       # %bb.0:
391; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
392; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
393; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
394; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
395; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
396; CHECK-NEXT:    retq
397  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
398  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
399  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
400  ret <16 x i16> %res
401}
; Unmasked register case: builds a <16 x i16> result from 16 lanes of the
; <32 x i16> argument %vec. The assertions expect a single zmm vpermw whose
; index constant equals the IR shuffle indices, with no half extraction.
402define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
403; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
404; CHECK:       # %bb.0:
405; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
406; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
407; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
408; CHECK-NEXT:    retq
409  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
410  ret <16 x i16> %res
411}
; Merge-masked register case: shuffles 16 lanes out of the <32 x i16> %vec,
; then takes the shuffled lane where the matching %mask lane equals zero and
; the %vec2 lane otherwise. The assertions expect a single zmm vpermw (index
; constant equals the IR indices) with the select folded in as a {%k1}
; write-mask.
412define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
413; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
414; CHECK:       # %bb.0:
415; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
416; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
417; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
418; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
419; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
420; CHECK-NEXT:    retq
421  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
422  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
423  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
424  ret <16 x i16> %res
425}
426
; Zero-masked register case: shuffles 16 lanes out of the <32 x i16> %vec,
; then keeps the shuffled lane where the matching %mask lane equals zero and
; produces zero otherwise. The assertions expect a single zmm vpermw (index
; constant equals the IR indices) with the select folded in as a {%k1} {z}
; mask.
427define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
428; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
429; CHECK:       # %bb.0:
430; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
431; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
432; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
433; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
434; CHECK-NEXT:    retq
435  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
436  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
437  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
438  ret <16 x i16> %res
439}
; Unmasked register case: builds an <8 x i16> result from lanes
; <6,11,23,26,29,5,21,30> of the <32 x i16> argument %vec. The assertions
; expect the upper 256 bits to be extracted with vextracti64x4 and a ymm
; vpermt2w over the two halves; the asserted index constant is each IR shuffle
; index XOR 16.
440define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
441; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
442; CHECK:       # %bb.0:
443; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
444; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
445; CHECK-NEXT:    vpermt2w %ymm0, %ymm2, %ymm1
446; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
447; CHECK-NEXT:    vzeroupper
448; CHECK-NEXT:    retq
449  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
450  ret <8 x i16> %res
451}
; Merge-masked register case: shuffles lanes <6,11,23,26,29,5,21,30> out of
; the <32 x i16> %vec, then takes the shuffled lane where the matching %mask
; lane equals zero and the %vec2 lane otherwise. The assertions expect
; vextracti64x4 plus an unmasked ymm vpermt2w (index constant is each IR index
; XOR 16), with the select performed by a separate {%k1}-masked vpblendmw.
452define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
453; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
454; CHECK:       # %bb.0:
455; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
456; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
457; CHECK-NEXT:    vpermt2w %ymm0, %ymm3, %ymm4
458; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
459; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
460; CHECK-NEXT:    vzeroupper
461; CHECK-NEXT:    retq
462  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
463  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
464  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
465  ret <8 x i16> %res
466}
467
; Zero-masked register case: shuffles lanes <6,11,23,26,29,5,21,30> out of
; the <32 x i16> %vec, then keeps the shuffled lane where the matching %mask
; lane equals zero and produces zero otherwise. The assertions expect
; vextracti64x4 plus a ymm vpermt2w (index constant is each IR index XOR 16)
; with the select folded in as a {%k1} {z} mask.
468define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
469; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
470; CHECK:       # %bb.0:
471; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
472; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
473; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
474; CHECK-NEXT:    vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z}
475; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
476; CHECK-NEXT:    vzeroupper
477; CHECK-NEXT:    retq
478  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
479  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
480  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
481  ret <8 x i16> %res
482}
; Merge-masked register case: shuffles lanes <1,21,27,10,8,19,14,5> out of
; the <32 x i16> %vec, then takes the shuffled lane where the matching %mask
; lane equals zero and the %vec2 lane otherwise. The assertions expect a
; single zmm vpermw (index constant equals the IR indices) with the select
; folded in as a {%k1} write-mask.
483define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
484; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
485; CHECK:       # %bb.0:
486; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
487; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
488; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
489; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
490; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
491; CHECK-NEXT:    vzeroupper
492; CHECK-NEXT:    retq
493  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
494  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
495  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
496  ret <8 x i16> %res
497}
498
; Zero-masked register case: shuffles lanes <1,21,27,10,8,19,14,5> out of
; the <32 x i16> %vec, then keeps the shuffled lane where the matching %mask
; lane equals zero and produces zero otherwise. The assertions expect a single
; zmm vpermw (index constant equals the IR indices) with the select folded in
; as a {%k1} {z} mask.
499define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
500; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
501; CHECK:       # %bb.0:
502; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
503; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
504; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
505; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
506; CHECK-NEXT:    vzeroupper
507; CHECK-NEXT:    retq
508  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
509  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
510  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
511  ret <8 x i16> %res
512}
; Merge-masked register case: shuffles lanes <15,13,18,16,9,11,26,8> out of
; the <32 x i16> %vec, then takes the shuffled lane where the matching %mask
; lane equals zero and the %vec2 lane otherwise. The assertions expect a
; single zmm vpermw (index constant equals the IR indices) with the select
; folded in as a {%k1} write-mask.
513define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
514; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
515; CHECK:       # %bb.0:
516; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
517; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
518; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
519; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
520; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
521; CHECK-NEXT:    vzeroupper
522; CHECK-NEXT:    retq
523  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
524  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
525  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
526  ret <8 x i16> %res
527}
528
; Zero-masked register case: shuffles lanes <15,13,18,16,9,11,26,8> out of
; the <32 x i16> %vec, then keeps the shuffled lane where the matching %mask
; lane equals zero and produces zero otherwise. The assertions expect a single
; zmm vpermw (index constant equals the IR indices) with the select folded in
; as a {%k1} {z} mask.
529define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
530; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
531; CHECK:       # %bb.0:
532; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
533; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
534; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
535; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
536; CHECK-NEXT:    vzeroupper
537; CHECK-NEXT:    retq
538  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
539  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
540  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
541  ret <8 x i16> %res
542}
; Unmasked register case: builds an <8 x i16> result from lanes
; <17,0,23,10,1,8,7,30> of the <32 x i16> argument %vec. The assertions expect
; a single zmm vpermw whose index constant equals the IR shuffle indices,
; followed by vzeroupper.
543define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
544; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
545; CHECK:       # %bb.0:
546; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30]
547; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
548; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
549; CHECK-NEXT:    vzeroupper
550; CHECK-NEXT:    retq
551  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
552  ret <8 x i16> %res
553}
; Merge-masked register case: shuffles lanes <17,0,23,10,1,8,7,30> out of
; the <32 x i16> %vec, then takes the shuffled lane where the matching %mask
; lane equals zero and the %vec2 lane otherwise. The assertions expect a
; single zmm vpermw (index constant equals the IR indices) with the select
; folded in as a {%k1} write-mask.
554define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
555; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
556; CHECK:       # %bb.0:
557; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
558; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
559; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
560; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
561; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
562; CHECK-NEXT:    vzeroupper
563; CHECK-NEXT:    retq
564  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
565  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
566  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
567  ret <8 x i16> %res
568}
569
; Zero-masked register case: shuffles lanes <17,0,23,10,1,8,7,30> out of
; the <32 x i16> %vec, then keeps the shuffled lane where the matching %mask
; lane equals zero and produces zero otherwise. The assertions expect a single
; zmm vpermw (index constant equals the IR indices) with the select folded in
; as a {%k1} {z} mask.
570define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
571; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
572; CHECK:       # %bb.0:
573; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
574; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
575; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
576; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
577; CHECK-NEXT:    vzeroupper
578; CHECK-NEXT:    retq
579  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
580  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
581  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
582  ret <8 x i16> %res
583}
; Unmasked memory case: loads a <32 x i16> vector from %vp and builds a
; <16 x i16> result from 16 of its lanes. The assertions expect the load to be
; folded into a zmm vpermw as a (%rdi) memory operand, with an index constant
; equal to the IR shuffle indices.
584define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) {
585; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
586; CHECK:       # %bb.0:
587; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
588; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
589; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
590; CHECK-NEXT:    retq
591  %vec = load <32 x i16>, ptr %vp
592  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
593  ret <16 x i16> %res
594}
; Merge-masking variant: shuffles 16 lanes out of the <32 x i16> loaded from %vp.
; Lanes whose %mask element is 0 take the shuffled value; the others keep %vec2.
; Expected assembly: a {%k1}-masked zmm vpermw reading (%rdi).
595define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
596; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
597; CHECK:       # %bb.0:
598; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
599; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
600; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
601; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
602; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
603; CHECK-NEXT:    retq
604  %vec = load <32 x i16>, ptr %vp
605  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
606  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
607  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
608  ret <16 x i16> %res
609}
610
; Zero-masking variant: shuffles 16 lanes out of the <32 x i16> loaded from %vp.
; Lanes whose %mask element is 0 take the shuffled value; the others become 0.
; Expected assembly: a {%k1} {z} zmm vpermw reading (%rdi).
611define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) {
612; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
613; CHECK:       # %bb.0:
614; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
615; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
616; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
617; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
618; CHECK-NEXT:    retq
619  %vec = load <32 x i16>, ptr %vp
620  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
621  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
622  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
623  ret <16 x i16> %res
624}
625
; Merge-masking variant with a second index pattern: shuffles 16 lanes out of the
; <32 x i16> loaded from %vp. Lanes whose %mask element is 0 take the shuffled
; value; the others keep %vec2. Expected assembly: a {%k1}-masked zmm vpermw
; reading (%rdi).
626define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
627; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
628; CHECK:       # %bb.0:
629; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
630; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
631; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
632; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
633; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
634; CHECK-NEXT:    retq
635  %vec = load <32 x i16>, ptr %vp
636  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
637  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
638  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
639  ret <16 x i16> %res
640}
641
; Zero-masking variant with a second index pattern: shuffles 16 lanes out of the
; <32 x i16> loaded from %vp. Lanes whose %mask element is 0 take the shuffled
; value; the others become 0. Expected assembly: a {%k1} {z} zmm vpermw reading
; (%rdi).
642define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) {
643; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
644; CHECK:       # %bb.0:
645; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
646; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
647; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
648; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
649; CHECK-NEXT:    retq
650  %vec = load <32 x i16>, ptr %vp
651  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
652  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
653  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
654  ret <16 x i16> %res
655}
656
; Merge-masking variant with a third index pattern over the <32 x i16> loaded from
; %vp. Lanes whose %mask element is 0 take the shuffled value; the others keep
; %vec2. Expected assembly differs from the single-vpermw cases: a 256-bit
; two-table vpermi2w over 32(%rdi) and (%rdi), then a separate {%k1}-masked
; vmovdqu16. The index constant is expressed relative to that table order.
657define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
658; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
659; CHECK:       # %bb.0:
660; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
661; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
662; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm3
663; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
664; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
665; CHECK-NEXT:    retq
666  %vec = load <32 x i16>, ptr %vp
667  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
668  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
669  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
670  ret <16 x i16> %res
671}
672
; Zero-masking variant with a third index pattern over the <32 x i16> loaded from
; %vp. Lanes whose %mask element is 0 take the shuffled value; the others become 0.
; Expected assembly: a {%k1} {z} 256-bit two-table vpermi2w over 32(%rdi) and
; (%rdi), then a register copy of the result into ymm0.
673define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) {
674; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
675; CHECK:       # %bb.0:
676; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
677; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
678; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
679; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
680; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
681; CHECK-NEXT:    retq
682  %vec = load <32 x i16>, ptr %vp
683  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
684  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
685  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
686  ret <16 x i16> %res
687}
688
; Unmasked variant with a fourth index pattern: loads a <32 x i16> from %vp and
; returns the 16 lanes named by the constant shuffle mask. Expected assembly: a
; zmm vpermw reading (%rdi).
689define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) {
690; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
691; CHECK:       # %bb.0:
692; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
693; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
694; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
695; CHECK-NEXT:    retq
696  %vec = load <32 x i16>, ptr %vp
697  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
698  ret <16 x i16> %res
699}
; Merge-masking variant with a fourth index pattern over the <32 x i16> loaded from
; %vp. Lanes whose %mask element is 0 take the shuffled value; the others keep
; %vec2. Expected assembly: a {%k1}-masked zmm vpermw reading (%rdi).
700define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
701; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
702; CHECK:       # %bb.0:
703; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
704; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
705; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
706; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
707; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
708; CHECK-NEXT:    retq
709  %vec = load <32 x i16>, ptr %vp
710  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
711  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
712  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
713  ret <16 x i16> %res
714}
715
; Zero-masking variant with a fourth index pattern over the <32 x i16> loaded from
; %vp. Lanes whose %mask element is 0 take the shuffled value; the others become 0.
; Expected assembly: a {%k1} {z} zmm vpermw reading (%rdi).
716define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) {
717; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
718; CHECK:       # %bb.0:
719; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
720; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
721; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
722; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
723; CHECK-NEXT:    retq
724  %vec = load <32 x i16>, ptr %vp
725  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
726  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
727  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
728  ret <16 x i16> %res
729}
730
; Unmasked 32 x i16 -> 8 x i16 permute with a memory source: loads the vector from
; %vp and returns lanes <0,1,21,17,30,30,29,1>. Expected assembly: the upper half
; is loaded from 32(%rdi) and combined with (%rdi) by a 256-bit two-table vpermt2w.
; The index constant is expressed relative to that table order.
731define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
732; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
733; CHECK:       # %bb.0:
734; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17]
735; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm0
736; CHECK-NEXT:    vpermt2w (%rdi), %ymm1, %ymm0
737; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
738; CHECK-NEXT:    vzeroupper
739; CHECK-NEXT:    retq
740  %vec = load <32 x i16>, ptr %vp
741  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
742  ret <8 x i16> %res
743}
; Merge-masking variant: picks lanes <0,1,21,17,30,30,29,1> of the <32 x i16>
; loaded from %vp. Lanes whose %mask element is 0 take the shuffled value; the
; others keep %vec2. Expected assembly: an unmasked 256-bit vpermt2w over
; 32(%rdi) and (%rdi), then a separate {%k1}-masked xmm vmovdqu16.
744define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
745; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
746; CHECK:       # %bb.0:
747; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
748; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm3
749; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm3
750; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
751; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
752; CHECK-NEXT:    vzeroupper
753; CHECK-NEXT:    retq
754  %vec = load <32 x i16>, ptr %vp
755  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
756  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
757  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
758  ret <8 x i16> %res
759}
760
; Zero-masking variant: picks lanes <0,1,21,17,30,30,29,1> of the <32 x i16>
; loaded from %vp. Lanes whose %mask element is 0 take the shuffled value; the
; others become 0. Expected assembly: a {%k1} {z} 256-bit vpermt2w over 32(%rdi)
; and (%rdi), then a register copy of the low 128 bits into xmm0.
761define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) {
762; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
763; CHECK:       # %bb.0:
764; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
765; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
766; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
767; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
768; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
769; CHECK-NEXT:    vzeroupper
770; CHECK-NEXT:    retq
771  %vec = load <32 x i16>, ptr %vp
772  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
773  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
774  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
775  ret <8 x i16> %res
776}
777
; Merge-masking variant: picks lanes <23,22,20,22,28,20,11,17> of the <32 x i16>
; loaded from %vp. Lanes whose %mask element is 0 take the shuffled value; the
; others keep %vec2. Expected assembly: an unmasked 256-bit vpermt2w over
; 32(%rdi) and (%rdi), then a separate {%k1}-masked xmm vmovdqu16.
778define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
779; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
780; CHECK:       # %bb.0:
781; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
782; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm3
783; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm3
784; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
785; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
786; CHECK-NEXT:    vzeroupper
787; CHECK-NEXT:    retq
788  %vec = load <32 x i16>, ptr %vp
789  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
790  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
791  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
792  ret <8 x i16> %res
793}
794
; Zero-masking variant: picks lanes <23,22,20,22,28,20,11,17> of the <32 x i16>
; loaded from %vp. Lanes whose %mask element is 0 take the shuffled value; the
; others become 0. Expected assembly: a {%k1} {z} 256-bit vpermt2w over 32(%rdi)
; and (%rdi), then a register copy of the low 128 bits into xmm0.
795define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
796; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
797; CHECK:       # %bb.0:
798; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
799; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
800; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
801; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
802; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
803; CHECK-NEXT:    vzeroupper
804; CHECK-NEXT:    retq
805  %vec = load <32 x i16>, ptr %vp
806  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
807  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
808  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
809  ret <8 x i16> %res
810}
811
; Merge-masking variant: picks lanes <6,18,0,4,10,25,22,10> of the <32 x i16>
; loaded from %vp. Lanes whose %mask element is 0 take the shuffled value; the
; others keep %vec2. Expected assembly: a single {%k1}-masked zmm vpermw reading
; (%rdi), with the shuffle indices used unchanged as the index constant.
812define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
813; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
814; CHECK:       # %bb.0:
815; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
816; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
817; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
818; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
819; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
820; CHECK-NEXT:    vzeroupper
821; CHECK-NEXT:    retq
822  %vec = load <32 x i16>, ptr %vp
823  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
824  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
825  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
826  ret <8 x i16> %res
827}
828
; Zero-masking variant: picks lanes <6,18,0,4,10,25,22,10> of the <32 x i16>
; loaded from %vp. Lanes whose %mask element is 0 take the shuffled value; the
; others become 0. Expected assembly: a single {%k1} {z} zmm vpermw reading (%rdi).
829define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
830; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
831; CHECK:       # %bb.0:
832; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [6,18,0,4,10,25,22,10]
833; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
834; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
835; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
836; CHECK-NEXT:    vzeroupper
837; CHECK-NEXT:    retq
838  %vec = load <32 x i16>, ptr %vp
839  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
840  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
841  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
842  ret <8 x i16> %res
843}
844
; Unmasked 32 x i16 -> 8 x i16 permute with a memory source: loads the vector from
; %vp and returns lanes <19,1,5,31,9,12,17,9>. Expected assembly: a single zmm
; vpermw reading (%rdi).
845define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
846; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
847; CHECK:       # %bb.0:
848; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [19,1,5,31,9,12,17,9]
849; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
850; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
851; CHECK-NEXT:    vzeroupper
852; CHECK-NEXT:    retq
853  %vec = load <32 x i16>, ptr %vp
854  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
855  ret <8 x i16> %res
856}
; Merge-masking variant: picks lanes <19,1,5,31,9,12,17,9> of the <32 x i16>
; loaded from %vp. Lanes whose %mask element is 0 take the shuffled value; the
; others keep %vec2. Expected assembly: a single {%k1}-masked zmm vpermw reading
; (%rdi).
857define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
858; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
859; CHECK:       # %bb.0:
860; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
861; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
862; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
863; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
864; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
865; CHECK-NEXT:    vzeroupper
866; CHECK-NEXT:    retq
867  %vec = load <32 x i16>, ptr %vp
868  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
869  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
870  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
871  ret <8 x i16> %res
872}
873
; Zero-masking variant: picks lanes <19,1,5,31,9,12,17,9> of the <32 x i16>
; loaded from %vp. Lanes whose %mask element is 0 take the shuffled value; the
; others become 0. Expected assembly: a single {%k1} {z} zmm vpermw reading (%rdi).
874define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
875; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
876; CHECK:       # %bb.0:
877; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
878; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
879; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
880; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
881; CHECK-NEXT:    vzeroupper
882; CHECK-NEXT:    retq
883  %vec = load <32 x i16>, ptr %vp
884  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
885  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
886  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
887  ret <8 x i16> %res
888}
889
; Unmasked 16 x i16 -> 8 x i16 permute of the register argument %vec with lane
; indices <14,8,4,12,9,4,14,15>; written as hex digits these are E,8,4,C,9,4,E,F,
; which is the suffix of the function name. Expected assembly: one ymm vpermw.
890define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) {
891; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF:
892; CHECK:       # %bb.0:
893; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15]
894; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
895; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
896; CHECK-NEXT:    vzeroupper
897; CHECK-NEXT:    retq
898  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15>
899  ret <8 x i16> %res
900}
901
; Unmasked 8 x i32 -> 4 x i32 permute of the register argument %vec with lane
; indices <4,0,3,2>. Expected assembly: one ymm vpermps (the unmasked case uses
; the float-typed permute instruction) with a sign-extended byte index constant.
902define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
903; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
904; CHECK:       # %bb.0:
905; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,0,3,2]
906; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
907; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
908; CHECK-NEXT:    vzeroupper
909; CHECK-NEXT:    retq
910  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
911  ret <4 x i32> %res
912}
; Merge-masking variant: picks lanes <4,0,3,2> of the <8 x i32> %vec. Lanes whose
; %mask element is 0 take the shuffled value; the others keep %vec2. Expected
; assembly: a {%k1}-masked ymm vpermd, then a register copy of the result to xmm0.
913define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
914; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
915; CHECK:       # %bb.0:
916; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
917; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,0,3,2]
918; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
919; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
920; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
921; CHECK-NEXT:    vzeroupper
922; CHECK-NEXT:    retq
923  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
924  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
925  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
926  ret <4 x i32> %res
927}
928
; Zero-masking variant: picks lanes <4,0,3,2> of the <8 x i32> %vec. Lanes whose
; %mask element is 0 take the shuffled value; the others become 0. Expected
; assembly: one {%k1} {z} ymm vpermd.
929define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
930; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
931; CHECK:       # %bb.0:
932; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,0,3,2]
933; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
934; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
935; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
936; CHECK-NEXT:    vzeroupper
937; CHECK-NEXT:    retq
938  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
939  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
940  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
941  ret <4 x i32> %res
942}
; Merge-masking variant: picks lanes <3,0,7,3> of the <8 x i32> %vec. Lanes whose
; %mask element is 0 take the shuffled value; the others keep %vec2. Expected
; assembly: a {%k1}-masked ymm vpermd, then a register copy of the result to xmm0.
943define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
944; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
945; CHECK:       # %bb.0:
946; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
947; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [3,0,7,3]
948; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
949; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
950; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
951; CHECK-NEXT:    vzeroupper
952; CHECK-NEXT:    retq
953  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
954  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
955  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
956  ret <4 x i32> %res
957}
958
; Zero-masking variant: picks lanes <3,0,7,3> of the <8 x i32> %vec. Lanes whose
; %mask element is 0 take the shuffled value; the others become 0. Expected
; assembly: one {%k1} {z} ymm vpermd.
959define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
960; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
961; CHECK:       # %bb.0:
962; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,0,7,3]
963; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
964; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
965; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
966; CHECK-NEXT:    vzeroupper
967; CHECK-NEXT:    retq
968  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
969  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
970  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
971  ret <4 x i32> %res
972}
; Merge-masking variant: picks lanes <6,7,2,3> of the <8 x i32> %vec. Lanes whose
; %mask element is 0 take the shuffled value; the others keep %vec2. Expected
; assembly: an immediate-controlled vpermq on 64-bit elements (no index vector is
; materialized), followed by a separate {%k1}-masked vpblendmd.
973define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
974; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
975; CHECK:       # %bb.0:
976; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
977; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
978; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
979; CHECK-NEXT:    vzeroupper
980; CHECK-NEXT:    retq
981  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
982  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
983  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
984  ret <4 x i32> %res
985}
986
; Zero-masking variant: picks lanes <6,7,2,3> of the <8 x i32> %vec. Lanes whose
; %mask element is 0 take the shuffled value; the others become 0. Expected
; assembly: an immediate-controlled vpermq on 64-bit elements, followed by a
; {%k1} {z} vmovdqa32 that applies the zeroing.
987define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
988; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
989; CHECK:       # %bb.0:
990; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
991; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
992; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
993; CHECK-NEXT:    vzeroupper
994; CHECK-NEXT:    retq
995  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
996  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
997  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
998  ret <4 x i32> %res
999}
; Unmasked 8 x i32 -> 4 x i32 permute of the register argument %vec with lane
; indices <5,3,2,5>. Expected assembly: one ymm vpermps with a sign-extended byte
; index constant.
1000define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
1001; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
1002; CHECK:       # %bb.0:
1003; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [5,3,2,5]
1004; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1005; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1006; CHECK-NEXT:    vzeroupper
1007; CHECK-NEXT:    retq
1008  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1009  ret <4 x i32> %res
1010}
; Merge-masking variant: picks lanes <5,3,2,5> of the <8 x i32> %vec. Lanes whose
; %mask element is 0 take the shuffled value; the others keep %vec2. Expected
; assembly: a {%k1}-masked ymm vpermd, then a register copy of the result to xmm0.
1011define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1012; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
1013; CHECK:       # %bb.0:
1014; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1015; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [5,3,2,5]
1016; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
1017; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
1018; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1019; CHECK-NEXT:    vzeroupper
1020; CHECK-NEXT:    retq
1021  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1022  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1023  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1024  ret <4 x i32> %res
1025}
1026
; Zero-masking variant: picks lanes <5,3,2,5> of the <8 x i32> %vec. Lanes whose
; %mask element is 0 take the shuffled value; the others become 0. Expected
; assembly: one {%k1} {z} ymm vpermd.
1027define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
1028; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
1029; CHECK:       # %bb.0:
1030; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [5,3,2,5]
1031; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1032; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1033; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1034; CHECK-NEXT:    vzeroupper
1035; CHECK-NEXT:    retq
1036  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1037  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1038  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1039  ret <4 x i32> %res
1040}
; Unmasked 8 x i32 -> 4 x i32 permute with a memory source: loads the vector from
; %vp and returns lanes <7,5,0,0>. Expected assembly uses only 128-bit operations:
; the upper half is loaded from 16(%rdi) and combined with (%rdi) by an
; immediate-controlled vshufps.
1041define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
1042; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
1043; CHECK:       # %bb.0:
1044; CHECK-NEXT:    vmovaps 16(%rdi), %xmm0
1045; CHECK-NEXT:    vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0]
1046; CHECK-NEXT:    retq
1047  %vec = load <8 x i32>, ptr %vp
1048  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1049  ret <4 x i32> %res
1050}
; Merge-masking variant: picks lanes <7,5,0,0> of the <8 x i32> loaded from %vp.
; Lanes whose %mask element is 0 take the shuffled value; the others keep %vec2.
; Expected assembly: a 128-bit vshufps of 16(%rdi) with (%rdi), then a separate
; {%k1}-masked vmovdqa32.
1051define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1052; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
1053; CHECK:       # %bb.0:
1054; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
1055; CHECK-NEXT:    vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0]
1056; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1057; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
1058; CHECK-NEXT:    retq
1059  %vec = load <8 x i32>, ptr %vp
1060  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1061  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1062  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1063  ret <4 x i32> %res
1064}
1065
; Zero-masking variant: picks lanes <7,5,0,0> of the <8 x i32> loaded from %vp.
; Lanes whose %mask element is 0 take the shuffled value; the others become 0.
; Expected assembly: a 128-bit vshufps of 16(%rdi) with (%rdi), then a separate
; {%k1} {z} vmovdqa32.
1066define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
1067; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
1068; CHECK:       # %bb.0:
1069; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
1070; CHECK-NEXT:    vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0]
1071; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1072; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
1073; CHECK-NEXT:    retq
1074  %vec = load <8 x i32>, ptr %vp
1075  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1076  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1077  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1078  ret <4 x i32> %res
1079}
1080
; Merge-masking variant: picks lanes <5,0,0,3> of the <8 x i32> loaded from %vp.
; Lanes whose %mask element is 0 take the shuffled value; the others keep %vec2.
; Expected assembly: a single {%k1}-masked ymm vpermd reading (%rdi).
1081define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1082; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
1083; CHECK:       # %bb.0:
1084; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1085; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [5,0,0,3]
1086; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1087; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
1088; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1089; CHECK-NEXT:    vzeroupper
1090; CHECK-NEXT:    retq
1091  %vec = load <8 x i32>, ptr %vp
1092  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1093  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1094  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1095  ret <4 x i32> %res
1096}
1097
; Zero-masking variant: picks lanes <5,0,0,3> of the <8 x i32> loaded from %vp.
; Lanes whose %mask element is 0 take the shuffled value; the others become 0.
; Expected assembly: a single {%k1} {z} ymm vpermd reading (%rdi).
1098define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
1099; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
1100; CHECK:       # %bb.0:
1101; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3]
1102; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1103; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
1104; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1105; CHECK-NEXT:    vzeroupper
1106; CHECK-NEXT:    retq
1107  %vec = load <8 x i32>, ptr %vp
1108  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1109  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1110  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1111  ret <4 x i32> %res
1112}
1113
; Merge-masking variant: picks lanes <4,3,3,4> of the <8 x i32> loaded from %vp.
; Lanes whose %mask element is 0 take the shuffled value; the others keep %vec2.
; Expected assembly: a 128-bit two-table vpermi2d over 16(%rdi) and (%rdi), with
; the index constant expressed relative to that table order, then a separate
; {%k1}-masked vmovdqa32.
1114define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1115; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
1116; CHECK:       # %bb.0:
1117; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
1118; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,7,7,0]
1119; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm3
1120; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1121; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1122; CHECK-NEXT:    retq
1123  %vec = load <8 x i32>, ptr %vp
1124  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1125  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1126  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1127  ret <4 x i32> %res
1128}
1129
; Zero-masking variant: picks lanes <4,3,3,4> of the <8 x i32> loaded from %vp.
; Lanes whose %mask element is 0 take the shuffled value; the others become 0.
; Expected assembly: a {%k1} {z} 128-bit two-table vpermi2d over 16(%rdi) and
; (%rdi), then a register copy of the result into xmm0.
1130define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
1131; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
1132; CHECK:       # %bb.0:
1133; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
1134; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,7,7,0]
1135; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1136; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1137; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1138; CHECK-NEXT:    retq
1139  %vec = load <8 x i32>, ptr %vp
1140  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1141  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1142  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1143  ret <4 x i32> %res
1144}
1145
; Unmasked 8 x i32 -> 4 x i32 permute with a memory source: loads the vector from
; %vp and returns lanes <5,3,2,7>. Expected assembly uses only 128-bit operations:
; the 64 bits at 8(%rdi) are broadcast into one table and combined with 16(%rdi)
; by a two-table vpermi2d.
1146define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(ptr %vp) {
1147; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
1148; CHECK:       # %bb.0:
1149; CHECK-NEXT:    vpbroadcastq 8(%rdi), %xmm1
1150; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [5,1,2,7]
1151; CHECK-NEXT:    vpermi2d 16(%rdi), %xmm1, %xmm0
1152; CHECK-NEXT:    retq
1153  %vec = load <8 x i32>, ptr %vp
1154  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1155  ret <4 x i32> %res
1156}
; Merge-masking variant: picks lanes <5,3,2,7> of the <8 x i32> loaded from %vp.
; Lanes whose %mask element is 0 take the shuffled value; the others keep %vec2.
; Expected assembly: a vpbroadcastq of 8(%rdi) and a 128-bit vpermi2d with
; 16(%rdi), then a separate {%k1}-masked vmovdqa32.
1157define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1158; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
1159; CHECK:       # %bb.0:
1160; CHECK-NEXT:    vpbroadcastq 8(%rdi), %xmm2
1161; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [5,1,2,7]
1162; CHECK-NEXT:    vpermi2d 16(%rdi), %xmm2, %xmm3
1163; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1164; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1165; CHECK-NEXT:    retq
1166  %vec = load <8 x i32>, ptr %vp
1167  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1168  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1169  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1170  ret <4 x i32> %res
1171}
1172
; Zero-masking variant: picks lanes <5,3,2,7> of the <8 x i32> loaded from %vp.
; Lanes whose %mask element is 0 take the shuffled value; the others become 0.
; Expected assembly: a vpbroadcastq of 8(%rdi) and a {%k1} {z} 128-bit vpermi2d
; with 16(%rdi), then a register copy of the result into xmm0.
1173define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) {
1174; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
1175; CHECK:       # %bb.0:
1176; CHECK-NEXT:    vpbroadcastq 8(%rdi), %xmm2
1177; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [5,1,2,7]
1178; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1179; CHECK-NEXT:    vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1180; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1181; CHECK-NEXT:    retq
1182  %vec = load <8 x i32>, ptr %vp
1183  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1184  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1185  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1186  ret <4 x i32> %res
1187}
1188
; Unmasked 16 x i32 -> 8 x i32 permute of the register argument %vec with lane
; indices <1,13,11,14,7,10,1,6>. Expected assembly: one zmm vpermps with a
; sign-extended byte index constant.
1189define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
1190; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
1191; CHECK:       # %bb.0:
1192; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6]
1193; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
1194; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1195; CHECK-NEXT:    retq
1196  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1197  ret <8 x i32> %res
1198}
; Merge-masking variant: picks lanes <1,13,11,14,7,10,1,6> of the <16 x i32> %vec.
; Lanes whose %mask element is 0 take the shuffled value; the others keep %vec2.
; Expected assembly: a {%k1}-masked zmm vpermd, then a register copy of the result
; to ymm0.
1199define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1200; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
1201; CHECK:       # %bb.0:
1202; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1203; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6]
1204; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
1205; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
1206; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1207; CHECK-NEXT:    retq
1208  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1209  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1210  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1211  ret <8 x i32> %res
1212}
1213
; Zero-masking variant: picks lanes <1,13,11,14,7,10,1,6> of the <16 x i32> %vec.
; Lanes whose %mask element is 0 take the shuffled value; the others become 0.
; Expected assembly: one {%k1} {z} zmm vpermd.
1214define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
1215; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
1216; CHECK:       # %bb.0:
1217; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6]
1218; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1219; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1220; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1221; CHECK-NEXT:    retq
1222  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1223  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1224  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1225  ret <8 x i32> %res
1226}
; Merge-masked permute with lanes <3,0,15,3,2,3,6,8>: lanes where %mask == 0
; take the shuffle result, the rest keep %vec2. Expected lowering is a
; {%k1}-masked zmm vpermd into the %vec2 register.
1227define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1228; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
1229; CHECK:       # %bb.0:
1230; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1231; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8]
1232; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
1233; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
1234; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1235; CHECK-NEXT:    retq
1236  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1237  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1238  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1239  ret <8 x i32> %res
1240}
1241
; Zero-masked permute with lanes <3,0,15,3,2,3,6,8>: lanes where %mask == 0
; take the shuffle result, the rest are zero. Expected lowering is a
; {%k1} {z} zmm vpermd.
1242define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
1243; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
1244; CHECK:       # %bb.0:
1245; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
1246; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1247; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1248; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1249; CHECK-NEXT:    retq
1250  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1251  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1252  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1253  ret <8 x i32> %res
1254}
; Merge-masked permute with lanes <2,15,15,2,6,10,14,7>: lanes where
; %mask == 0 take the shuffle result, the rest keep %vec2. Expected lowering
; is a {%k1}-masked zmm vpermd into the %vec2 register.
1255define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1256; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
1257; CHECK:       # %bb.0:
1258; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1259; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7]
1260; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
1261; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
1262; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1263; CHECK-NEXT:    retq
1264  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1265  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1266  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1267  ret <8 x i32> %res
1268}
1269
; Zero-masked permute with lanes <2,15,15,2,6,10,14,7>: lanes where
; %mask == 0 take the shuffle result, the rest are zero. Expected lowering is
; a {%k1} {z} zmm vpermd.
1270define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
1271; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
1272; CHECK:       # %bb.0:
1273; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
1274; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1275; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1276; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1277; CHECK-NEXT:    retq
1278  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1279  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1280  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1281  ret <8 x i32> %res
1282}
; Unmasked baseline: pick lanes <14,5,7,7,10,3,9,3> of a 16 x i32 register
; argument. The CHECK lines expect one zmm vpermps whose low ymm is returned.
1283define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
1284; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
1285; CHECK:       # %bb.0:
1286; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
1287; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
1288; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1289; CHECK-NEXT:    retq
1290  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1291  ret <8 x i32> %res
1292}
; Merge-masked permute with lanes <14,5,7,7,10,3,9,3>: lanes where
; %mask == 0 take the shuffle result, the rest keep %vec2. Expected lowering
; is a {%k1}-masked zmm vpermd into the %vec2 register.
1293define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1294; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
1295; CHECK:       # %bb.0:
1296; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1297; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3]
1298; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
1299; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
1300; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1301; CHECK-NEXT:    retq
1302  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1303  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1304  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1305  ret <8 x i32> %res
1306}
1307
; Zero-masked permute with lanes <14,5,7,7,10,3,9,3>: lanes where
; %mask == 0 take the shuffle result, the rest are zero. Expected lowering is
; a {%k1} {z} zmm vpermd.
1308define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
1309; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
1310; CHECK:       # %bb.0:
1311; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
1312; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1313; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1314; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1315; CHECK-NEXT:    retq
1316  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1317  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1318  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1319  ret <8 x i32> %res
1320}
; Unmasked baseline: pick lanes <0,2,4,12> of a 16 x i32 register argument.
; The expected index constant is 8 entries wide, but only the low xmm of the
; zmm vpermps result is returned.
1321define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
1322; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
1323; CHECK:       # %bb.0:
1324; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
1325; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
1326; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1327; CHECK-NEXT:    vzeroupper
1328; CHECK-NEXT:    retq
1329  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1330  ret <4 x i32> %res
1331}
; Merge-masked permute with lanes <0,2,4,12>: lanes where %mask == 0 take the
; shuffle result, the rest keep %vec2. Expected lowering is a {%k1}-masked
; zmm vpermd into the %vec2 register, with the mask built from the xmm %mask.
1332define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1333; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
1334; CHECK:       # %bb.0:
1335; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1336; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
1337; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
1338; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
1339; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1340; CHECK-NEXT:    vzeroupper
1341; CHECK-NEXT:    retq
1342  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1343  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1344  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1345  ret <4 x i32> %res
1346}
1347
; Zero-masked permute with lanes <0,2,4,12>: lanes where %mask == 0 take the
; shuffle result, the rest are zero. Expected lowering is a {%k1} {z} zmm
; vpermd whose low xmm is returned.
1348define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
1349; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
1350; CHECK:       # %bb.0:
1351; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12]
1352; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1353; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1354; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1355; CHECK-NEXT:    vzeroupper
1356; CHECK-NEXT:    retq
1357  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1358  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1359  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1360  ret <4 x i32> %res
1361}
; Merge-masked permute with lanes <13,9,11,12>, all of which lie in the upper
; 256 bits of %vec. The CHECK lines expect that half to be extracted with
; vextracti64x4 and permuted by a {%k1}-masked ymm vpermd using indices
; rebased by 8 ([5,1,3,4]); unselected lanes keep %vec2.
1362define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1363; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
1364; CHECK:       # %bb.0:
1365; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1366; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [5,1,3,4]
1367; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1368; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
1369; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
1370; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1371; CHECK-NEXT:    vzeroupper
1372; CHECK-NEXT:    retq
1373  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1374  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1375  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1376  ret <4 x i32> %res
1377}
1378
; Zero-masked permute with lanes <13,9,11,12>, all in the upper 256 bits of
; %vec. The CHECK lines expect vextracti64x4 of that half followed by a
; {%k1} {z} ymm vpermd with indices rebased by 8 ([5,1,3,4]).
1379define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
1380; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
1381; CHECK:       # %bb.0:
1382; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [5,1,3,4]
1383; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1384; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1385; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1386; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1387; CHECK-NEXT:    vzeroupper
1388; CHECK-NEXT:    retq
1389  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1390  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1391  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1392  ret <4 x i32> %res
1393}
; Merge-masked permute with lanes <1,1,13,0>: lanes where %mask == 0 take the
; shuffle result, the rest keep %vec2. Expected lowering is a {%k1}-masked
; zmm vpermd into the %vec2 register.
1394define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1395; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
1396; CHECK:       # %bb.0:
1397; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1398; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,1,13,0]
1399; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
1400; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
1401; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1402; CHECK-NEXT:    vzeroupper
1403; CHECK-NEXT:    retq
1404  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1405  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1406  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1407  ret <4 x i32> %res
1408}
1409
; Zero-masked permute with lanes <1,1,13,0>: lanes where %mask == 0 take the
; shuffle result, the rest are zero. Expected lowering is a {%k1} {z} zmm
; vpermd whose low xmm is returned.
1410define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
1411; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
1412; CHECK:       # %bb.0:
1413; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,1,13,0]
1414; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1415; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1416; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1417; CHECK-NEXT:    vzeroupper
1418; CHECK-NEXT:    retq
1419  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1420  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1421  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1422  ret <4 x i32> %res
1423}
; Unmasked baseline: pick lanes <3,0,0,13> of a 16 x i32 register argument.
; The CHECK lines expect one zmm vpermps whose low xmm is returned.
1424define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
1425; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
1426; CHECK:       # %bb.0:
1427; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,0,0,13]
1428; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
1429; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1430; CHECK-NEXT:    vzeroupper
1431; CHECK-NEXT:    retq
1432  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1433  ret <4 x i32> %res
1434}
; Merge-masked permute with lanes <3,0,0,13>: lanes where %mask == 0 take the
; shuffle result, the rest keep %vec2. Expected lowering is a {%k1}-masked
; zmm vpermd into the %vec2 register.
1435define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1436; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
1437; CHECK:       # %bb.0:
1438; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1439; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [3,0,0,13]
1440; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
1441; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
1442; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1443; CHECK-NEXT:    vzeroupper
1444; CHECK-NEXT:    retq
1445  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1446  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1447  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1448  ret <4 x i32> %res
1449}
1450
; Zero-masked permute with lanes <3,0,0,13>: lanes where %mask == 0 take the
; shuffle result, the rest are zero. Expected lowering is a {%k1} {z} zmm
; vpermd whose low xmm is returned.
1451define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
1452; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
1453; CHECK:       # %bb.0:
1454; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,0,0,13]
1455; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1456; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1457; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1458; CHECK-NEXT:    vzeroupper
1459; CHECK-NEXT:    retq
1460  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1461  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1462  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1463  ret <4 x i32> %res
1464}
; Unmasked permute of a loaded 16 x i32 vector with lanes
; <15,8,14,8,9,10,12,12>, all in the upper half of the load. The CHECK lines
; expect a ymm vpermps that reads only 32(%rdi), with indices rebased by 8.
1465define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(ptr %vp) {
1466; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
1467; CHECK:       # %bb.0:
1468; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
1469; CHECK-NEXT:    vpermps 32(%rdi), %ymm0, %ymm0
1470; CHECK-NEXT:    retq
1471  %vec = load <16 x i32>, ptr %vp
1472  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1473  ret <8 x i32> %res
1474}
; Merge-masked form of the upper-half load permute <15,8,14,8,9,10,12,12>:
; lanes where %mask == 0 take the shuffle result, the rest keep %vec2. The
; CHECK lines expect a {%k1}-masked ymm vpermd with a folded 32(%rdi) operand.
1475define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1476; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
1477; CHECK:       # %bb.0:
1478; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
1479; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1480; CHECK-NEXT:    vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
1481; CHECK-NEXT:    retq
1482  %vec = load <16 x i32>, ptr %vp
1483  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1484  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1485  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1486  ret <8 x i32> %res
1487}
1488
; Zero-masked form of the upper-half load permute <15,8,14,8,9,10,12,12>:
; lanes where %mask == 0 take the shuffle result, the rest are zero. The
; CHECK lines expect a {%k1} {z} ymm vpermd with a folded 32(%rdi) operand.
1489define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) {
1490; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
1491; CHECK:       # %bb.0:
1492; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
1493; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
1494; CHECK-NEXT:    vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
1495; CHECK-NEXT:    retq
1496  %vec = load <16 x i32>, ptr %vp
1497  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1498  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1499  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1500  ret <8 x i32> %res
1501}
1502
; Merge-masked permute of a loaded vector with lanes <15,11,14,3,8,9,13,7>,
; which draw from both 256-bit halves of the load. The CHECK lines expect an
; unmasked two-source vpermi2d over 32(%rdi) and (%rdi); the merge with %vec2
; is done afterwards by a separate {%k1}-masked vmovdqa32.
1503define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1504; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
1505; CHECK:       # %bb.0:
1506; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1507; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
1508; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
1509; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1510; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
1511; CHECK-NEXT:    retq
1512  %vec = load <16 x i32>, ptr %vp
1513  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1514  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1515  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1516  ret <8 x i32> %res
1517}
1518
; Zero-masked permute of a loaded vector with lanes <15,11,14,3,8,9,13,7>,
; drawing from both halves of the load. The CHECK lines expect the zeroing
; mask to be folded directly into a {%k1} {z} vpermi2d over 32(%rdi) and
; (%rdi).
1519define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) {
1520; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
1521; CHECK:       # %bb.0:
1522; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1523; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
1524; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
1525; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1526; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1527; CHECK-NEXT:    retq
1528  %vec = load <16 x i32>, ptr %vp
1529  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1530  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1531  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1532  ret <8 x i32> %res
1533}
1534
; Merge-masked permute of a loaded vector with lanes <12,6,9,13,12,10,0,2>,
; drawing from both 256-bit halves of the load. The CHECK lines expect an
; unmasked vpermi2d over 32(%rdi) and (%rdi), then a separate {%k1}-masked
; vmovdqa32 to merge with %vec2.
1535define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1536; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
1537; CHECK:       # %bb.0:
1538; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1539; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
1540; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
1541; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1542; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
1543; CHECK-NEXT:    retq
1544  %vec = load <16 x i32>, ptr %vp
1545  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1546  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1547  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1548  ret <8 x i32> %res
1549}
1550
; Zero-masked permute of a loaded vector with lanes <12,6,9,13,12,10,0,2>,
; drawing from both halves of the load. The CHECK lines expect the zeroing
; mask folded into a {%k1} {z} vpermi2d over 32(%rdi) and (%rdi).
1551define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) {
1552; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
1553; CHECK:       # %bb.0:
1554; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1555; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
1556; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
1557; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1558; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1559; CHECK-NEXT:    retq
1560  %vec = load <16 x i32>, ptr %vp
1561  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1562  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1563  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1564  ret <8 x i32> %res
1565}
1566
; Unmasked permute of a loaded 16 x i32 vector with lanes
; <8,4,1,13,15,4,6,12>. The CHECK lines expect the full 512-bit load to be
; folded into one zmm vpermps whose low ymm is returned.
1567define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) {
1568; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
1569; CHECK:       # %bb.0:
1570; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
1571; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
1572; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1573; CHECK-NEXT:    retq
1574  %vec = load <16 x i32>, ptr %vp
1575  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1576  ret <8 x i32> %res
1577}
; Merge-masked permute of a loaded vector with lanes <8,4,1,13,15,4,6,12>:
; lanes where %mask == 0 take the shuffle result, the rest keep %vec2. The
; CHECK lines expect a {%k1}-masked zmm vpermd with the load folded in,
; writing over the register that holds %vec2.
1578define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1579; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
1580; CHECK:       # %bb.0:
1581; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1582; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,4,1,13,15,4,6,12]
1583; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1584; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
1585; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1586; CHECK-NEXT:    retq
1587  %vec = load <16 x i32>, ptr %vp
1588  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1589  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1590  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1591  ret <8 x i32> %res
1592}
1593
; Zero-masked permute of a loaded vector with lanes <8,4,1,13,15,4,6,12>:
; lanes where %mask == 0 take the shuffle result, the rest are zero. The
; CHECK lines expect a {%k1} {z} zmm vpermd with the load folded in.
1594define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) {
1595; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
1596; CHECK:       # %bb.0:
1597; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
1598; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
1599; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
1600; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1601; CHECK-NEXT:    retq
1602  %vec = load <16 x i32>, ptr %vp
1603  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1604  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1605  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1606  ret <8 x i32> %res
1607}
1608
; Unmasked permute of a loaded 16 x i32 vector with lanes <13,0,0,6>. The
; CHECK lines expect the 512-bit load folded into one zmm vpermps whose low
; xmm is returned.
1609define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
1610; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
1611; CHECK:       # %bb.0:
1612; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [13,0,0,6]
1613; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
1614; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1615; CHECK-NEXT:    vzeroupper
1616; CHECK-NEXT:    retq
1617  %vec = load <16 x i32>, ptr %vp
1618  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1619  ret <4 x i32> %res
1620}
; Merge-masked permute of a loaded vector with lanes <13,0,0,6>: lanes where
; %mask == 0 take the shuffle result, the rest keep %vec2. The CHECK lines
; expect a {%k1}-masked zmm vpermd with the load folded in, writing over the
; register that holds %vec2.
1621define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1622; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
1623; CHECK:       # %bb.0:
1624; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1625; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6]
1626; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1627; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
1628; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1629; CHECK-NEXT:    vzeroupper
1630; CHECK-NEXT:    retq
1631  %vec = load <16 x i32>, ptr %vp
1632  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1633  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1634  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1635  ret <4 x i32> %res
1636}
1637
; Zero-masked permute of a loaded vector with lanes <13,0,0,6>: lanes where
; %mask == 0 take the shuffle result, the rest are zero. The CHECK lines
; expect a {%k1} {z} zmm vpermd with the load folded in.
1638define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
1639; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
1640; CHECK:       # %bb.0:
1641; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6]
1642; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1643; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
1644; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1645; CHECK-NEXT:    vzeroupper
1646; CHECK-NEXT:    retq
1647  %vec = load <16 x i32>, ptr %vp
1648  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1649  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1650  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1651  ret <4 x i32> %res
1652}
1653
; Merge-masked permute of a loaded vector with lanes <7,13,11,10>, drawing
; from both 256-bit halves of the load. The CHECK lines expect an unmasked
; ymm vpermi2d over 32(%rdi) and (%rdi), then a separate {%k1}-masked xmm
; vmovdqa32 to merge with %vec2.
1654define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1655; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
1656; CHECK:       # %bb.0:
1657; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1658; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [15,5,3,2,0,0,0,0]
1659; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
1660; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1661; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1662; CHECK-NEXT:    vzeroupper
1663; CHECK-NEXT:    retq
1664  %vec = load <16 x i32>, ptr %vp
1665  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1666  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1667  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1668  ret <4 x i32> %res
1669}
1670
; Zero-masked permute of a loaded vector with lanes <7,13,11,10>, drawing
; from both halves of the load. The CHECK lines expect the zeroing mask
; folded into a {%k1} {z} ymm vpermi2d over 32(%rdi) and (%rdi); the low xmm
; is returned.
1671define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
1672; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
1673; CHECK:       # %bb.0:
1674; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1675; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [15,5,3,2,0,0,0,0]
1676; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1677; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1678; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1679; CHECK-NEXT:    vzeroupper
1680; CHECK-NEXT:    retq
1681  %vec = load <16 x i32>, ptr %vp
1682  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1683  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1684  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1685  ret <4 x i32> %res
1686}
1687
; Merge-masked permute of a loaded vector with lanes <2,15,6,9>: lanes where
; %mask == 0 take the shuffle result, the rest keep %vec2. The CHECK lines
; expect a {%k1}-masked zmm vpermd with the load folded in, writing over the
; register that holds %vec2.
1688define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1689; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
1690; CHECK:       # %bb.0:
1691; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1692; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9]
1693; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1694; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
1695; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1696; CHECK-NEXT:    vzeroupper
1697; CHECK-NEXT:    retq
1698  %vec = load <16 x i32>, ptr %vp
1699  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1700  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1701  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1702  ret <4 x i32> %res
1703}
1704
; Zero-masked permute of a loaded vector with lanes <2,15,6,9>: lanes where
; %mask == 0 take the shuffle result, the rest are zero. The CHECK lines
; expect a {%k1} {z} zmm vpermd with the load folded in.
1705define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
1706; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
1707; CHECK:       # %bb.0:
1708; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,15,6,9]
1709; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1710; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
1711; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1712; CHECK-NEXT:    vzeroupper
1713; CHECK-NEXT:    retq
1714  %vec = load <16 x i32>, ptr %vp
1715  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1716  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1717  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1718  ret <4 x i32> %res
1719}
1720
; Unmasked permute of a loaded vector with lanes <6,0,7,2>, all within the
; first eight elements. The CHECK lines expect a 128-bit two-source vpermi2d
; over 16(%rdi) (elements 4-7) and (%rdi) (elements 0-3), with no 256/512-bit
; access.
1721define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) {
1722; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
1723; CHECK:       # %bb.0:
1724; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
1725; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [2,4,3,6]
1726; CHECK-NEXT:    vpermi2d (%rdi), %xmm1, %xmm0
1727; CHECK-NEXT:    retq
1728  %vec = load <16 x i32>, ptr %vp
1729  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1730  ret <4 x i32> %res
1731}
; Merge-masked permute of a loaded vector with lanes <6,0,7,2>. The CHECK
; lines expect an unmasked xmm vpermi2d over 16(%rdi) and (%rdi), then a
; separate {%k1}-masked vmovdqa32 to merge with %vec2.
1732define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1733; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
1734; CHECK:       # %bb.0:
1735; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
1736; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,4,3,6]
1737; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm3
1738; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1739; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1740; CHECK-NEXT:    retq
1741  %vec = load <16 x i32>, ptr %vp
1742  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1743  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1744  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1745  ret <4 x i32> %res
1746}
1747
; Zero-masked permute of a loaded vector with lanes <6,0,7,2>. The CHECK
; lines expect the zeroing mask folded into a {%k1} {z} xmm vpermi2d over
; 16(%rdi) and (%rdi).
1748define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) {
1749; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
1750; CHECK:       # %bb.0:
1751; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
1752; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,4,3,6]
1753; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1754; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1755; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1756; CHECK-NEXT:    retq
1757  %vec = load <16 x i32>, ptr %vp
1758  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1759  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1760  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1761  ret <4 x i32> %res
1762}
1763
; Unmasked permute with lanes <12,9,4,10>; the expected code differs between
; the two RUN configurations. With +fast-variable-crosslane-shuffle
; (CHECK-FAST) a single zmm vpermps is expected. With only
; +fast-variable-perlane-shuffle (CHECK-FAST-PERLANE) the expected sequence
; permutes the extracted upper 256 bits, then blends in element 4 from the
; extracted second 128-bit lane with an xmm vpermi2d.
1764define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
1765; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9:
1766; CHECK-FAST:       # %bb.0:
1767; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [12,9,4,10]
1768; CHECK-FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
1769; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1770; CHECK-FAST-NEXT:    vzeroupper
1771; CHECK-FAST-NEXT:    retq
1772;
1773; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9:
1774; CHECK-FAST-PERLANE:       # %bb.0:
1775; CHECK-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,0,2]
1776; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
1777; CHECK-FAST-PERLANE-NEXT:    vpermd %ymm2, %ymm1, %ymm1
1778; CHECK-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm2
1779; CHECK-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,1,4,3]
1780; CHECK-FAST-PERLANE-NEXT:    vpermi2d %xmm2, %xmm1, %xmm0
1781; CHECK-FAST-PERLANE-NEXT:    vzeroupper
1782; CHECK-FAST-PERLANE-NEXT:    retq
1783  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
1784  ret <4 x i32> %res
1785}
1786
; Unmasked baseline: pick lanes <2,0> of a 4 x i64 register argument. The
; CHECK lines expect an immediate-controlled ymm vpermpd (no index register)
; whose low xmm is returned.
1787define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
1788; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0:
1789; CHECK:       # %bb.0:
1790; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
1791; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1792; CHECK-NEXT:    vzeroupper
1793; CHECK-NEXT:    retq
1794  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1795  ret <2 x i64> %res
1796}
; Merge-masked permute with lanes <2,0>: lanes where %mask == 0 take the
; shuffle result, the rest keep %vec2. The CHECK lines expect a {%k1}-masked
; immediate vpermq written over the register holding %vec2.
1797define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1798; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
1799; CHECK:       # %bb.0:
1800; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1801; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
1802; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,3]
1803; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1804; CHECK-NEXT:    vzeroupper
1805; CHECK-NEXT:    retq
1806  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1807  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1808  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
1809  ret <2 x i64> %res
1810}
1811
; Zero-masked permute with lanes <2,0>: lanes where %mask == 0 take the
; shuffle result, the rest are zero. The CHECK lines expect a {%k1} {z}
; immediate vpermq whose low xmm is returned.
1812define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
1813; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
1814; CHECK:       # %bb.0:
1815; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
1816; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
1817; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1818; CHECK-NEXT:    vzeroupper
1819; CHECK-NEXT:    retq
1820  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1821  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1822  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
1823  ret <2 x i64> %res
1824}
; Merge-masked permute with lanes <2,1>: lanes where %mask == 0 take the
; shuffle result, the rest keep %vec2. The CHECK lines expect a {%k1}-masked
; immediate vpermq written over the register holding %vec2.
1825define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1826; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
1827; CHECK:       # %bb.0:
1828; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1829; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
1830; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3]
1831; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1832; CHECK-NEXT:    vzeroupper
1833; CHECK-NEXT:    retq
1834  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1835  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1836  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
1837  ret <2 x i64> %res
1838}
1839
; Zero-masked permute with lanes <2,1>: lanes where %mask == 0 take the
; shuffle result, the rest are zero. The CHECK lines expect a {%k1} {z}
; immediate vpermq whose low xmm is returned.
1840define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
1841; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
1842; CHECK:       # %bb.0:
1843; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
1844; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
1845; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1846; CHECK-NEXT:    vzeroupper
1847; CHECK-NEXT:    retq
1848  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1849  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1850  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
1851  ret <2 x i64> %res
1852}
; Unmasked narrowing shuffle of a loaded vector: result = elements <1, 3> of
; the <4 x i64> loaded from %vp.
; Both run lines expect a 128-bit load of the first 16 bytes and a vunpckhpd
; that reads the second 16 bytes directly from memory.
1853define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
1854; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
1855; CHECK:       # %bb.0:
1856; CHECK-NEXT:    vmovaps (%rdi), %xmm0
1857; CHECK-NEXT:    vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1]
1858; CHECK-NEXT:    retq
1859  %vec = load <4 x i64>, ptr %vp
1860  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1861  ret <2 x i64> %res
1862}
; Merge-masked narrowing shuffle of a loaded vector: result = elements <1, 3>
; of the <4 x i64> loaded from %vp in lanes where %mask is zero, else the
; matching lane of %vec2.
; Both run lines expect the mask to be folded into a vpunpckhqdq that takes its
; second source from memory.
1863define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1864; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
1865; CHECK:       # %bb.0:
1866; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
1867; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
1868; CHECK-NEXT:    vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1]
1869; CHECK-NEXT:    retq
1870  %vec = load <4 x i64>, ptr %vp
1871  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1872  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1873  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
1874  ret <2 x i64> %res
1875}
1876
; Zero-masked narrowing shuffle of a loaded vector: result = elements <1, 3>
; of the <4 x i64> loaded from %vp in lanes where %mask is zero, and 0 in
; every other lane.
; Both run lines expect a zero-masked vpunpckhqdq with a memory source.
1877define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
1878; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
1879; CHECK:       # %bb.0:
1880; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
1881; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
1882; CHECK-NEXT:    vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1]
1883; CHECK-NEXT:    retq
1884  %vec = load <4 x i64>, ptr %vp
1885  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1886  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1887  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
1888  ret <2 x i64> %res
1889}
1890
; Merge-masked narrowing shuffle of a loaded vector: result = elements <2, 1>
; of the <4 x i64> loaded from %vp in lanes where %mask is zero, else the
; matching lane of %vec2.
; Both run lines expect an unmasked vpblendd (memory source) to form the
; shuffle, with the mask applied afterwards by a separate masked vmovdqa64.
1891define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1892; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
1893; CHECK:       # %bb.0:
1894; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
1895; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
1896; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
1897; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
1898; CHECK-NEXT:    retq
1899  %vec = load <4 x i64>, ptr %vp
1900  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1901  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1902  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
1903  ret <2 x i64> %res
1904}
1905
; Zero-masked narrowing shuffle of a loaded vector: result = elements <2, 1>
; of the <4 x i64> loaded from %vp in lanes where %mask is zero, and 0 in
; every other lane.
; Both run lines expect an unmasked vpblendd (memory source) followed by a
; separate zero-masked vmovdqa64.
1906define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
1907; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
1908; CHECK:       # %bb.0:
1909; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
1910; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
1911; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
1912; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
1913; CHECK-NEXT:    retq
1914  %vec = load <4 x i64>, ptr %vp
1915  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1916  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1917  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
1918  ret <2 x i64> %res
1919}
1920
; Unmasked narrowing shuffle: result = elements <6, 7, 6, 5> of the <8 x i64>
; %vec, i.e. every index lies in the upper four elements.
; Both run lines expect the upper 256 bits to be extracted and then permuted
; with an immediate vpermpd.
1921define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
1922; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
1923; CHECK:       # %bb.0:
1924; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
1925; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
1926; CHECK-NEXT:    retq
1927  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1928  ret <4 x i64> %res
1929}
; Merge-masked narrowing shuffle: result = elements <6, 7, 6, 5> of the
; <8 x i64> %vec in lanes where %mask is zero, else the matching lane of %vec2.
; Both run lines expect vextracti64x4 of the upper half followed by a
; merge-masked immediate vpermq.
1930define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1931; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
1932; CHECK:       # %bb.0:
1933; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1934; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1935; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
1936; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1937; CHECK-NEXT:    retq
1938  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1939  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1940  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1941  ret <4 x i64> %res
1942}
1943
; Zero-masked narrowing shuffle: result = elements <6, 7, 6, 5> of the
; <8 x i64> %vec in lanes where %mask is zero, and 0 in every other lane.
; Both run lines expect vextracti64x4 of the upper half followed by a
; zero-masked immediate vpermq.
1944define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
1945; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
1946; CHECK:       # %bb.0:
1947; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1948; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1949; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
1950; CHECK-NEXT:    retq
1951  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1952  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1953  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1954  ret <4 x i64> %res
1955}
; Merge-masked narrowing shuffle: result = elements <6, 4, 6, 1> of the
; <8 x i64> %vec in lanes where %mask is zero, else the matching lane of %vec2.
; The run with +fast-variable-crosslane-shuffle expects a single merge-masked
; variable-index vpermq on zmm; the run with only +fast-variable-perlane-shuffle
; expects vextracti64x4 + vpblendd followed by a masked immediate vpermq.
1956define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1957; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1958; CHECK-FAST:       # %bb.0:
1959; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1960; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1]
1961; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1962; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
1963; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
1964; CHECK-FAST-NEXT:    retq
1965;
1966; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1967; CHECK-FAST-PERLANE:       # %bb.0:
1968; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
1969; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
1970; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1971; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
1972; CHECK-FAST-PERLANE-NEXT:    vmovdqa %ymm1, %ymm0
1973; CHECK-FAST-PERLANE-NEXT:    retq
1974  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1975  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1976  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1977  ret <4 x i64> %res
1978}
1979
; Zero-masked narrowing shuffle: result = elements <6, 4, 6, 1> of the
; <8 x i64> %vec in lanes where %mask is zero, and 0 in every other lane.
; The run with +fast-variable-crosslane-shuffle expects a single zero-masked
; variable-index vpermq on zmm; the run with only +fast-variable-perlane-shuffle
; expects vextracti64x4 + vpblendd followed by a zero-masked immediate vpermq.
1980define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
1981; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1982; CHECK-FAST:       # %bb.0:
1983; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,4,6,1]
1984; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1985; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
1986; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1987; CHECK-FAST-NEXT:    retq
1988;
1989; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1990; CHECK-FAST-PERLANE:       # %bb.0:
1991; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
1992; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
1993; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1994; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
1995; CHECK-FAST-PERLANE-NEXT:    retq
1996  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1997  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1998  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1999  ret <4 x i64> %res
2000}
; Merge-masked narrowing shuffle: result = elements <6, 3, 6, 3> of the
; <8 x i64> %vec in lanes where %mask is zero, else the matching lane of %vec2.
; The run with +fast-variable-crosslane-shuffle expects a single merge-masked
; variable-index vpermq on zmm; the run with only +fast-variable-perlane-shuffle
; expects vextracti64x4 + vpblendd followed by a masked immediate vpermq.
2001define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2002; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
2003; CHECK-FAST:       # %bb.0:
2004; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2005; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3]
2006; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2007; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
2008; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
2009; CHECK-FAST-NEXT:    retq
2010;
2011; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
2012; CHECK-FAST-PERLANE:       # %bb.0:
2013; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
2014; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
2015; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2016; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
2017; CHECK-FAST-PERLANE-NEXT:    vmovdqa %ymm1, %ymm0
2018; CHECK-FAST-PERLANE-NEXT:    retq
2019  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2020  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2021  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2022  ret <4 x i64> %res
2023}
2024
; Zero-masked narrowing shuffle: result = elements <6, 3, 6, 3> of the
; <8 x i64> %vec in lanes where %mask is zero, and 0 in every other lane.
; The run with +fast-variable-crosslane-shuffle expects a single zero-masked
; variable-index vpermq on zmm; the run with only +fast-variable-perlane-shuffle
; expects vextracti64x4 + vpblendd followed by a zero-masked immediate vpermq.
2025define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
2026; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
2027; CHECK-FAST:       # %bb.0:
2028; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,3,6,3]
2029; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2030; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2031; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2032; CHECK-FAST-NEXT:    retq
2033;
2034; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
2035; CHECK-FAST-PERLANE:       # %bb.0:
2036; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
2037; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
2038; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2039; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
2040; CHECK-FAST-PERLANE-NEXT:    retq
2041  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2042  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2043  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2044  ret <4 x i64> %res
2045}
; Unmasked narrowing shuffle: result = elements <6, 0, 0, 7> of the <8 x i64>
; %vec, mixing elements from the lower and upper four.
; The run with +fast-variable-crosslane-shuffle expects one variable-index
; vpermpd on zmm; the run with only +fast-variable-perlane-shuffle expects
; vextractf64x4 + vblendps followed by an immediate vpermpd.
2046define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
2047; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask3:
2048; CHECK-FAST:       # %bb.0:
2049; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [6,0,0,7]
2050; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
2051; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2052; CHECK-FAST-NEXT:    retq
2053;
2054; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask3:
2055; CHECK-FAST-PERLANE:       # %bb.0:
2056; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
2057; CHECK-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2058; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
2059; CHECK-FAST-PERLANE-NEXT:    retq
2060  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2061  ret <4 x i64> %res
2062}
; Merge-masked narrowing shuffle: result = elements <6, 0, 0, 7> of the
; <8 x i64> %vec in lanes where %mask is zero, else the matching lane of %vec2.
; The run with +fast-variable-crosslane-shuffle expects a single merge-masked
; variable-index vpermq on zmm; the run with only +fast-variable-perlane-shuffle
; expects vextracti64x4 + vpblendd followed by a masked immediate vpermq.
2063define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2064; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2065; CHECK-FAST:       # %bb.0:
2066; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2067; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,0,0,7]
2068; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2069; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
2070; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
2071; CHECK-FAST-NEXT:    retq
2072;
2073; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2074; CHECK-FAST-PERLANE:       # %bb.0:
2075; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
2076; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2077; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2078; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
2079; CHECK-FAST-PERLANE-NEXT:    vmovdqa %ymm1, %ymm0
2080; CHECK-FAST-PERLANE-NEXT:    retq
2081  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2082  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2083  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2084  ret <4 x i64> %res
2085}
2086
; Zero-masked narrowing shuffle: result = elements <6, 0, 0, 7> of the
; <8 x i64> %vec in lanes where %mask is zero, and 0 in every other lane.
; The run with +fast-variable-crosslane-shuffle expects a single zero-masked
; variable-index vpermq on zmm; the run with only +fast-variable-perlane-shuffle
; expects vextracti64x4 + vpblendd followed by a zero-masked immediate vpermq.
2087define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
2088; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2089; CHECK-FAST:       # %bb.0:
2090; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,0,0,7]
2091; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2092; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2093; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2094; CHECK-FAST-NEXT:    retq
2095;
2096; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2097; CHECK-FAST-PERLANE:       # %bb.0:
2098; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
2099; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2100; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2101; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
2102; CHECK-FAST-PERLANE-NEXT:    retq
2103  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2104  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2105  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2106  ret <4 x i64> %res
2107}
; Merge-masked narrowing shuffle: result = elements <3, 7, 7, 5> of the
; <8 x i64> %vec in lanes where %mask is zero, else the matching lane of %vec2.
; The run with +fast-variable-crosslane-shuffle expects a single merge-masked
; variable-index vpermq on zmm; the run with only +fast-variable-perlane-shuffle
; expects vextracti64x4 + vpunpckhqdq followed by a masked immediate vpermq.
2108define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2109; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2110; CHECK-FAST:       # %bb.0:
2111; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2112; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,7,7,5]
2113; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2114; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
2115; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
2116; CHECK-FAST-NEXT:    retq
2117;
2118; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2119; CHECK-FAST-PERLANE:       # %bb.0:
2120; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
2121; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
2122; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2123; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1]
2124; CHECK-FAST-PERLANE-NEXT:    vmovdqa %ymm1, %ymm0
2125; CHECK-FAST-PERLANE-NEXT:    retq
2126  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2127  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2128  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2129  ret <4 x i64> %res
2130}
2131
; Zero-masked narrowing shuffle: result = elements <3, 7, 7, 5> of the
; <8 x i64> %vec in lanes where %mask is zero, and 0 in every other lane.
; The run with +fast-variable-crosslane-shuffle expects a single zero-masked
; variable-index vpermq on zmm; the run with only +fast-variable-perlane-shuffle
; expects vextracti64x4 + vpunpckhqdq followed by a zero-masked immediate vpermq.
2132define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
2133; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2134; CHECK-FAST:       # %bb.0:
2135; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [3,7,7,5]
2136; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2137; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2138; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2139; CHECK-FAST-NEXT:    retq
2140;
2141; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2142; CHECK-FAST-PERLANE:       # %bb.0:
2143; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
2144; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
2145; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2146; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1]
2147; CHECK-FAST-PERLANE-NEXT:    retq
2148  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2149  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2150  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2151  ret <4 x i64> %res
2152}
; Merge-masked narrowing shuffle: result = elements <4, 1, 0, 6> of the
; <8 x i64> %vec in lanes where %mask is zero, else the matching lane of %vec2.
; Both run lines expect the same code here: a single merge-masked
; variable-index vpermq on zmm.
2153define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2154; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
2155; CHECK:       # %bb.0:
2156; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2157; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,1,0,6]
2158; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2159; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
2160; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2161; CHECK-NEXT:    retq
2162  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2163  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2164  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2165  ret <4 x i64> %res
2166}
2167
; Zero-masked narrowing shuffle: result = elements <4, 1, 0, 6> of the
; <8 x i64> %vec in lanes where %mask is zero, and 0 in every other lane.
; Both run lines expect the same code here: a single zero-masked
; variable-index vpermq on zmm.
2168define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
2169; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
2170; CHECK:       # %bb.0:
2171; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [4,1,0,6]
2172; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2173; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2174; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2175; CHECK-NEXT:    retq
2176  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2177  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2178  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2179  ret <4 x i64> %res
2180}
; Unmasked narrowing shuffle: result = elements <7, 6, 5, 3> of the <8 x i64>
; %vec.
; The run with +fast-variable-crosslane-shuffle expects one variable-index
; vpermpd on zmm; the run with only +fast-variable-perlane-shuffle expects
; vextractf64x4, an immediate vpermpd of the upper half, then a vblendps that
; takes the last element from the original register.
2181define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
2182; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6:
2183; CHECK-FAST:       # %bb.0:
2184; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,6,5,3]
2185; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
2186; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2187; CHECK-FAST-NEXT:    retq
2188;
2189; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask6:
2190; CHECK-FAST-PERLANE:       # %bb.0:
2191; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
2192; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3]
2193; CHECK-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2194; CHECK-FAST-PERLANE-NEXT:    retq
2195  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2196  ret <4 x i64> %res
2197}
; Merge-masked narrowing shuffle: result = elements <7, 6, 5, 3> of the
; <8 x i64> %vec in lanes where %mask is zero, else the matching lane of %vec2.
; The run with +fast-variable-crosslane-shuffle expects a single merge-masked
; variable-index vpermq on zmm; the run with only +fast-variable-perlane-shuffle
; expects an unmasked vextracti64x4 + vpermq + vpblendd sequence with the mask
; applied last by a separate vpblendmq.
2198define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2199; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2200; CHECK-FAST:       # %bb.0:
2201; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2202; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,6,5,3]
2203; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2204; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
2205; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
2206; CHECK-FAST-NEXT:    retq
2207;
2208; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2209; CHECK-FAST-PERLANE:       # %bb.0:
2210; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
2211; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
2212; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2213; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2214; CHECK-FAST-PERLANE-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2215; CHECK-FAST-PERLANE-NEXT:    retq
2216  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2217  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2218  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2219  ret <4 x i64> %res
2220}
2221
; Zero-masked narrowing shuffle: result = elements <7, 6, 5, 3> of the
; <8 x i64> %vec in lanes where %mask is zero, and 0 in every other lane.
; The run with +fast-variable-crosslane-shuffle expects a single zero-masked
; variable-index vpermq on zmm; the run with only +fast-variable-perlane-shuffle
; expects an unmasked vextracti64x4 + vpermq + vpblendd sequence with the mask
; applied last by a separate zero-masked vmovdqa64.
2222define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
2223; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2224; CHECK-FAST:       # %bb.0:
2225; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,6,5,3]
2226; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2227; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2228; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2229; CHECK-FAST-NEXT:    retq
2230;
2231; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2232; CHECK-FAST-PERLANE:       # %bb.0:
2233; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
2234; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
2235; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2236; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2237; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
2238; CHECK-FAST-PERLANE-NEXT:    retq
2239  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2240  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2241  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2242  ret <4 x i64> %res
2243}
; Merge-masked narrowing shuffle: result = elements <2, 0, 3, 4> of the
; <8 x i64> %vec in lanes where %mask is zero, else the matching lane of %vec2.
; Both run lines expect the same code here: a single merge-masked
; variable-index vpermq on zmm.
2244define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2245; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
2246; CHECK:       # %bb.0:
2247; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2248; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [2,0,3,4]
2249; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2250; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
2251; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2252; CHECK-NEXT:    retq
2253  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2254  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2255  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2256  ret <4 x i64> %res
2257}
; Zero-masked narrowing shuffle: result = elements <2, 0, 3, 4> of the
; <8 x i64> %vec in lanes where %mask is zero, and 0 in every other lane.
; Both run lines expect the same code here: a single zero-masked
; variable-index vpermq on zmm.
2258define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
2259; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
2260; CHECK:       # %bb.0:
2261; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [2,0,3,4]
2262; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2263; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2264; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2265; CHECK-NEXT:    retq
2266  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2267  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2268  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2269  ret <4 x i64> %res
2270}
2271
; Unmasked narrowing shuffle from 8 to 2 elements: result = elements <3, 0> of
; the <8 x i64> %vec.
; Both run lines expect one immediate vpermpd on zmm, with the low 128 bits
; returned in xmm0.
2272define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
2273; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
2274; CHECK:       # %bb.0:
2275; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
2276; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2277; CHECK-NEXT:    vzeroupper
2278; CHECK-NEXT:    retq
2279  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2280  ret <2 x i64> %res
2281}
; Merge-masked narrowing shuffle from 8 to 2 elements: result = elements <3, 0>
; of the <8 x i64> %vec in lanes where %mask is zero, else the matching lane of
; %vec2.
; Both run lines expect one merge-masked immediate vpermq on zmm whose low
; 128 bits are then copied to xmm0.
2282define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2283; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
2284; CHECK:       # %bb.0:
2285; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
2286; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
2287; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,2,3,7,4,6,7]
2288; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
2289; CHECK-NEXT:    vzeroupper
2290; CHECK-NEXT:    retq
2291  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2292  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2293  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
2294  ret <2 x i64> %res
2295}
; Zero-masked narrowing shuffle from 8 to 2 elements: result = elements <3, 0>
; of the <8 x i64> %vec in lanes where %mask is zero, and 0 in every other lane.
; Both run lines expect one zero-masked immediate vpermq on zmm, with the low
; 128 bits returned in xmm0.
2296define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
2297; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
2298; CHECK:       # %bb.0:
2299; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
2300; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7]
2301; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2302; CHECK-NEXT:    vzeroupper
2303; CHECK-NEXT:    retq
2304  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2305  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2306  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
2307  ret <2 x i64> %res
2308}
2309
; Merge-masked narrowing shuffle from 8 to 2 elements: result = elements <6, 5>
; of the <8 x i64> %vec in lanes where %mask is zero, else the matching lane of
; %vec2.
; Both run lines expect vextracti64x4 of the upper half followed by a
; merge-masked immediate vpermq on ymm.
2310define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2311; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
2312; CHECK:       # %bb.0:
2313; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
2314; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
2315; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
2316; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3]
2317; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
2318; CHECK-NEXT:    vzeroupper
2319; CHECK-NEXT:    retq
2320  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2321  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2322  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
2323  ret <2 x i64> %res
2324}
; Zero-masked narrowing shuffle from 8 to 2 elements: result = elements <6, 5>
; of the <8 x i64> %vec in lanes where %mask is zero, and 0 in every other lane.
; Both run lines expect vextracti64x4 of the upper half followed by a
; zero-masked immediate vpermq on ymm.
2325define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
2326; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
2327; CHECK:       # %bb.0:
2328; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
2329; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
2330; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
2331; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2332; CHECK-NEXT:    vzeroupper
2333; CHECK-NEXT:    retq
2334  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2335  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2336  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
2337  ret <2 x i64> %res
2338}
2339
; Unmasked narrowing shuffle of a loaded vector: result = elements
; <0, 2, 0, 2> of the <8 x i64> loaded from %vp, i.e. only the lower four
; elements are referenced.
; Both run lines expect a single immediate vpermpd with a ymm-sized memory
; operand at (%rdi).
2340define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(ptr %vp) {
2341; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
2342; CHECK:       # %bb.0:
2343; CHECK-NEXT:    vpermpd $136, (%rdi), %ymm0 # ymm0 = mem[0,2,0,2]
2344; CHECK-NEXT:    retq
2345  %vec = load <8 x i64>, ptr %vp
2346  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2347  ret <4 x i64> %res
2348}
; Merge-masked narrowing shuffle of a loaded vector: result = elements
; <0, 2, 0, 2> of the <8 x i64> loaded from %vp in lanes where %mask is zero,
; else the matching lane of %vec2.
; Both run lines expect the load and the mask to be folded into a single
; merge-masked immediate vpermq.
2349define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2350; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
2351; CHECK:       # %bb.0:
2352; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2353; CHECK-NEXT:    vpermq $136, (%rdi), %ymm0 {%k1} # ymm0 {%k1} = mem[0,2,0,2]
2354; CHECK-NEXT:    retq
2355  %vec = load <8 x i64>, ptr %vp
2356  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2357  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2358  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2359  ret <4 x i64> %res
2360}
; Zero-masked narrowing shuffle of a loaded vector: result = elements
; <0, 2, 0, 2> of the <8 x i64> loaded from %vp in lanes where %mask is zero,
; and 0 in every other lane.
; Both run lines expect the load and the mask to be folded into a single
; zero-masked immediate vpermq.
2361define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %mask) {
2362; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
2363; CHECK:       # %bb.0:
2364; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2365; CHECK-NEXT:    vpermq $136, (%rdi), %ymm0 {%k1} {z} # ymm0 {%k1} {z} = mem[0,2,0,2]
2366; CHECK-NEXT:    retq
2367  %vec = load <8 x i64>, ptr %vp
2368  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2369  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2370  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2371  ret <4 x i64> %res
2372}
2373
; Merge-masked narrowing shuffle of a loaded vector: result = elements
; <0, 7, 6, 0> of the <8 x i64> loaded from %vp in lanes where %mask is zero,
; else the matching lane of %vec2.
; The run with +fast-variable-crosslane-shuffle expects an unmasked vpermi2q
; over the two 32-byte halves (the first half as a memory operand) followed by
; a separate masked vmovdqa64; the run with only +fast-variable-perlane-shuffle
; expects a vpblendd with a memory source followed by a masked immediate vpermq.
2374define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2375; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2376; CHECK-FAST:       # %bb.0:
2377; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
2378; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,3,2,4]
2379; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
2380; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2381; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2382; CHECK-FAST-NEXT:    retq
2383;
2384; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2385; CHECK-FAST-PERLANE:       # %bb.0:
2386; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
2387; CHECK-FAST-PERLANE-NEXT:    vpblendd $15, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
2388; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2389; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
2390; CHECK-FAST-PERLANE-NEXT:    retq
2391  %vec = load <8 x i64>, ptr %vp
2392  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2393  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2394  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2395  ret <4 x i64> %res
2396}
2397
; Zero-masked narrowing shuffle of a loaded vector: result = elements
; <0, 7, 6, 0> of the <8 x i64> loaded from %vp in lanes where %mask is zero,
; and 0 in every other lane.
; The run with +fast-variable-crosslane-shuffle expects the mask folded into a
; zero-masked vpermi2q (first 32 bytes as a memory operand) followed by a
; register copy; the run with only +fast-variable-perlane-shuffle expects a
; vpblendd with a memory source followed by a zero-masked immediate vpermq.
2398define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) {
2399; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2400; CHECK-FAST:       # %bb.0:
2401; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
2402; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,3,2,4]
2403; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2404; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2405; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
2406; CHECK-FAST-NEXT:    retq
2407;
2408; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2409; CHECK-FAST-PERLANE:       # %bb.0:
2410; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
2411; CHECK-FAST-PERLANE-NEXT:    vpblendd $15, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
2412; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2413; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
2414; CHECK-FAST-PERLANE-NEXT:    retq
2415  %vec = load <8 x i64>, ptr %vp
2416  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2417  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2418  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2419  ret <4 x i64> %res
2420}
2421
; Merge-masked narrowing shuffle of a loaded vector: result = elements
; <7, 1, 1, 5> of the <8 x i64> loaded from %vp in lanes where %mask is zero,
; else the matching lane of %vec2.
; The run with +fast-variable-crosslane-shuffle expects an unmasked vpermi2q
; over the two 32-byte halves (the first half as a memory operand) followed by
; a separate masked vmovdqa64; the run with only +fast-variable-perlane-shuffle
; expects a vpunpckhqdq with a memory source followed by a masked immediate
; vpermq.
2422define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2423; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2424; CHECK-FAST:       # %bb.0:
2425; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
2426; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,5,5,1]
2427; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
2428; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2429; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2430; CHECK-FAST-NEXT:    retq
2431;
2432; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2433; CHECK-FAST-PERLANE:       # %bb.0:
2434; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
2435; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
2436; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2437; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0]
2438; CHECK-FAST-PERLANE-NEXT:    retq
2439  %vec = load <8 x i64>, ptr %vp
2440  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2441  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2442  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2443  ret <4 x i64> %res
2444}
2445
; Zero-masked narrowing shuffle of a loaded vector: result = elements
; <7, 1, 1, 5> of the <8 x i64> loaded from %vp in lanes where %mask is zero,
; and 0 in every other lane.
; The run with +fast-variable-crosslane-shuffle expects the mask folded into a
; zero-masked vpermi2q (first 32 bytes as a memory operand) followed by a
; register copy; the run with only +fast-variable-perlane-shuffle expects a
; vpunpckhqdq with a memory source followed by a zero-masked immediate vpermq.
2446define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) {
2447; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2448; CHECK-FAST:       # %bb.0:
2449; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
2450; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [3,5,5,1]
2451; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2452; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2453; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
2454; CHECK-FAST-NEXT:    retq
2455;
2456; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2457; CHECK-FAST-PERLANE:       # %bb.0:
2458; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
2459; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
2460; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2461; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0]
2462; CHECK-FAST-PERLANE-NEXT:    retq
2463  %vec = load <8 x i64>, ptr %vp
2464  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2465  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2466  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2467  ret <4 x i64> %res
2468}
2469
; Unmasked partial permute from memory: loads <8 x i64> from %vp and returns
; elements <7,0,0,2> as a <4 x i64>. Expected code differs between the llc run
; that enables +fast-variable-crosslane-shuffle and the perlane-only run.
2470define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
2471; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2472; CHECK-FAST:       # %bb.0:
2473; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2]
2474; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
2475; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2476; CHECK-FAST-NEXT:    retq
2477;
2478; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2479; CHECK-FAST-PERLANE:       # %bb.0:
2480; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
2481; CHECK-FAST-PERLANE-NEXT:    vpalignr $8, 32(%rdi), %ymm0, %ymm0 # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
2482; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3]
2483; CHECK-FAST-PERLANE-NEXT:    retq
2484  %vec = load <8 x i64>, ptr %vp
2485  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2486  ret <4 x i64> %res
2487}
; Merge-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <7,0,0,2>, and keeps the shuffled lane where %mask == 0, otherwise
; the lane of %vec2. Expected code differs between the llc run that enables
; +fast-variable-crosslane-shuffle and the perlane-only run.
2488define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2489; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2490; CHECK-FAST:       # %bb.0:
2491; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2492; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,0,0,2]
2493; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2494; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
2495; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2496; CHECK-FAST-NEXT:    retq
2497;
2498; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2499; CHECK-FAST-PERLANE:       # %bb.0:
2500; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm2
2501; CHECK-FAST-PERLANE-NEXT:    vpalignr $8, 32(%rdi), %ymm2, %ymm2 # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
2502; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2503; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3]
2504; CHECK-FAST-PERLANE-NEXT:    retq
2505  %vec = load <8 x i64>, ptr %vp
2506  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2507  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2508  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2509  ret <4 x i64> %res
2510}
2511
; Zero-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <7,0,0,2>, and keeps the shuffled lane where %mask == 0, otherwise
; zero. Expected code differs between the llc run that enables
; +fast-variable-crosslane-shuffle and the perlane-only run.
2512define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) {
2513; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2514; CHECK-FAST:       # %bb.0:
2515; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2]
2516; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2517; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
2518; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2519; CHECK-FAST-NEXT:    retq
2520;
2521; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2522; CHECK-FAST-PERLANE:       # %bb.0:
2523; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm1
2524; CHECK-FAST-PERLANE-NEXT:    vpalignr $8, 32(%rdi), %ymm1, %ymm1 # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
2525; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2526; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3]
2527; CHECK-FAST-PERLANE-NEXT:    retq
2528  %vec = load <8 x i64>, ptr %vp
2529  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2530  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2531  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2532  ret <4 x i64> %res
2533}
2534
; Merge-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <0,4,6,1>, and keeps the shuffled lane where %mask == 0, otherwise
; the lane of %vec2. Both llc configurations share one expected sequence.
2535define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2536; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
2537; CHECK:       # %bb.0:
2538; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
2539; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,0,2,5]
2540; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
2541; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2542; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2543; CHECK-NEXT:    retq
2544  %vec = load <8 x i64>, ptr %vp
2545  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2546  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2547  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2548  ret <4 x i64> %res
2549}
2550
; Zero-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <0,4,6,1>, and keeps the shuffled lane where %mask == 0, otherwise
; zero. Both llc configurations share one expected sequence.
2551define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) {
2552; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
2553; CHECK:       # %bb.0:
2554; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
2555; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,0,2,5]
2556; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2557; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2558; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2559; CHECK-NEXT:    retq
2560  %vec = load <8 x i64>, ptr %vp
2561  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2562  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2563  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2564  ret <4 x i64> %res
2565}
2566
; Merge-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <0,2,7,1>, and keeps the shuffled lane where %mask == 0, otherwise
; the lane of %vec2. Expected code differs between the llc run that enables
; +fast-variable-crosslane-shuffle and the perlane-only run.
2567define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2568; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2569; CHECK-FAST:       # %bb.0:
2570; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2571; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,2,7,1]
2572; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2573; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
2574; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2575; CHECK-FAST-NEXT:    retq
2576;
2577; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2578; CHECK-FAST-PERLANE:       # %bb.0:
2579; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm2
2580; CHECK-FAST-PERLANE-NEXT:    vpblendd $192, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
2581; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2582; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
2583; CHECK-FAST-PERLANE-NEXT:    retq
2584  %vec = load <8 x i64>, ptr %vp
2585  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2586  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2587  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2588  ret <4 x i64> %res
2589}
2590
; Zero-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <0,2,7,1>, and keeps the shuffled lane where %mask == 0, otherwise
; zero. Expected code differs between the llc run that enables
; +fast-variable-crosslane-shuffle and the perlane-only run.
2591define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) {
2592; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2593; CHECK-FAST:       # %bb.0:
2594; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1]
2595; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2596; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
2597; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2598; CHECK-FAST-NEXT:    retq
2599;
2600; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2601; CHECK-FAST-PERLANE:       # %bb.0:
2602; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm1
2603; CHECK-FAST-PERLANE-NEXT:    vpblendd $192, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
2604; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2605; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
2606; CHECK-FAST-PERLANE-NEXT:    retq
2607  %vec = load <8 x i64>, ptr %vp
2608  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2609  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2610  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2611  ret <4 x i64> %res
2612}
2613
; Unmasked partial permute from memory: loads <8 x i64> from %vp and returns
; elements <7,2,3,2> as a <4 x i64>. Both llc configurations share one
; expected sequence.
2614define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
2615; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
2616; CHECK:       # %bb.0:
2617; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2]
2618; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
2619; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2620; CHECK-NEXT:    retq
2621  %vec = load <8 x i64>, ptr %vp
2622  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2623  ret <4 x i64> %res
2624}
; Merge-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <7,2,3,2>, and keeps the shuffled lane where %mask == 0, otherwise
; the lane of %vec2. Both llc configurations share one expected sequence.
2625define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2626; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
2627; CHECK:       # %bb.0:
2628; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2629; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,2,3,2]
2630; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2631; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
2632; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2633; CHECK-NEXT:    retq
2634  %vec = load <8 x i64>, ptr %vp
2635  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2636  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2637  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2638  ret <4 x i64> %res
2639}
2640
; Zero-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <7,2,3,2>, and keeps the shuffled lane where %mask == 0, otherwise
; zero. Both llc configurations share one expected sequence.
2641define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) {
2642; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
2643; CHECK:       # %bb.0:
2644; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2]
2645; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2646; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
2647; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2648; CHECK-NEXT:    retq
2649  %vec = load <8 x i64>, ptr %vp
2650  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2651  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2652  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2653  ret <4 x i64> %res
2654}
2655
; Merge-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <7,7,5,1>, and keeps the shuffled lane where %mask == 0, otherwise
; the lane of %vec2. Expected code differs between the llc run that enables
; +fast-variable-crosslane-shuffle and the perlane-only run.
2656define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2657; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2658; CHECK-FAST:       # %bb.0:
2659; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
2660; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,3,1,5]
2661; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
2662; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2663; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2664; CHECK-FAST-NEXT:    retq
2665;
2666; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2667; CHECK-FAST-PERLANE:       # %bb.0:
2668; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
2669; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
2670; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2671; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1]
2672; CHECK-FAST-PERLANE-NEXT:    retq
2673  %vec = load <8 x i64>, ptr %vp
2674  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2675  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2676  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2677  ret <4 x i64> %res
2678}
2679
; Zero-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <7,7,5,1>, and keeps the shuffled lane where %mask == 0, otherwise
; zero. Expected code differs between the llc run that enables
; +fast-variable-crosslane-shuffle and the perlane-only run.
2680define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) {
2681; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2682; CHECK-FAST:       # %bb.0:
2683; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
2684; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [3,3,1,5]
2685; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2686; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2687; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
2688; CHECK-FAST-NEXT:    retq
2689;
2690; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2691; CHECK-FAST-PERLANE:       # %bb.0:
2692; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
2693; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
2694; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2695; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1]
2696; CHECK-FAST-PERLANE-NEXT:    retq
2697  %vec = load <8 x i64>, ptr %vp
2698  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2699  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2700  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2701  ret <4 x i64> %res
2702}
2703
; Unmasked partial permute from memory: loads <8 x i64> from %vp and returns
; elements <4,1> as a <2 x i64>. Expected code differs between the llc run
; that enables +fast-variable-crosslane-shuffle and the perlane-only run.
2704define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
2705; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
2706; CHECK-FAST:       # %bb.0:
2707; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [4,1]
2708; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
2709; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2710; CHECK-FAST-NEXT:    vzeroupper
2711; CHECK-FAST-NEXT:    retq
2712;
2713; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
2714; CHECK-FAST-PERLANE:       # %bb.0:
2715; CHECK-FAST-PERLANE-NEXT:    vmovaps 32(%rdi), %xmm0
2716; CHECK-FAST-PERLANE-NEXT:    vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
2717; CHECK-FAST-PERLANE-NEXT:    retq
2718  %vec = load <8 x i64>, ptr %vp
2719  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2720  ret <2 x i64> %res
2721}
; Merge-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <4,1>, and keeps the shuffled lane where %mask == 0, otherwise the
; lane of %vec2. Expected code differs between the llc run that enables
; +fast-variable-crosslane-shuffle and the perlane-only run.
2722define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2723; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
2724; CHECK-FAST:       # %bb.0:
2725; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2726; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [4,1]
2727; CHECK-FAST-NEXT:    vptestnmq %xmm1, %xmm1, %k1
2728; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
2729; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2730; CHECK-FAST-NEXT:    vzeroupper
2731; CHECK-FAST-NEXT:    retq
2732;
2733; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
2734; CHECK-FAST-PERLANE:       # %bb.0:
2735; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm2
2736; CHECK-FAST-PERLANE-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
2737; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm1, %xmm1, %k1
2738; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
2739; CHECK-FAST-PERLANE-NEXT:    retq
2740  %vec = load <8 x i64>, ptr %vp
2741  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2742  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2743  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
2744  ret <2 x i64> %res
2745}
2746
; Zero-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <4,1>, and keeps the shuffled lane where %mask == 0, otherwise
; zero. Expected code differs between the llc run that enables
; +fast-variable-crosslane-shuffle and the perlane-only run.
2747define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
2748; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
2749; CHECK-FAST:       # %bb.0:
2750; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [4,1]
2751; CHECK-FAST-NEXT:    vptestnmq %xmm0, %xmm0, %k1
2752; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
2753; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2754; CHECK-FAST-NEXT:    vzeroupper
2755; CHECK-FAST-NEXT:    retq
2756;
2757; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
2758; CHECK-FAST-PERLANE:       # %bb.0:
2759; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm1
2760; CHECK-FAST-PERLANE-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
2761; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm0, %xmm0, %k1
2762; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2763; CHECK-FAST-PERLANE-NEXT:    retq
2764  %vec = load <8 x i64>, ptr %vp
2765  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2766  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2767  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
2768  ret <2 x i64> %res
2769}
2770
; Merge-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <6,2>, and keeps the shuffled lane where %mask == 0, otherwise the
; lane of %vec2. Both llc configurations share one expected sequence.
2771define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2772; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
2773; CHECK:       # %bb.0:
2774; CHECK-NEXT:    vmovdqa 48(%rdi), %xmm2
2775; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
2776; CHECK-NEXT:    vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
2777; CHECK-NEXT:    retq
2778  %vec = load <8 x i64>, ptr %vp
2779  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2780  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2781  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
2782  ret <2 x i64> %res
2783}
2784
; Zero-masked partial permute from memory: loads <8 x i64> from %vp, picks
; elements <6,2>, and keeps the shuffled lane where %mask == 0, otherwise
; zero. Both llc configurations share one expected sequence.
2785define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
2786; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
2787; CHECK:       # %bb.0:
2788; CHECK-NEXT:    vmovdqa 48(%rdi), %xmm1
2789; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
2790; CHECK-NEXT:    vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
2791; CHECK-NEXT:    retq
2792  %vec = load <8 x i64>, ptr %vp
2793  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2794  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2795  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
2796  ret <2 x i64> %res
2797}
2798
; Unmasked partial permute of a register operand: returns elements <0,3,4,5>
; of the <8 x float> %vec as a <4 x float>. Both llc configurations share one
; expected sequence.
2799define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
2800; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
2801; CHECK:       # %bb.0:
2802; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
2803; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
2804; CHECK-NEXT:    vzeroupper
2805; CHECK-NEXT:    retq
2806  %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2807  ret <4 x float> %res
2808}
; Merge-masked partial permute of a register operand: picks elements <0,3,4,5>
; of %vec and keeps the shuffled lane where %mask compares ordered-equal
; (fcmp oeq) to zero, otherwise the lane of %vec2. Both llc configurations
; share one expected sequence.
2809define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2810; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
2811; CHECK:       # %bb.0:
2812; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
2813; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
2814; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
2815; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
2816; CHECK-NEXT:    vmovaps %xmm1, %xmm0
2817; CHECK-NEXT:    vzeroupper
2818; CHECK-NEXT:    retq
2819  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2820  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2821  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2822  ret <4 x float> %res
2823}
2824
; Zero-masked partial permute of a register operand: picks elements <0,3,4,5>
; of %vec and keeps the shuffled lane where %mask compares ordered-equal
; (fcmp oeq) to zero, otherwise zero. Both llc configurations share one
; expected sequence.
2825define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
2826; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
2827; CHECK:       # %bb.0:
2828; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
2829; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2830; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
2831; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
2832; CHECK-NEXT:    vzeroupper
2833; CHECK-NEXT:    retq
2834  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2835  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2836  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2837  ret <4 x float> %res
2838}
; Merge-masked partial permute of a register operand: picks elements <1,3,5,0>
; of %vec and keeps the shuffled lane where %mask compares ordered-equal
; (fcmp oeq) to zero, otherwise the lane of %vec2. Both llc configurations
; share one expected sequence.
2839define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2840; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
2841; CHECK:       # %bb.0:
2842; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
2843; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,3,5,0]
2844; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
2845; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
2846; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
2847; CHECK-NEXT:    vmovaps %xmm1, %xmm0
2848; CHECK-NEXT:    vzeroupper
2849; CHECK-NEXT:    retq
2850  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2851  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2852  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2853  ret <4 x float> %res
2854}
2855
; Zero-masked partial permute of a register operand: picks elements <1,3,5,0>
; of %vec and keeps the shuffled lane where %mask compares ordered-equal
; (fcmp oeq) to zero, otherwise zero. Both llc configurations share one
; expected sequence.
2856define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
2857; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
2858; CHECK:       # %bb.0:
2859; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,3,5,0]
2860; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2861; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
2862; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2863; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2864; CHECK-NEXT:    vzeroupper
2865; CHECK-NEXT:    retq
2866  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2867  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2868  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2869  ret <4 x float> %res
2870}
; Merge-masked partial permute of a register operand: picks elements <3,2,7,0>
; of %vec and keeps the shuffled lane where %mask compares ordered-equal
; (fcmp oeq) to zero, otherwise the lane of %vec2. Both llc configurations
; share one expected sequence.
2871define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2872; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
2873; CHECK:       # %bb.0:
2874; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
2875; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [3,2,7,0]
2876; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
2877; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
2878; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
2879; CHECK-NEXT:    vmovaps %xmm1, %xmm0
2880; CHECK-NEXT:    vzeroupper
2881; CHECK-NEXT:    retq
2882  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2883  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2884  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2885  ret <4 x float> %res
2886}
2887
; Zero-masked partial permute of a register operand: picks elements <3,2,7,0>
; of %vec and keeps the shuffled lane where %mask compares ordered-equal
; (fcmp oeq) to zero, otherwise zero. Both llc configurations share one
; expected sequence.
2888define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
2889; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
2890; CHECK:       # %bb.0:
2891; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,2,7,0]
2892; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2893; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
2894; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2895; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2896; CHECK-NEXT:    vzeroupper
2897; CHECK-NEXT:    retq
2898  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2899  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2900  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2901  ret <4 x float> %res
2902}
; Unmasked partial permute of a register operand: returns elements <3,3,5,2>
; of the <8 x float> %vec as a <4 x float>. Both llc configurations share one
; expected sequence.
2903define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
2904; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
2905; CHECK:       # %bb.0:
2906; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,3,5,2]
2907; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2908; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2909; CHECK-NEXT:    vzeroupper
2910; CHECK-NEXT:    retq
2911  %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2912  ret <4 x float> %res
2913}
; Merge-masked partial permute of a register operand: picks elements <3,3,5,2>
; of %vec and keeps the shuffled lane where %mask compares ordered-equal
; (fcmp oeq) to zero, otherwise the lane of %vec2. Both llc configurations
; share one expected sequence.
2914define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2915; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
2916; CHECK:       # %bb.0:
2917; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
2918; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [3,3,5,2]
2919; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
2920; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
2921; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
2922; CHECK-NEXT:    vmovaps %xmm1, %xmm0
2923; CHECK-NEXT:    vzeroupper
2924; CHECK-NEXT:    retq
2925  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2926  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2927  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2928  ret <4 x float> %res
2929}
2930
; Zero-masked partial permute of a register operand: picks elements <3,3,5,2>
; of %vec and keeps the shuffled lane where %mask compares ordered-equal
; (fcmp oeq) to zero, otherwise zero. Both llc configurations share one
; expected sequence.
2931define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
2932; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
2933; CHECK:       # %bb.0:
2934; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,3,5,2]
2935; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2936; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
2937; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2938; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2939; CHECK-NEXT:    vzeroupper
2940; CHECK-NEXT:    retq
2941  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2942  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2943  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2944  ret <4 x float> %res
2945}
; Unmasked partial permute from memory: loads <8 x float> from %vp and returns
; elements <6,2,4,5> as a <4 x float>. Both llc configurations share one
; expected sequence.
2946define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
2947; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
2948; CHECK:       # %bb.0:
2949; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
2950; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [2,6,0,1]
2951; CHECK-NEXT:    vpermi2ps (%rdi), %xmm1, %xmm0
2952; CHECK-NEXT:    retq
2953  %vec = load <8 x float>, ptr %vp
2954  %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2955  ret <4 x float> %res
2956}
; Merge-masked partial permute from memory: loads <8 x float> from %vp, picks
; elements <6,2,4,5>, and keeps the shuffled lane where %mask compares
; ordered-equal (fcmp oeq) to zero, otherwise the lane of %vec2. Both llc
; configurations share one expected sequence.
2957define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
2958; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
2959; CHECK:       # %bb.0:
2960; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
2961; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,6,0,1]
2962; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
2963; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
2964; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
2965; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
2966; CHECK-NEXT:    retq
2967  %vec = load <8 x float>, ptr %vp
2968  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2969  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2970  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2971  ret <4 x float> %res
2972}
2973
; Zero-masked partial permute from memory: loads <8 x float> from %vp, picks
; elements <6,2,4,5>, and keeps the shuffled lane where %mask compares
; ordered-equal (fcmp oeq) to zero, otherwise zero. Both llc configurations
; share one expected sequence.
2974define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
2975; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
2976; CHECK:       # %bb.0:
2977; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
2978; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,6,0,1]
2979; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2980; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
2981; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
2982; CHECK-NEXT:    vmovaps %xmm1, %xmm0
2983; CHECK-NEXT:    retq
2984  %vec = load <8 x float>, ptr %vp
2985  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2986  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2987  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2988  ret <4 x float> %res
2989}
2990
; Merge-masked partial permute from memory: loads <8 x float> from %vp, picks
; elements <6,3,3,6>, and keeps the shuffled lane where %mask compares
; ordered-equal (fcmp oeq) to zero, otherwise the lane of %vec2. Both llc
; configurations share one expected sequence.
2991define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
2992; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
2993; CHECK:       # %bb.0:
2994; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
2995; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,7,7,2]
2996; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
2997; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
2998; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
2999; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
3000; CHECK-NEXT:    retq
3001  %vec = load <8 x float>, ptr %vp
3002  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
3003  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3004  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3005  ret <4 x float> %res
3006}
3007
; Zero-masked partial permute from memory: loads <8 x float> from %vp, picks
; elements <6,3,3,6>, and keeps the shuffled lane where %mask compares
; ordered-equal (fcmp oeq) to zero, otherwise zero. Both llc configurations
; share one expected sequence.
3008define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
3009; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
3010; CHECK:       # %bb.0:
3011; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
3012; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,7,7,2]
3013; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3014; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
3015; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
3016; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3017; CHECK-NEXT:    retq
3018  %vec = load <8 x float>, ptr %vp
3019  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
3020  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3021  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3022  ret <4 x float> %res
3023}
3024
; Merge-masked partial permute from memory: loads <8 x float> from %vp, picks
; elements <3,1,3,7>, and keeps the shuffled lane where %mask compares
; ordered-equal (fcmp oeq) to zero, otherwise the lane of %vec2. Both llc
; configurations share one expected sequence.
3025define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3026; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
3027; CHECK:       # %bb.0:
3028; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
3029; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,1,3,7]
3030; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3031; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
3032; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
3033; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3034; CHECK-NEXT:    vzeroupper
3035; CHECK-NEXT:    retq
3036  %vec = load <8 x float>, ptr %vp
3037  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
3038  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3039  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3040  ret <4 x float> %res
3041}
3042
; Zero-masked partial permute from memory: loads <8 x float> from %vp, picks
; elements <3,1,3,7>, and keeps the shuffled lane where %mask compares
; ordered-equal (fcmp oeq) to zero, otherwise zero. Both llc configurations
; share one expected sequence.
3043define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
3044; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
3045; CHECK:       # %bb.0:
3046; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,1,3,7]
3047; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3048; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
3049; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
3050; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3051; CHECK-NEXT:    vzeroupper
3052; CHECK-NEXT:    retq
3053  %vec = load <8 x float>, ptr %vp
3054  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
3055  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3056  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3057  ret <4 x float> %res
3058}
3059
; Unmasked partial permute from memory: loads <8 x float> from %vp and returns
; elements <1,3,5,3> as a <4 x float>. Both llc configurations share one
; expected sequence.
3060define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
3061; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
3062; CHECK:       # %bb.0:
3063; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,3,5,3]
3064; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
3065; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3066; CHECK-NEXT:    vzeroupper
3067; CHECK-NEXT:    retq
3068  %vec = load <8 x float>, ptr %vp
3069  %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3070  ret <4 x float> %res
3071}
; Merge-masked partial permute from memory: loads <8 x float> from %vp, picks
; elements <1,3,5,3>, and keeps the shuffled lane where %mask compares
; ordered-equal (fcmp oeq) to zero, otherwise the lane of %vec2. Both llc
; configurations share one expected sequence.
3072define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3073; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
3074; CHECK:       # %bb.0:
3075; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
3076; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3]
3077; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3078; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
3079; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
3080; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3081; CHECK-NEXT:    vzeroupper
3082; CHECK-NEXT:    retq
3083  %vec = load <8 x float>, ptr %vp
3084  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3085  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3086  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3087  ret <4 x float> %res
3088}
3089
; Zero-masked form of the <1,3,5,3> memory shuffle: lanes whose %mask element
; compares oeq to 0.0 take the shuffled value, all other lanes are zero.
; Expected lowering: one zero-masked vpermps reading the source from memory.
3090define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
3091; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
3092; CHECK:       # %bb.0:
3093; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,3,5,3]
3094; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3095; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
3096; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
3097; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3098; CHECK-NEXT:    vzeroupper
3099; CHECK-NEXT:    retq
3100  %vec = load <8 x float>, ptr %vp
3101  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3102  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3103  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3104  ret <4 x float> %res
3105}
3106
; Unmasked baseline: returns elements <0,4,12,10,8,2,11,7> of a <16 x float>
; register argument as an <8 x float>. Expected lowering: a single 512-bit
; vpermps whose low 256 bits are the result.
3107define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
3108; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
3109; CHECK:       # %bb.0:
3110; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7]
3111; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
3112; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3113; CHECK-NEXT:    retq
3114  %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3115  ret <8 x float> %res
3116}
; Merge-masked form of the <0,4,12,10,8,2,11,7> shuffle: lanes whose %mask
; element compares oeq to 0.0 take the shuffled value, the rest keep %vec2.
; Expected lowering: a merge-masked 512-bit vpermps followed by a register copy
; of the low 256 bits into the return register.
3117define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3118; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
3119; CHECK:       # %bb.0:
3120; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
3121; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7]
3122; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
3123; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
3124; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
3125; CHECK-NEXT:    vmovaps %ymm1, %ymm0
3126; CHECK-NEXT:    retq
3127  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3128  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3129  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3130  ret <8 x float> %res
3131}
3132
; Zero-masked form of the <0,4,12,10,8,2,11,7> shuffle: lanes whose %mask
; element compares oeq to 0.0 take the shuffled value, the rest are zero.
; Expected lowering: a single zero-masked 512-bit vpermps.
3133define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
3134; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
3135; CHECK:       # %bb.0:
3136; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
3137; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3138; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
3139; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3140; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3141; CHECK-NEXT:    retq
3142  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3143  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3144  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3145  ret <8 x float> %res
3146}
; Merge-masked shuffle <10,12,3,12,4,15,1,14> of a <16 x float> into an
; <8 x float>: lanes whose %mask element compares oeq to 0.0 take the shuffled
; value, the rest keep %vec2. Expected lowering: merge-masked 512-bit vpermps.
3147define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3148; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
3149; CHECK:       # %bb.0:
3150; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
3151; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14]
3152; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
3153; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
3154; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
3155; CHECK-NEXT:    vmovaps %ymm1, %ymm0
3156; CHECK-NEXT:    retq
3157  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
3158  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3159  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3160  ret <8 x float> %res
3161}
3162
; Zero-masked shuffle <10,12,3,12,4,15,1,14> of a <16 x float> into an
; <8 x float>: lanes whose %mask element compares oeq to 0.0 take the shuffled
; value, the rest are zero. Expected lowering: zero-masked 512-bit vpermps.
3163define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
3164; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
3165; CHECK:       # %bb.0:
3166; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14]
3167; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3168; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
3169; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3170; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3171; CHECK-NEXT:    retq
3172  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
3173  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3174  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3175  ret <8 x float> %res
3176}
; Merge-masked shuffle <0,4,8,9,6,1,4,4> of a <16 x float> into an <8 x float>:
; lanes whose %mask element compares oeq to 0.0 take the shuffled value, the
; rest keep %vec2. Expected lowering: merge-masked 512-bit vpermps.
3177define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3178; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
3179; CHECK:       # %bb.0:
3180; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
3181; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4]
3182; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
3183; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
3184; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
3185; CHECK-NEXT:    vmovaps %ymm1, %ymm0
3186; CHECK-NEXT:    retq
3187  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3188  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3189  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3190  ret <8 x float> %res
3191}
3192
; Zero-masked shuffle <0,4,8,9,6,1,4,4> of a <16 x float> into an <8 x float>:
; lanes whose %mask element compares oeq to 0.0 take the shuffled value, the
; rest are zero. Expected lowering: zero-masked 512-bit vpermps.
3193define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
3194; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
3195; CHECK:       # %bb.0:
3196; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4]
3197; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3198; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
3199; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3200; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3201; CHECK-NEXT:    retq
3202  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3203  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3204  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3205  ret <8 x float> %res
3206}
; Unmasked baseline: returns elements <12,14,9,0,12,4,5,8> of a <16 x float>
; register argument as an <8 x float>. Expected lowering: a single 512-bit
; vpermps whose low 256 bits are the result.
3207define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
3208; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
3209; CHECK:       # %bb.0:
3210; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8]
3211; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
3212; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3213; CHECK-NEXT:    retq
3214  %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3215  ret <8 x float> %res
3216}
; Merge-masked form of the <12,14,9,0,12,4,5,8> shuffle: lanes whose %mask
; element compares oeq to 0.0 take the shuffled value, the rest keep %vec2.
; Expected lowering: merge-masked 512-bit vpermps plus a copy to the return
; register.
3217define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3218; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
3219; CHECK:       # %bb.0:
3220; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
3221; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8]
3222; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
3223; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
3224; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
3225; CHECK-NEXT:    vmovaps %ymm1, %ymm0
3226; CHECK-NEXT:    retq
3227  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3228  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3229  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3230  ret <8 x float> %res
3231}
3232
; Zero-masked form of the <12,14,9,0,12,4,5,8> shuffle: lanes whose %mask
; element compares oeq to 0.0 take the shuffled value, the rest are zero.
; Expected lowering: a single zero-masked 512-bit vpermps.
3233define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
3234; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
3235; CHECK:       # %bb.0:
3236; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8]
3237; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3238; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
3239; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3240; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3241; CHECK-NEXT:    retq
3242  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3243  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3244  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3245  ret <8 x float> %res
3246}
; Unmasked baseline: returns elements <4,8,9,10> of a <16 x float> register
; argument as a <4 x float>. Expected lowering: a single 512-bit vpermps whose
; low 128 bits are the result.
3247define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
3248; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
3249; CHECK:       # %bb.0:
3250; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,8,9,10]
3251; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
3252; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3253; CHECK-NEXT:    vzeroupper
3254; CHECK-NEXT:    retq
3255  %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3256  ret <4 x float> %res
3257}
; Merge-masked form of the <4,8,9,10> shuffle: lanes whose %mask element
; compares oeq to 0.0 take the shuffled value, the rest keep %vec2. Expected
; lowering: merge-masked 512-bit vpermps plus a copy to the return register.
3258define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3259; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
3260; CHECK:       # %bb.0:
3261; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
3262; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,8,9,10]
3263; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
3264; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
3265; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
3266; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3267; CHECK-NEXT:    vzeroupper
3268; CHECK-NEXT:    retq
3269  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3270  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3271  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3272  ret <4 x float> %res
3273}
3274
; Zero-masked form of the <4,8,9,10> shuffle: lanes whose %mask element
; compares oeq to 0.0 take the shuffled value, the rest are zero. Expected
; lowering: a single zero-masked 512-bit vpermps.
3275define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
3276; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
3277; CHECK:       # %bb.0:
3278; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,8,9,10]
3279; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3280; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
3281; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3282; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3283; CHECK-NEXT:    vzeroupper
3284; CHECK-NEXT:    retq
3285  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3286  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3287  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3288  ret <4 x float> %res
3289}
; Merge-masked shuffle <8,6,10,6> of a <16 x float> into a <4 x float>: lanes
; whose %mask element compares oeq to 0.0 take the shuffled value, the rest
; keep %vec2. The two RUN configurations are expected to differ here:
;  - with +fast-variable-crosslane-shuffle: one merge-masked 512-bit vpermps;
;  - with only +fast-variable-perlane-shuffle: the two 128-bit pieces holding
;    the needed elements are extracted, combined with a 128-bit vpermi2ps
;    (indices [0,6,2,6]) and then blended with %vec2 via vblendmps.
3290define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3291; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3292; CHECK-FAST:       # %bb.0:
3293; CHECK-FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
3294; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [8,6,10,6]
3295; CHECK-FAST-NEXT:    vxorps %xmm4, %xmm4, %xmm4
3296; CHECK-FAST-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
3297; CHECK-FAST-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
3298; CHECK-FAST-NEXT:    vmovaps %xmm1, %xmm0
3299; CHECK-FAST-NEXT:    vzeroupper
3300; CHECK-FAST-NEXT:    retq
3301;
3302; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3303; CHECK-FAST-PERLANE:       # %bb.0:
3304; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
3305; CHECK-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm0
3306; CHECK-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [0,6,2,6]
3307; CHECK-FAST-PERLANE-NEXT:    vpermi2ps %xmm0, %xmm3, %xmm4
3308; CHECK-FAST-PERLANE-NEXT:    vxorps %xmm0, %xmm0, %xmm0
3309; CHECK-FAST-PERLANE-NEXT:    vcmpeqps %xmm0, %xmm2, %k1
3310; CHECK-FAST-PERLANE-NEXT:    vblendmps %xmm4, %xmm1, %xmm0 {%k1}
3311; CHECK-FAST-PERLANE-NEXT:    vzeroupper
3312; CHECK-FAST-PERLANE-NEXT:    retq
3313  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3314  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3315  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3316  ret <4 x float> %res
3317}
3318
; Zero-masked shuffle <8,6,10,6> of a <16 x float> into a <4 x float>: lanes
; whose %mask element compares oeq to 0.0 take the shuffled value, the rest
; are zero. The two RUN configurations are expected to differ here:
;  - with +fast-variable-crosslane-shuffle: one zero-masked 512-bit vpermps;
;  - with only +fast-variable-perlane-shuffle: two 128-bit extracts feeding a
;    zero-masked 128-bit vpermi2ps (indices [0,6,2,6]).
3319define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
3320; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3321; CHECK-FAST:       # %bb.0:
3322; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [8,6,10,6]
3323; CHECK-FAST-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3324; CHECK-FAST-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
3325; CHECK-FAST-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3326; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3327; CHECK-FAST-NEXT:    vzeroupper
3328; CHECK-FAST-NEXT:    retq
3329;
3330; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3331; CHECK-FAST-PERLANE:       # %bb.0:
3332; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
3333; CHECK-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm3
3334; CHECK-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,2,6]
3335; CHECK-FAST-PERLANE-NEXT:    vxorps %xmm4, %xmm4, %xmm4
3336; CHECK-FAST-PERLANE-NEXT:    vcmpeqps %xmm4, %xmm1, %k1
3337; CHECK-FAST-PERLANE-NEXT:    vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
3338; CHECK-FAST-PERLANE-NEXT:    vzeroupper
3339; CHECK-FAST-PERLANE-NEXT:    retq
3340  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3341  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3342  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3343  ret <4 x float> %res
3344}
; Merge-masked shuffle <12,12,4,5> of a <16 x float> into a <4 x float>: lanes
; whose %mask element compares oeq to 0.0 take the shuffled value, the rest
; keep %vec2. Expected lowering uses no variable permute: the upper 256 bits
; are extracted, an immediate vshufps combines them with the lower 256 bits,
; and a merge-masked vextractf32x4 of the upper 128-bit lane yields the result.
3345define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3346; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
3347; CHECK:       # %bb.0:
3348; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
3349; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
3350; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3351; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
3352; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm1 {%k1}
3353; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3354; CHECK-NEXT:    vzeroupper
3355; CHECK-NEXT:    retq
3356  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3357  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3358  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3359  ret <4 x float> %res
3360}
3361
; Zero-masked shuffle <12,12,4,5> of a <16 x float> into a <4 x float>: lanes
; whose %mask element compares oeq to 0.0 take the shuffled value, the rest
; are zero. Expected lowering uses no variable permute: vextractf64x4 and an
; immediate vshufps build the data, then a zero-masked vextractf32x4 of the
; upper 128-bit lane produces the result.
3362define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
3363; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
3364; CHECK:       # %bb.0:
3365; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
3366; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
3367; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3368; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
3369; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
3370; CHECK-NEXT:    vzeroupper
3371; CHECK-NEXT:    retq
3372  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3373  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3374  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3375  ret <4 x float> %res
3376}
; Unmasked baseline: returns elements <10,2,11,6> of a <16 x float> register
; argument as a <4 x float>. Expected lowering: a single 512-bit vpermps whose
; low 128 bits are the result.
3377define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
3378; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
3379; CHECK:       # %bb.0:
3380; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [10,2,11,6]
3381; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
3382; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3383; CHECK-NEXT:    vzeroupper
3384; CHECK-NEXT:    retq
3385  %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3386  ret <4 x float> %res
3387}
; Merge-masked form of the <10,2,11,6> shuffle: lanes whose %mask element
; compares oeq to 0.0 take the shuffled value, the rest keep %vec2. Expected
; lowering: merge-masked 512-bit vpermps plus a copy to the return register.
3388define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3389; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
3390; CHECK:       # %bb.0:
3391; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
3392; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [10,2,11,6]
3393; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
3394; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
3395; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
3396; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3397; CHECK-NEXT:    vzeroupper
3398; CHECK-NEXT:    retq
3399  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3400  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3401  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3402  ret <4 x float> %res
3403}
3404
; Zero-masked form of the <10,2,11,6> shuffle: lanes whose %mask element
; compares oeq to 0.0 take the shuffled value, the rest are zero. Expected
; lowering: a single zero-masked 512-bit vpermps.
3405define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
3406; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
3407; CHECK:       # %bb.0:
3408; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [10,2,11,6]
3409; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3410; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
3411; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3412; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3413; CHECK-NEXT:    vzeroupper
3414; CHECK-NEXT:    retq
3415  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3416  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3417  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3418  ret <4 x float> %res
3419}
; Unmasked baseline, memory source: returns elements <7,6,7,11,5,10,0,4> of the
; <16 x float> loaded from %vp as an <8 x float>. Expected lowering: a single
; 512-bit vpermps with the load folded into its memory operand.
3420define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) {
3421; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
3422; CHECK:       # %bb.0:
3423; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
3424; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
3425; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3426; CHECK-NEXT:    retq
3427  %vec = load <16 x float>, ptr %vp
3428  %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3429  ret <8 x float> %res
3430}
; Merge-masked form of the <7,6,7,11,5,10,0,4> memory shuffle: lanes whose
; %mask element compares oeq to 0.0 take the shuffled value, the rest keep
; %vec2. Expected lowering: one merge-masked 512-bit vpermps reading memory.
3431define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3432; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
3433; CHECK:       # %bb.0:
3434; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3435; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [7,6,7,11,5,10,0,4]
3436; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3437; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
3438; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
3439; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3440; CHECK-NEXT:    retq
3441  %vec = load <16 x float>, ptr %vp
3442  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3443  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3444  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3445  ret <8 x float> %res
3446}
3447
; Zero-masked form of the <7,6,7,11,5,10,0,4> memory shuffle: lanes whose
; %mask element compares oeq to 0.0 take the shuffled value, the rest are
; zero. Expected lowering: one zero-masked 512-bit vpermps reading memory.
3448define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) {
3449; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
3450; CHECK:       # %bb.0:
3451; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
3452; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3453; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
3454; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
3455; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3456; CHECK-NEXT:    retq
3457  %vec = load <16 x float>, ptr %vp
3458  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3459  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3460  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3461  ret <8 x float> %res
3462}
3463
; Merge-masked shuffle <11,0,9,0,7,14,0,8> of the <16 x float> loaded from %vp
; into an <8 x float>: lanes whose %mask element compares oeq to 0.0 take the
; shuffled value, the rest keep %vec2. Expected lowering: one merge-masked
; 512-bit vpermps reading memory.
3464define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3465; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
3466; CHECK:       # %bb.0:
3467; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3468; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [11,0,9,0,7,14,0,8]
3469; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3470; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
3471; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
3472; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3473; CHECK-NEXT:    retq
3474  %vec = load <16 x float>, ptr %vp
3475  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3476  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3477  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3478  ret <8 x float> %res
3479}
3480
; Zero-masked shuffle <11,0,9,0,7,14,0,8> of the <16 x float> loaded from %vp
; into an <8 x float>: lanes whose %mask element compares oeq to 0.0 take the
; shuffled value, the rest are zero. Expected lowering: one zero-masked
; 512-bit vpermps reading memory.
3481define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) {
3482; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
3483; CHECK:       # %bb.0:
3484; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
3485; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3486; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
3487; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
3488; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3489; CHECK-NEXT:    retq
3490  %vec = load <16 x float>, ptr %vp
3491  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3492  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3493  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3494  ret <8 x float> %res
3495}
3496
; Merge-masked shuffle <1,13,10,11,10,0,0,9> of the <16 x float> loaded from
; %vp into an <8 x float>: lanes whose %mask element compares oeq to 0.0 take
; the shuffled value, the rest keep %vec2. Both RUN configurations are expected
; to use a 256-bit vpermi2ps followed by a masked move, but they differ in
; operand order and index vector:
;  - with +fast-variable-crosslane-shuffle: the upper 32 bytes are loaded into
;    a register and the lower 32 bytes are the memory operand;
;  - with only +fast-variable-perlane-shuffle: only the first 16 bytes are
;    loaded into a register and the upper 32 bytes are the memory operand.
3497define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3498; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3499; CHECK-FAST:       # %bb.0:
3500; CHECK-FAST-NEXT:    vmovaps 32(%rdi), %ymm2
3501; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1]
3502; CHECK-FAST-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
3503; CHECK-FAST-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3504; CHECK-FAST-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
3505; CHECK-FAST-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
3506; CHECK-FAST-NEXT:    retq
3507;
3508; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3509; CHECK-FAST-PERLANE:       # %bb.0:
3510; CHECK-FAST-PERLANE-NEXT:    vmovaps (%rdi), %xmm2
3511; CHECK-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,13,10,11,10,0,0,9]
3512; CHECK-FAST-PERLANE-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm3
3513; CHECK-FAST-PERLANE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3514; CHECK-FAST-PERLANE-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
3515; CHECK-FAST-PERLANE-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
3516; CHECK-FAST-PERLANE-NEXT:    retq
3517  %vec = load <16 x float>, ptr %vp
3518  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3519  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3520  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3521  ret <8 x float> %res
3522}
3523
; Zero-masked shuffle <1,13,10,11,10,0,0,9> of the <16 x float> loaded from
; %vp into an <8 x float>: lanes whose %mask element compares oeq to 0.0 take
; the shuffled value, the rest are zero. Both RUN configurations are expected
; to use a zero-masked 256-bit vpermi2ps, but they differ in which part of the
; source is loaded into a register versus folded as the memory operand, and
; therefore in the index vector.
3524define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) {
3525; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3526; CHECK-FAST:       # %bb.0:
3527; CHECK-FAST-NEXT:    vmovaps 32(%rdi), %ymm2
3528; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3529; CHECK-FAST-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3530; CHECK-FAST-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
3531; CHECK-FAST-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3532; CHECK-FAST-NEXT:    vmovaps %ymm1, %ymm0
3533; CHECK-FAST-NEXT:    retq
3534;
3535; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3536; CHECK-FAST-PERLANE:       # %bb.0:
3537; CHECK-FAST-PERLANE-NEXT:    vmovaps (%rdi), %xmm2
3538; CHECK-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9]
3539; CHECK-FAST-PERLANE-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3540; CHECK-FAST-PERLANE-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
3541; CHECK-FAST-PERLANE-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3542; CHECK-FAST-PERLANE-NEXT:    vmovaps %ymm1, %ymm0
3543; CHECK-FAST-PERLANE-NEXT:    retq
3544  %vec = load <16 x float>, ptr %vp
3545  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3546  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3547  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3548  ret <8 x float> %res
3549}
3550
; Unmasked baseline, memory source: returns elements <15,13,11,11,3,12,4,1> of
; the <16 x float> loaded from %vp as an <8 x float>. Expected lowering: the
; upper 32 bytes are loaded into a register and a 256-bit vpermi2ps combines
; them with the lower 32 bytes in memory; the index vector [7,5,3,3,11,4,12,9]
; is the shuffle mask renumbered for that operand order.
3551define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) {
3552; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
3553; CHECK:       # %bb.0:
3554; CHECK-NEXT:    vmovaps 32(%rdi), %ymm1
3555; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
3556; CHECK-NEXT:    vpermi2ps (%rdi), %ymm1, %ymm0
3557; CHECK-NEXT:    retq
3558  %vec = load <16 x float>, ptr %vp
3559  %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3560  ret <8 x float> %res
3561}
; Merge-masked form of the <15,13,11,11,3,12,4,1> memory shuffle: lanes whose
; %mask element compares oeq to 0.0 take the shuffled value, the rest keep
; %vec2. Expected lowering: an unmasked 256-bit vpermi2ps over the two source
; halves, then a merge-masked vmovaps into the register holding %vec2.
3562define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3563; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
3564; CHECK:       # %bb.0:
3565; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3566; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
3567; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
3568; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3569; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
3570; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
3571; CHECK-NEXT:    retq
3572  %vec = load <16 x float>, ptr %vp
3573  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3574  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3575  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3576  ret <8 x float> %res
3577}
3578
; Zero-masked form of the <15,13,11,11,3,12,4,1> memory shuffle: lanes whose
; %mask element compares oeq to 0.0 take the shuffled value, the rest are
; zero. Expected lowering: a zero-masked 256-bit vpermi2ps over the two source
; halves, then a copy to the return register.
3579define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) {
3580; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
3581; CHECK:       # %bb.0:
3582; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3583; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
3584; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3585; CHECK-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
3586; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3587; CHECK-NEXT:    vmovaps %ymm1, %ymm0
3588; CHECK-NEXT:    retq
3589  %vec = load <16 x float>, ptr %vp
3590  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3591  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3592  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3593  ret <8 x float> %res
3594}
3595
; Unmasked baseline, memory source: returns elements <14,6,7,11> of the
; <16 x float> loaded from %vp as a <4 x float>. Expected lowering: an
; immediate vpermpd ($231 = [3,1,2,3]) rearranges the upper 32 bytes so the
; needed elements sit in the low 128 bits, then a 128-bit vpermi2ps (indices
; [0,6,7,3]) merges them with the 16 bytes at offset 16.
3596define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
3597; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
3598; CHECK:       # %bb.0:
3599; CHECK-NEXT:    vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3]
3600; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,6,7,3]
3601; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm1, %xmm0
3602; CHECK-NEXT:    vzeroupper
3603; CHECK-NEXT:    retq
3604  %vec = load <16 x float>, ptr %vp
3605  %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3606  ret <4 x float> %res
3607}
; Merge-masked form of the <14,6,7,11> memory shuffle: lanes whose %mask
; element compares oeq to 0.0 take the shuffled value, the rest keep %vec2.
; Expected lowering: vpermpd + unmasked 128-bit vpermi2ps build the shuffle,
; then a merge-masked vmovaps writes it over %vec2.
3608define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3609; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
3610; CHECK:       # %bb.0:
3611; CHECK-NEXT:    vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
3612; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,6,7,3]
3613; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm3
3614; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3615; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
3616; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
3617; CHECK-NEXT:    vzeroupper
3618; CHECK-NEXT:    retq
3619  %vec = load <16 x float>, ptr %vp
3620  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3621  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3622  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3623  ret <4 x float> %res
3624}
3625
; Zero-masked form of the <14,6,7,11> memory shuffle: lanes whose %mask
; element compares oeq to 0.0 take the shuffled value, the rest are zero.
; Expected lowering: vpermpd on the upper 32 bytes, then a zero-masked 128-bit
; vpermi2ps with the 16 bytes at offset 16, then a copy to the return register.
3626define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
3627; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
3628; CHECK:       # %bb.0:
3629; CHECK-NEXT:    vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
3630; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,6,7,3]
3631; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3632; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
3633; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3634; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3635; CHECK-NEXT:    vzeroupper
3636; CHECK-NEXT:    retq
3637  %vec = load <16 x float>, ptr %vp
3638  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3639  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3640  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3641  ret <4 x float> %res
3642}
3643
; Merge-masked shuffle <8,2,14,7> of the <16 x float> loaded from %vp into a
; <4 x float>: lanes whose %mask element compares oeq to 0.0 take the shuffled
; value, the rest keep %vec2. Expected lowering: a 256-bit vpermi2ps over the
; upper half (register) and lower half (memory) with indices [0,10,6,15,...],
; then a merge-masked 128-bit vmovaps.
3644define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3645; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
3646; CHECK:       # %bb.0:
3647; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3648; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,10,6,15,0,0,0,0]
3649; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
3650; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3651; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
3652; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
3653; CHECK-NEXT:    vzeroupper
3654; CHECK-NEXT:    retq
3655  %vec = load <16 x float>, ptr %vp
3656  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3657  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3658  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3659  ret <4 x float> %res
3660}
3661
; Zero-masked shuffle <8,2,14,7> of the <16 x float> loaded from %vp into a
; <4 x float>: lanes whose %mask element compares oeq to 0.0 take the shuffled
; value, the rest are zero. Expected lowering: a zero-masked 256-bit vpermi2ps
; over the upper half (register) and lower half (memory), whose low 128 bits
; are copied to the return register.
3662define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
3663; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
3664; CHECK:       # %bb.0:
3665; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3666; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,10,6,15,0,0,0,0]
3667; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3668; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
3669; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3670; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3671; CHECK-NEXT:    vzeroupper
3672; CHECK-NEXT:    retq
3673  %vec = load <16 x float>, ptr %vp
3674  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3675  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3676  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3677  ret <4 x float> %res
3678}
3679
; Merge-masked test: loads <16 x float> from %vp and extracts elements
; 12, 6, 12, 6. Lanes where `fcmp oeq %mask, 0` holds take the extracted
; element; the remaining lanes take %vec2. The expected code permutes the
; two 256-bit halves with vpermt2ps and blends with a masked vmovaps.
3680define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3681; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
3682; CHECK:       # %bb.0:
3683; CHECK-NEXT:    vmovddup {{.*#+}} xmm2 = [4,14,4,14]
3684; CHECK-NEXT:    # xmm2 = mem[0,0]
3685; CHECK-NEXT:    vmovaps 32(%rdi), %ymm3
3686; CHECK-NEXT:    vpermt2ps (%rdi), %ymm2, %ymm3
3687; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3688; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
3689; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
3690; CHECK-NEXT:    vzeroupper
3691; CHECK-NEXT:    retq
3692  %vec = load <16 x float>, ptr %vp
3693  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3694  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3695  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3696  ret <4 x float> %res
3697}
3698
; Zero-masked test: loads <16 x float> from %vp and extracts elements
; 12, 6, 12, 6. Lanes where `fcmp oeq %mask, 0` holds take the extracted
; element; the remaining lanes are zero. The expected code folds the
; zeroing mask directly into vpermt2ps.
3699define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
3700; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
3701; CHECK:       # %bb.0:
3702; CHECK-NEXT:    vmovddup {{.*#+}} xmm2 = [4,14,4,14]
3703; CHECK-NEXT:    # xmm2 = mem[0,0]
3704; CHECK-NEXT:    vmovaps 32(%rdi), %ymm1
3705; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3706; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
3707; CHECK-NEXT:    vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3708; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3709; CHECK-NEXT:    vzeroupper
3710; CHECK-NEXT:    retq
3711  %vec = load <16 x float>, ptr %vp
3712  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3713  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3714  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3715  ret <4 x float> %res
3716}
3717
; Unmasked test: loads <16 x float> from %vp and extracts elements
; 3, 3, 15, 9. The expected code is a single vpermps on zmm with the load
; folded in as a memory operand.
3718define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
3719; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
3720; CHECK:       # %bb.0:
3721; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [3,3,15,9]
3722; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
3723; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3724; CHECK-NEXT:    vzeroupper
3725; CHECK-NEXT:    retq
3726  %vec = load <16 x float>, ptr %vp
3727  %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3728  ret <4 x float> %res
3729}
; Merge-masked test: loads <16 x float> from %vp and extracts elements
; 3, 3, 15, 9. Lanes where `fcmp oeq %mask, 0` holds take the extracted
; element; the remaining lanes take %vec2. The expected code is a
; merge-masked vpermps with a memory operand.
3730define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3731; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
3732; CHECK:       # %bb.0:
3733; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
3734; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
3735; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3736; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
3737; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
3738; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3739; CHECK-NEXT:    vzeroupper
3740; CHECK-NEXT:    retq
3741  %vec = load <16 x float>, ptr %vp
3742  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3743  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3744  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3745  ret <4 x float> %res
3746}
3747
; Zero-masked test: loads <16 x float> from %vp and extracts elements
; 3, 3, 15, 9. Lanes where `fcmp oeq %mask, 0` holds take the extracted
; element; the remaining lanes are zero. The expected code is a
; zero-masked vpermps with a memory operand.
3748define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
3749; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
3750; CHECK:       # %bb.0:
3751; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
3752; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3753; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
3754; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
3755; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3756; CHECK-NEXT:    vzeroupper
3757; CHECK-NEXT:    retq
3758  %vec = load <16 x float>, ptr %vp
3759  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3760  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3761  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3762  ret <4 x float> %res
3763}
3764
; Unmasked test: extracts elements 2 and 0 of a <4 x double> register
; argument. The expected code is a single immediate-form vpermpd on ymm.
3765define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
3766; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0:
3767; CHECK:       # %bb.0:
3768; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3769; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3770; CHECK-NEXT:    vzeroupper
3771; CHECK-NEXT:    retq
3772  %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3773  ret <2 x double> %res
3774}
; Merge-masked test: extracts elements 2 and 0 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; take %vec2. The expected code is a merge-masked immediate-form vpermpd.
3775define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3776; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
3777; CHECK:       # %bb.0:
3778; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
3779; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3780; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
3781; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,3]
3782; CHECK-NEXT:    vmovapd %xmm1, %xmm0
3783; CHECK-NEXT:    vzeroupper
3784; CHECK-NEXT:    retq
3785  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3786  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3787  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3788  ret <2 x double> %res
3789}
3790
; Zero-masked test: extracts elements 2 and 0 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; are zero. The expected code is a zero-masked immediate-form vpermpd.
3791define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
3792; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
3793; CHECK:       # %bb.0:
3794; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
3795; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
3796; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
3797; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3798; CHECK-NEXT:    vzeroupper
3799; CHECK-NEXT:    retq
3800  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3801  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3802  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3803  ret <2 x double> %res
3804}
; Merge-masked test: extracts elements 1 and 3 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; take %vec2. The expected code is a merge-masked immediate-form vpermpd.
3805define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3806; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
3807; CHECK:       # %bb.0:
3808; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
3809; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3810; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
3811; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[1,3,2,3]
3812; CHECK-NEXT:    vmovapd %xmm1, %xmm0
3813; CHECK-NEXT:    vzeroupper
3814; CHECK-NEXT:    retq
3815  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3816  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3817  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3818  ret <2 x double> %res
3819}
3820
; Zero-masked test: extracts elements 1 and 3 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; are zero. The expected code is a zero-masked immediate-form vpermpd.
3821define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
3822; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
3823; CHECK:       # %bb.0:
3824; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
3825; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
3826; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3]
3827; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3828; CHECK-NEXT:    vzeroupper
3829; CHECK-NEXT:    retq
3830  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3831  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3832  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3833  ret <2 x double> %res
3834}
; Unmasked test: loads <4 x double> from %vp and extracts elements 2 and 1.
; The expected code loads the low 128 bits and blends in the first double
; of the high 128 bits with vblendps; only xmm registers are used.
3835define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
3836; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
3837; CHECK:       # %bb.0:
3838; CHECK-NEXT:    vmovaps (%rdi), %xmm0
3839; CHECK-NEXT:    vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3]
3840; CHECK-NEXT:    retq
3841  %vec = load <4 x double>, ptr %vp
3842  %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3843  ret <2 x double> %res
3844}
; Merge-masked test: loads <4 x double> from %vp and extracts elements
; 2 and 1. Lanes where `fcmp oeq %mask, 0` holds take the extracted element;
; the remaining lanes take %vec2. The expected code builds the shuffle with
; vblendpd and applies the mask in a separate masked vmovapd.
3845define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
3846; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
3847; CHECK:       # %bb.0:
3848; CHECK-NEXT:    vmovapd (%rdi), %xmm2
3849; CHECK-NEXT:    vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1]
3850; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3851; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
3852; CHECK-NEXT:    vmovapd %xmm2, %xmm0 {%k1}
3853; CHECK-NEXT:    retq
3854  %vec = load <4 x double>, ptr %vp
3855  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3856  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3857  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3858  ret <2 x double> %res
3859}
3860
; Zero-masked test: loads <4 x double> from %vp and extracts elements
; 2 and 1. Lanes where `fcmp oeq %mask, 0` holds take the extracted element;
; the remaining lanes are zero. The expected code builds the shuffle with
; vblendpd and applies the mask in a separate zero-masked vmovapd.
3861define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
3862; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
3863; CHECK:       # %bb.0:
3864; CHECK-NEXT:    vmovapd (%rdi), %xmm1
3865; CHECK-NEXT:    vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1]
3866; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
3867; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
3868; CHECK-NEXT:    vmovapd %xmm1, %xmm0 {%k1} {z}
3869; CHECK-NEXT:    retq
3870  %vec = load <4 x double>, ptr %vp
3871  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3872  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3873  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3874  ret <2 x double> %res
3875}
3876
; Merge-masked test: loads <4 x double> from %vp and extracts elements
; 2 and 0. Lanes where `fcmp oeq %mask, 0` holds take the extracted element;
; the remaining lanes take %vec2. The expected code loads the high 128 bits
; and folds both the low-half load and the mask into one vunpcklpd.
3877define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
3878; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
3879; CHECK:       # %bb.0:
3880; CHECK-NEXT:    vmovapd 16(%rdi), %xmm2
3881; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3882; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
3883; CHECK-NEXT:    vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
3884; CHECK-NEXT:    retq
3885  %vec = load <4 x double>, ptr %vp
3886  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3887  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3888  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3889  ret <2 x double> %res
3890}
3891
; Zero-masked test: loads <4 x double> from %vp and extracts elements
; 2 and 0. Lanes where `fcmp oeq %mask, 0` holds take the extracted element;
; the remaining lanes are zero. The expected code loads the high 128 bits
; and folds both the low-half load and the zeroing mask into one vunpcklpd.
3892define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
3893; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
3894; CHECK:       # %bb.0:
3895; CHECK-NEXT:    vmovapd 16(%rdi), %xmm1
3896; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
3897; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
3898; CHECK-NEXT:    vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
3899; CHECK-NEXT:    retq
3900  %vec = load <4 x double>, ptr %vp
3901  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3902  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3903  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3904  ret <2 x double> %res
3905}
3906
; Unmasked test: extracts elements 7, 3, 7, 3 of an <8 x double> register
; argument. The expected code is a variable-index vpermpd on zmm.
3907define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
3908; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
3909; CHECK:       # %bb.0:
3910; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,3,7,3]
3911; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
3912; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3913; CHECK-NEXT:    retq
3914  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3915  ret <4 x double> %res
3916}
; Merge-masked test: extracts elements 7, 3, 7, 3 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; take %vec2. The expected code is a merge-masked variable-index vpermpd.
3917define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3918; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
3919; CHECK:       # %bb.0:
3920; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
3921; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,3,7,3]
3922; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
3923; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
3924; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
3925; CHECK-NEXT:    vmovapd %ymm1, %ymm0
3926; CHECK-NEXT:    retq
3927  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3928  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3929  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3930  ret <4 x double> %res
3931}
3932
; Zero-masked test: extracts elements 7, 3, 7, 3 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; are zero. The expected code is a zero-masked variable-index vpermpd.
3933define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
3934; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
3935; CHECK:       # %bb.0:
3936; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,3,7,3]
3937; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3938; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
3939; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3940; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3941; CHECK-NEXT:    retq
3942  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3943  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3944  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3945  ret <4 x double> %res
3946}
; Merge-masked test: extracts elements 2, 0, 7, 6 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; take %vec2. The expected code is a merge-masked variable-index vpermpd.
3947define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3948; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
3949; CHECK:       # %bb.0:
3950; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
3951; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [2,0,7,6]
3952; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
3953; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
3954; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
3955; CHECK-NEXT:    vmovapd %ymm1, %ymm0
3956; CHECK-NEXT:    retq
3957  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3958  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3959  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3960  ret <4 x double> %res
3961}
3962
; Zero-masked test: extracts elements 2, 0, 7, 6 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; are zero. The expected code is a zero-masked variable-index vpermpd.
3963define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
3964; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
3965; CHECK:       # %bb.0:
3966; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [2,0,7,6]
3967; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3968; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
3969; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3970; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3971; CHECK-NEXT:    retq
3972  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3973  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3974  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3975  ret <4 x double> %res
3976}
; Merge-masked test: extracts elements 2, 3, 2, 0 of %vec, all of which lie
; in the low four elements. Lanes where `fcmp oeq %mask, 0` holds take the
; extracted element; the remaining lanes take %vec2. The expected code is a
; merge-masked immediate-form vpermpd on ymm, with no index vector.
3977define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3978; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
3979; CHECK:       # %bb.0:
3980; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3981; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
3982; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
3983; CHECK-NEXT:    vmovapd %ymm1, %ymm0
3984; CHECK-NEXT:    retq
3985  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3986  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3987  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3988  ret <4 x double> %res
3989}
3990
; Zero-masked test: extracts elements 2, 3, 2, 0 of %vec, all of which lie
; in the low four elements. Lanes where `fcmp oeq %mask, 0` holds take the
; extracted element; the remaining lanes are zero. The expected code is a
; zero-masked immediate-form vpermpd on ymm, with no index vector.
3991define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
3992; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
3993; CHECK:       # %bb.0:
3994; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
3995; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
3996; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
3997; CHECK-NEXT:    retq
3998  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3999  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4000  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4001  ret <4 x double> %res
4002}
; Unmasked test: extracts elements 0, 2, 1, 4 of an <8 x double> register
; argument. The expected code is a variable-index vpermpd on zmm.
4003define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
4004; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
4005; CHECK:       # %bb.0:
4006; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,1,4]
4007; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
4008; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4009; CHECK-NEXT:    retq
4010  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
4011  ret <4 x double> %res
4012}
; Merge-masked test: extracts elements 0, 2, 1, 4 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; take %vec2. The expected code is a merge-masked variable-index vpermpd.
4013define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4014; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
4015; CHECK:       # %bb.0:
4016; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
4017; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,2,1,4]
4018; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
4019; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
4020; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
4021; CHECK-NEXT:    vmovapd %ymm1, %ymm0
4022; CHECK-NEXT:    retq
4023  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
4024  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4025  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4026  ret <4 x double> %res
4027}
4028
; Zero-masked test: extracts elements 0, 2, 1, 4 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; are zero. The expected code is a zero-masked variable-index vpermpd.
4029define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
4030; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
4031; CHECK:       # %bb.0:
4032; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,2,1,4]
4033; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4034; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4035; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4036; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4037; CHECK-NEXT:    retq
4038  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
4039  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4040  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4041  ret <4 x double> %res
4042}
; Merge-masked test: extracts elements 1, 1, 5, 5 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; take %vec2. The two llc configurations expect different code for
; gathering elements 1 and 5: a variable-index vpermpd when cross-lane
; shuffles are fast, vextractf32x4 + vunpckhpd otherwise. Both finish with
; a merge-masked vpermpd [0,0,1,1].
4043define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4044; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
4045; CHECK-FAST:       # %bb.0:
4046; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [1,5]
4047; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
4048; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4049; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
4050; CHECK-FAST-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
4051; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
4052; CHECK-FAST-NEXT:    retq
4053;
4054; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
4055; CHECK-FAST-PERLANE:       # %bb.0:
4056; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
4057; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
4058; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4059; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
4060; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
4061; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0
4062; CHECK-FAST-PERLANE-NEXT:    retq
4063  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
4064  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4065  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4066  ret <4 x double> %res
4067}
4068
; Zero-masked test: extracts elements 1, 1, 5, 5 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; are zero. The two llc configurations expect different code for gathering
; elements 1 and 5: a variable-index vpermpd when cross-lane shuffles are
; fast, vextractf32x4 + vunpckhpd otherwise. Both finish with a zero-masked
; vpermpd [0,0,1,1].
4069define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
4070; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
4071; CHECK-FAST:       # %bb.0:
4072; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,5]
4073; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0
4074; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4075; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4076; CHECK-FAST-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
4077; CHECK-FAST-NEXT:    retq
4078;
4079; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
4080; CHECK-FAST-PERLANE:       # %bb.0:
4081; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
4082; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
4083; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4084; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4085; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
4086; CHECK-FAST-PERLANE-NEXT:    retq
4087  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
4088  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4089  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4090  ret <4 x double> %res
4091}
; Merge-masked test: extracts elements 2, 6, 2, 2 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; take %vec2. The expected code is a merge-masked variable-index vpermpd.
4092define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4093; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
4094; CHECK:       # %bb.0:
4095; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
4096; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [2,6,2,2]
4097; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
4098; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
4099; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
4100; CHECK-NEXT:    vmovapd %ymm1, %ymm0
4101; CHECK-NEXT:    retq
4102  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
4103  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4104  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4105  ret <4 x double> %res
4106}
4107
; Zero-masked test: extracts elements 2, 6, 2, 2 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; are zero. The expected code is a zero-masked variable-index vpermpd.
4108define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
4109; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
4110; CHECK:       # %bb.0:
4111; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [2,6,2,2]
4112; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4113; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4114; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4115; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4116; CHECK-NEXT:    retq
4117  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
4118  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4119  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4120  ret <4 x double> %res
4121}
; Unmasked test: extracts elements 5, 0, 7, 0 of an <8 x double> register
; argument. The two llc configurations expect different code: a
; variable-index vpermpd when cross-lane shuffles are fast, and
; vextractf64x4 + vbroadcastsd + vunpckhpd otherwise.
4122define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
4123; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
4124; CHECK-FAST:       # %bb.0:
4125; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0]
4126; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
4127; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4128; CHECK-FAST-NEXT:    retq
4129;
4130; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
4131; CHECK-FAST-PERLANE:       # %bb.0:
4132; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
4133; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd %xmm0, %ymm0
4134; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
4135; CHECK-FAST-PERLANE-NEXT:    retq
4136  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4137  ret <4 x double> %res
4138}
; Merge-masked test: extracts elements 5, 0, 7, 0 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; take %vec2. The two llc configurations expect different code: a
; merge-masked variable-index vpermpd when cross-lane shuffles are fast,
; and vextractf64x4 + vbroadcastsd + merge-masked vunpckhpd otherwise.
4139define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4140; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
4141; CHECK-FAST:       # %bb.0:
4142; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
4143; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0]
4144; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
4145; CHECK-FAST-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
4146; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
4147; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
4148; CHECK-FAST-NEXT:    retq
4149;
4150; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
4151; CHECK-FAST-PERLANE:       # %bb.0:
4152; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
4153; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd %xmm0, %ymm0
4154; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
4155; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
4156; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
4157; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0
4158; CHECK-FAST-PERLANE-NEXT:    retq
4159  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4160  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4161  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4162  ret <4 x double> %res
4163}
4164
; Zero-masked test: extracts elements 5, 0, 7, 0 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; are zero. The two llc configurations expect different code: a zero-masked
; variable-index vpermpd when cross-lane shuffles are fast, and
; vextractf64x4 + vbroadcastsd + zero-masked vunpckhpd otherwise.
4165define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
4166; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
4167; CHECK-FAST:       # %bb.0:
4168; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0]
4169; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4170; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4171; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4172; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4173; CHECK-FAST-NEXT:    retq
4174;
4175; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
4176; CHECK-FAST-PERLANE:       # %bb.0:
4177; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
4178; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd %xmm0, %ymm0
4179; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4180; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4181; CHECK-FAST-PERLANE-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
4182; CHECK-FAST-PERLANE-NEXT:    retq
4183  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4184  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4185  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4186  ret <4 x double> %res
4187}
; Merge-masked test: extracts elements 3, 5, 0, 6 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; take %vec2. The two llc configurations expect different code: a
; merge-masked variable-index vpermpd when cross-lane shuffles are fast,
; and vextractf64x4 + immediate vpermpd + merge-masked vshufpd otherwise.
4188define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4189; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
4190; CHECK-FAST:       # %bb.0:
4191; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
4192; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,5,0,6]
4193; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
4194; CHECK-FAST-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
4195; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
4196; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
4197; CHECK-FAST-NEXT:    retq
4198;
4199; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
4200; CHECK-FAST-PERLANE:       # %bb.0:
4201; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
4202; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
4203; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
4204; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
4205; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],ymm3[1],ymm0[2],ymm3[2]
4206; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0
4207; CHECK-FAST-PERLANE-NEXT:    retq
4208  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
4209  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4210  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4211  ret <4 x double> %res
4212}
4213
; Zero-masked test: extracts elements 3, 5, 0, 6 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; are zero. The two llc configurations expect different code: a zero-masked
; variable-index vpermpd when cross-lane shuffles are fast, and
; vextractf64x4 + immediate vpermpd + zero-masked vshufpd otherwise.
4214define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
4215; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
4216; CHECK-FAST:       # %bb.0:
4217; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [3,5,0,6]
4218; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4219; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4220; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4221; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4222; CHECK-FAST-NEXT:    retq
4223;
4224; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
4225; CHECK-FAST-PERLANE:       # %bb.0:
4226; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
4227; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
4228; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4229; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4230; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm2[1],ymm0[2],ymm2[2]
4231; CHECK-FAST-PERLANE-NEXT:    retq
4232  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
4233  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4234  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4235  ret <4 x double> %res
4236}
; Unmasked test: extracts elements 0 and 6 of an <8 x double> register
; argument. The two llc configurations expect different code: a
; variable-index vpermpd when cross-lane shuffles are fast, and
; vextractf32x4 + vmovlhps otherwise.
4237define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
4238; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
4239; CHECK-FAST:       # %bb.0:
4240; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,6]
4241; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
4242; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
4243; CHECK-FAST-NEXT:    vzeroupper
4244; CHECK-FAST-NEXT:    retq
4245;
4246; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
4247; CHECK-FAST-PERLANE:       # %bb.0:
4248; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
4249; CHECK-FAST-PERLANE-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4250; CHECK-FAST-PERLANE-NEXT:    vzeroupper
4251; CHECK-FAST-PERLANE-NEXT:    retq
4252  %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4253  ret <2 x double> %res
4254}
; Merge-masked test: extracts elements 0 and 6 of %vec. Lanes where
; `fcmp oeq %mask, 0` holds take the extracted element; the remaining lanes
; take %vec2. The two llc configurations expect different code: a
; merge-masked variable-index vpermpd when cross-lane shuffles are fast,
; and vextractf32x4 + merge-masked vunpcklpd otherwise.
4255define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
4256; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
4257; CHECK-FAST:       # %bb.0:
4258; CHECK-FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
4259; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [0,6]
4260; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
4261; CHECK-FAST-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
4262; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
4263; CHECK-FAST-NEXT:    vmovapd %xmm1, %xmm0
4264; CHECK-FAST-NEXT:    vzeroupper
4265; CHECK-FAST-NEXT:    retq
4266;
4267; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
4268; CHECK-FAST-PERLANE:       # %bb.0:
4269; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $3, %zmm0, %xmm3
4270; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
4271; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
4272; CHECK-FAST-PERLANE-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
4273; CHECK-FAST-PERLANE-NEXT:    vmovapd %xmm1, %xmm0
4274; CHECK-FAST-PERLANE-NEXT:    vzeroupper
4275; CHECK-FAST-PERLANE-NEXT:    retq
4276  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4277  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4278  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4279  ret <2 x double> %res
4280}
4281
; Zero-masked register shuffle: lanes where %mask is ordered-equal to 0.0
; receive elements <0, 6> of %vec; all other lanes are zero.
; The two RUN configurations have separate expected-output blocks below.
4282define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
4283; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
4284; CHECK-FAST:       # %bb.0:
4285; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,6]
4286; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4287; CHECK-FAST-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
4288; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4289; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
4290; CHECK-FAST-NEXT:    vzeroupper
4291; CHECK-FAST-NEXT:    retq
4292;
4293; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
4294; CHECK-FAST-PERLANE:       # %bb.0:
4295; CHECK-FAST-PERLANE-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
4296; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4297; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
4298; CHECK-FAST-PERLANE-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
4299; CHECK-FAST-PERLANE-NEXT:    vzeroupper
4300; CHECK-FAST-PERLANE-NEXT:    retq
4301  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4302  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4303  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4304  ret <2 x double> %res
4305}
; Merge-masked register shuffle: lanes where %mask is ordered-equal to 0.0
; receive elements <3, 7> of %vec; all other lanes keep the value from %vec2.
; Both RUN configurations share the single expected-output block below.
4306define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
4307; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
4308; CHECK:       # %bb.0:
4309; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
4310; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,7]
4311; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
4312; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
4313; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
4314; CHECK-NEXT:    vmovapd %xmm1, %xmm0
4315; CHECK-NEXT:    vzeroupper
4316; CHECK-NEXT:    retq
4317  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4318  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4319  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4320  ret <2 x double> %res
4321}
4322
; Zero-masked register shuffle: lanes where %mask is ordered-equal to 0.0
; receive elements <3, 7> of %vec; all other lanes are zero.
; Both RUN configurations share the single expected-output block below.
4323define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
4324; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
4325; CHECK:       # %bb.0:
4326; CHECK-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [3,7]
4327; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4328; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
4329; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4330; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
4331; CHECK-NEXT:    vzeroupper
4332; CHECK-NEXT:    retq
4333  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4334  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4335  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4336  ret <2 x double> %res
4337}
; Unmasked shuffle with a memory source: loads <8 x double> from %vp and
; returns elements <1, 6, 7, 2> as a <4 x double>.
; Both RUN configurations share the single expected-output block below.
4338define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) {
4339; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
4340; CHECK:       # %bb.0:
4341; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [1,6,7,2]
4342; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
4343; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4344; CHECK-NEXT:    retq
4345  %vec = load <8 x double>, ptr %vp
4346  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4347  ret <4 x double> %res
4348}
; Merge-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <1, 6, 7, 2> of
; the loaded vector, all other lanes keep the value from %vec2.
4349define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4350; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
4351; CHECK:       # %bb.0:
4352; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4353; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [1,6,7,2]
4354; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4355; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4356; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
4357; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4358; CHECK-NEXT:    retq
4359  %vec = load <8 x double>, ptr %vp
4360  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4361  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4362  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4363  ret <4 x double> %res
4364}
4365
; Zero-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <1, 6, 7, 2> of
; the loaded vector, all other lanes are zero.
4366define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %mask) {
4367; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
4368; CHECK:       # %bb.0:
4369; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [1,6,7,2]
4370; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4371; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
4372; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
4373; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4374; CHECK-NEXT:    retq
4375  %vec = load <8 x double>, ptr %vp
4376  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4377  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4378  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4379  ret <4 x double> %res
4380}
4381
; Merge-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <3, 4, 2, 4> of
; the loaded vector, all other lanes keep the value from %vec2.
; The two RUN configurations have separate expected-output blocks below.
4382define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4383; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4384; CHECK-FAST:       # %bb.0:
4385; CHECK-FAST-NEXT:    vbroadcastsd 32(%rdi), %ymm2
4386; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,0,6,2]
4387; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
4388; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4389; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4390; CHECK-FAST-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
4391; CHECK-FAST-NEXT:    retq
4392;
4393; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4394; CHECK-FAST-PERLANE:       # %bb.0:
4395; CHECK-FAST-PERLANE-NEXT:    vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3]
4396; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4397; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4398; CHECK-FAST-PERLANE-NEXT:    vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4399; CHECK-FAST-PERLANE-NEXT:    retq
4400  %vec = load <8 x double>, ptr %vp
4401  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4402  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4403  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4404  ret <4 x double> %res
4405}
4406
; Zero-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <3, 4, 2, 4> of
; the loaded vector, all other lanes are zero.
; The two RUN configurations have separate expected-output blocks below.
4407define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) {
4408; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4409; CHECK-FAST:       # %bb.0:
4410; CHECK-FAST-NEXT:    vbroadcastsd 32(%rdi), %ymm2
4411; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,0,6,2]
4412; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4413; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
4414; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4415; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
4416; CHECK-FAST-NEXT:    retq
4417;
4418; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4419; CHECK-FAST-PERLANE:       # %bb.0:
4420; CHECK-FAST-PERLANE-NEXT:    vpermpd $236, (%rdi), %ymm1 # ymm1 = mem[0,3,2,3]
4421; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4422; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
4423; CHECK-FAST-PERLANE-NEXT:    vshufpd $1, 32(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4424; CHECK-FAST-PERLANE-NEXT:    retq
4425  %vec = load <8 x double>, ptr %vp
4426  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4427  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4428  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4429  ret <4 x double> %res
4430}
4431
; Merge-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <1, 2, 3, 4> of
; the loaded vector, all other lanes keep the value from %vec2.
; The two RUN configurations have separate expected-output blocks below.
4432define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4433; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4434; CHECK-FAST:       # %bb.0:
4435; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4436; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [1,2,3,4]
4437; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4438; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4439; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
4440; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4441; CHECK-FAST-NEXT:    retq
4442;
4443; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4444; CHECK-FAST-PERLANE:       # %bb.0:
4445; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %ymm2
4446; CHECK-FAST-PERLANE-NEXT:    vperm2f128 $33, 32(%rdi), %ymm2, %ymm3 # ymm3 = ymm2[2,3],mem[0,1]
4447; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
4448; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
4449; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
4450; CHECK-FAST-PERLANE-NEXT:    retq
4451  %vec = load <8 x double>, ptr %vp
4452  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4453  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4454  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4455  ret <4 x double> %res
4456}
4457
; Zero-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <1, 2, 3, 4> of
; the loaded vector, all other lanes are zero.
; The two RUN configurations have separate expected-output blocks below.
4458define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) {
4459; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4460; CHECK-FAST:       # %bb.0:
4461; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [1,2,3,4]
4462; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4463; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
4464; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
4465; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4466; CHECK-FAST-NEXT:    retq
4467;
4468; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4469; CHECK-FAST-PERLANE:       # %bb.0:
4470; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %ymm1
4471; CHECK-FAST-PERLANE-NEXT:    vperm2f128 $33, 32(%rdi), %ymm1, %ymm2 # ymm2 = ymm1[2,3],mem[0,1]
4472; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4473; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
4474; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
4475; CHECK-FAST-PERLANE-NEXT:    retq
4476  %vec = load <8 x double>, ptr %vp
4477  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4478  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4479  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4480  ret <4 x double> %res
4481}
4482
; Unmasked shuffle with a memory source: loads <8 x double> from %vp and
; returns elements <4, 2, 1, 0> as a <4 x double>.
; Both RUN configurations share the single expected-output block below.
4483define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
4484; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
4485; CHECK:       # %bb.0:
4486; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0]
4487; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
4488; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4489; CHECK-NEXT:    retq
4490  %vec = load <8 x double>, ptr %vp
4491  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4492  ret <4 x double> %res
4493}
; Merge-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <4, 2, 1, 0> of
; the loaded vector, all other lanes keep the value from %vec2.
4494define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4495; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
4496; CHECK:       # %bb.0:
4497; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4498; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [4,2,1,0]
4499; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4500; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4501; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
4502; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4503; CHECK-NEXT:    retq
4504  %vec = load <8 x double>, ptr %vp
4505  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4506  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4507  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4508  ret <4 x double> %res
4509}
4510
; Zero-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <4, 2, 1, 0> of
; the loaded vector, all other lanes are zero.
4511define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) {
4512; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
4513; CHECK:       # %bb.0:
4514; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0]
4515; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4516; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
4517; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
4518; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
4519; CHECK-NEXT:    retq
4520  %vec = load <8 x double>, ptr %vp
4521  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4522  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4523  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4524  ret <4 x double> %res
4525}
4526
; Merge-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <6, 0, 5, 1> of
; the loaded vector, all other lanes keep the value from %vec2.
4527define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4528; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
4529; CHECK:       # %bb.0:
4530; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
4531; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [2,4,1,5]
4532; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
4533; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4534; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4535; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
4536; CHECK-NEXT:    retq
4537  %vec = load <8 x double>, ptr %vp
4538  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4539  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4540  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4541  ret <4 x double> %res
4542}
4543
; Zero-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <6, 0, 5, 1> of
; the loaded vector, all other lanes are zero.
4544define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %mask) {
4545; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
4546; CHECK:       # %bb.0:
4547; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
4548; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [2,4,1,5]
4549; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4550; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
4551; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4552; CHECK-NEXT:    vmovapd %ymm1, %ymm0
4553; CHECK-NEXT:    retq
4554  %vec = load <8 x double>, ptr %vp
4555  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4556  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4557  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4558  ret <4 x double> %res
4559}
4560
; Merge-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <2, 5, 5, 5> of
; the loaded vector, all other lanes keep the value from %vec2.
4561define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4562; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
4563; CHECK:       # %bb.0:
4564; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4565; CHECK-NEXT:    vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1]
4566; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4567; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4568; CHECK-NEXT:    vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4569; CHECK-NEXT:    retq
4570  %vec = load <8 x double>, ptr %vp
4571  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4572  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4573  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4574  ret <4 x double> %res
4575}
4576
; Zero-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <2, 5, 5, 5> of
; the loaded vector, all other lanes are zero.
4577define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %mask) {
4578; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
4579; CHECK:       # %bb.0:
4580; CHECK-NEXT:    vmovapd (%rdi), %ymm1
4581; CHECK-NEXT:    vperm2f128 $33, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[0,1]
4582; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4583; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
4584; CHECK-NEXT:    vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4585; CHECK-NEXT:    retq
4586  %vec = load <8 x double>, ptr %vp
4587  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4588  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4589  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4590  ret <4 x double> %res
4591}
4592
; Unmasked shuffle with a memory source: loads <8 x double> from %vp and
; returns elements <4, 6, 0, 5> as a <4 x double>.
; Both RUN configurations share the single expected-output block below.
4593define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
4594; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
4595; CHECK:       # %bb.0:
4596; CHECK-NEXT:    vmovapd 32(%rdi), %ymm1
4597; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,2,4,1]
4598; CHECK-NEXT:    vpermi2pd (%rdi), %ymm1, %ymm0
4599; CHECK-NEXT:    retq
4600  %vec = load <8 x double>, ptr %vp
4601  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4602  ret <4 x double> %res
4603}
; Merge-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <4, 6, 0, 5> of
; the loaded vector, all other lanes keep the value from %vec2.
4604define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4605; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
4606; CHECK:       # %bb.0:
4607; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
4608; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,2,4,1]
4609; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
4610; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4611; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4612; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
4613; CHECK-NEXT:    retq
4614  %vec = load <8 x double>, ptr %vp
4615  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4616  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4617  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4618  ret <4 x double> %res
4619}
4620
; Zero-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <4, 6, 0, 5> of
; the loaded vector, all other lanes are zero.
4621define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) {
4622; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
4623; CHECK:       # %bb.0:
4624; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
4625; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,1]
4626; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4627; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
4628; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4629; CHECK-NEXT:    vmovapd %ymm1, %ymm0
4630; CHECK-NEXT:    retq
4631  %vec = load <8 x double>, ptr %vp
4632  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4633  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4634  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4635  ret <4 x double> %res
4636}
4637
; Merge-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <0, 5, 2, 5> of
; the loaded vector, all other lanes keep the value from %vec2.
4638define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4639; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
4640; CHECK:       # %bb.0:
4641; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4642; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4643; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4644; CHECK-NEXT:    vunpcklpd 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4645; CHECK-NEXT:    retq
4646  %vec = load <8 x double>, ptr %vp
4647  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4648  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4649  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4650  ret <4 x double> %res
4651}
4652
; Zero-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <0, 5, 2, 5> of
; the loaded vector, all other lanes are zero.
4653define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp, <4 x double> %mask) {
4654; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
4655; CHECK:       # %bb.0:
4656; CHECK-NEXT:    vmovapd (%rdi), %ymm1
4657; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4658; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
4659; CHECK-NEXT:    vunpcklpd 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4660; CHECK-NEXT:    retq
4661  %vec = load <8 x double>, ptr %vp
4662  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4663  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4664  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4665  ret <4 x double> %res
4666}
4667
; Unmasked shuffle with a memory source: loads <8 x double> from %vp and
; returns elements <1, 6> as a <2 x double>.
; Both RUN configurations share the single expected-output block below.
4668define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
4669; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
4670; CHECK:       # %bb.0:
4671; CHECK-NEXT:    vmovapd (%rdi), %xmm0
4672; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
4673; CHECK-NEXT:    retq
4674  %vec = load <8 x double>, ptr %vp
4675  %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4676  ret <2 x double> %res
4677}
; Merge-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <1, 6> of the
; loaded vector, all other lanes keep the value from %vec2.
4678define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
4679; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
4680; CHECK:       # %bb.0:
4681; CHECK-NEXT:    vmovapd (%rdi), %xmm2
4682; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4683; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
4684; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
4685; CHECK-NEXT:    retq
4686  %vec = load <8 x double>, ptr %vp
4687  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4688  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4689  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4690  ret <2 x double> %res
4691}
4692
; Zero-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <1, 6> of the
; loaded vector, all other lanes are zero.
4693define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
4694; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
4695; CHECK:       # %bb.0:
4696; CHECK-NEXT:    vmovapd (%rdi), %xmm1
4697; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4698; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
4699; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
4700; CHECK-NEXT:    retq
4701  %vec = load <8 x double>, ptr %vp
4702  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4703  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4704  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4705  ret <2 x double> %res
4706}
4707
; Merge-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <1, 4> of the
; loaded vector, all other lanes keep the value from %vec2.
4708define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
4709; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
4710; CHECK:       # %bb.0:
4711; CHECK-NEXT:    vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
4712; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4713; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
4714; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
4715; CHECK-NEXT:    retq
4716  %vec = load <8 x double>, ptr %vp
4717  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
4718  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4719  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4720  ret <2 x double> %res
4721}
4722
; Zero-masked shuffle with a memory source: loads <8 x double> from %vp;
; lanes where %mask is ordered-equal to 0.0 receive elements <1, 4> of the
; loaded vector, all other lanes are zero.
4723define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
4724; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
4725; CHECK:       # %bb.0:
4726; CHECK-NEXT:    vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
4727; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4728; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
4729; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
4730; CHECK-NEXT:    retq
4731  %vec = load <8 x double>, ptr %vp
4732  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
4733  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4734  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4735  ret <2 x double> %res
4736}
4737
4738; PR35977
; Loads <8 x i8> from %arg, zero-extends each element to i16 one lane at a
; time (extractelement / zext / insertelement, lanes 0 through 7), shifts every
; lane left by 8 and stores the <8 x i16> result to %arg1.
; The expected output below is a single vpmovzxbw load followed by vpsllw,
; i.e. the scalarised element chain is expected to be recognised as one
; vector zero-extension.
4739define void @test_zext_v8i8_to_v8i16(ptr %arg, ptr %arg1) {
4740; CHECK-LABEL: test_zext_v8i8_to_v8i16:
4741; CHECK:       # %bb.0:
4742; CHECK-NEXT:    vpmovzxbw (%rdi), %xmm0 # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
4743; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
4744; CHECK-NEXT:    vmovdqa %xmm0, (%rsi)
4745; CHECK-NEXT:    retq
4746  %tmp2 = load <8 x i8>, ptr %arg
4747  %tmp3 = extractelement <8 x i8> %tmp2, i32 0
4748  %tmp4 = zext i8 %tmp3 to i16
4749  %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
4750  %tmp6 = extractelement <8 x i8> %tmp2, i32 1
4751  %tmp7 = zext i8 %tmp6 to i16
4752  %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
4753  %tmp9 = extractelement <8 x i8> %tmp2, i32 2
4754  %tmp10 = zext i8 %tmp9 to i16
4755  %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
4756  %tmp12 = extractelement <8 x i8> %tmp2, i32 3
4757  %tmp13 = zext i8 %tmp12 to i16
4758  %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
4759  %tmp15 = extractelement <8 x i8> %tmp2, i32 4
4760  %tmp16 = zext i8 %tmp15 to i16
4761  %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
4762  %tmp18 = extractelement <8 x i8> %tmp2, i32 5
4763  %tmp19 = zext i8 %tmp18 to i16
4764  %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
4765  %tmp21 = extractelement <8 x i8> %tmp2, i32 6
4766  %tmp22 = zext i8 %tmp21 to i16
4767  %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
4768  %tmp24 = extractelement <8 x i8> %tmp2, i32 7
4769  %tmp25 = zext i8 %tmp24 to i16
4770  %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
4771  %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
4772  store <8 x i16> %tmp27, ptr %arg1
4773  ret void
4774}
4775