xref: /llvm-project/llvm/test/CodeGen/X86/vector-compress.ll (revision 6b1db79887df19bc8e8c946108966aa6021c8b87)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -mtriple=x86_64 -mattr=+avx2                           < %s | FileCheck %s --check-prefixes=CHECK,AVX2
3; RUN: llc -mtriple=x86_64 -mattr=+avx512f                        < %s | FileCheck %s --check-prefixes=CHECK,AVX512F
4; RUN: llc -mtriple=x86_64 -mattr=+avx512f,+avx512vl,+avx512vbmi2 < %s | FileCheck %s --check-prefixes=CHECK,AVX512VL
5
6define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind {
7; AVX2-LABEL: test_compress_v4i32:
8; AVX2:       # %bb.0:
9; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
10; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
11; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
12; AVX2-NEXT:    vpextrd $1, %xmm1, %eax
13; AVX2-NEXT:    vmovd %xmm1, %esi
14; AVX2-NEXT:    andl $1, %esi
15; AVX2-NEXT:    movl %esi, %edi
16; AVX2-NEXT:    subl %eax, %edi
17; AVX2-NEXT:    vpextrd $2, %xmm1, %edx
18; AVX2-NEXT:    subl %edx, %edi
19; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
20; AVX2-NEXT:    subl %ecx, %edi
21; AVX2-NEXT:    andl $3, %edi
22; AVX2-NEXT:    andl $1, %eax
23; AVX2-NEXT:    addq %rsi, %rax
24; AVX2-NEXT:    andl $1, %edx
25; AVX2-NEXT:    addq %rax, %rdx
26; AVX2-NEXT:    andl $1, %ecx
27; AVX2-NEXT:    addq %rdx, %rcx
28; AVX2-NEXT:    vextractps $3, %xmm0, %r8d
29; AVX2-NEXT:    cmpq $4, %rcx
30; AVX2-NEXT:    cmovbl -24(%rsp,%rdi,4), %r8d
31; AVX2-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
32; AVX2-NEXT:    vextractps $1, %xmm0, -24(%rsp,%rsi,4)
33; AVX2-NEXT:    vextractps $2, %xmm0, -24(%rsp,%rax,4)
34; AVX2-NEXT:    andl $3, %edx
35; AVX2-NEXT:    vextractps $3, %xmm0, -24(%rsp,%rdx,4)
36; AVX2-NEXT:    cmpq $3, %rcx
37; AVX2-NEXT:    movl $3, %eax
38; AVX2-NEXT:    cmovbq %rcx, %rax
39; AVX2-NEXT:    movl %r8d, -24(%rsp,%rax,4)
40; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
41; AVX2-NEXT:    retq
42;
43; AVX512F-LABEL: test_compress_v4i32:
44; AVX512F:       # %bb.0:
45; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
46; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
47; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
48; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
49; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
50; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
51; AVX512F-NEXT:    vpcompressd %zmm0, %zmm2 {%k1}
52; AVX512F-NEXT:    vmovdqa %xmm2, %xmm0
53; AVX512F-NEXT:    vzeroupper
54; AVX512F-NEXT:    retq
55;
56; AVX512VL-LABEL: test_compress_v4i32:
57; AVX512VL:       # %bb.0:
58; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
59; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
60; AVX512VL-NEXT:    vpcompressd %xmm0, %xmm2 {%k1}
61; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
62; AVX512VL-NEXT:    retq
63    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru)
64    ret <4 x i32> %out
65}
66
67define <4 x float> @test_compress_v4f32(<4 x float> %vec, <4 x i1> %mask, <4 x float> %passthru) nounwind {
68; AVX2-LABEL: test_compress_v4f32:
69; AVX2:       # %bb.0:
70; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
71; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
72; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
73; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
74; AVX2-NEXT:    vmovd %xmm1, %esi
75; AVX2-NEXT:    andl $1, %esi
76; AVX2-NEXT:    movl %esi, %edi
77; AVX2-NEXT:    subl %edx, %edi
78; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
79; AVX2-NEXT:    subl %ecx, %edi
80; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
81; AVX2-NEXT:    subl %eax, %edi
82; AVX2-NEXT:    andl $3, %edi
83; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
84; AVX2-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
85; AVX2-NEXT:    vextractps $1, %xmm0, -24(%rsp,%rsi,4)
86; AVX2-NEXT:    andl $1, %edx
87; AVX2-NEXT:    addq %rsi, %rdx
88; AVX2-NEXT:    vextractps $2, %xmm0, -24(%rsp,%rdx,4)
89; AVX2-NEXT:    andl $1, %ecx
90; AVX2-NEXT:    addq %rdx, %rcx
91; AVX2-NEXT:    andl $1, %eax
92; AVX2-NEXT:    addq %rcx, %rax
93; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
94; AVX2-NEXT:    andl $3, %ecx
95; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
96; AVX2-NEXT:    vmovss %xmm0, -24(%rsp,%rcx,4)
97; AVX2-NEXT:    cmpq $3, %rax
98; AVX2-NEXT:    movl $3, %ecx
99; AVX2-NEXT:    cmovbq %rax, %rcx
100; AVX2-NEXT:    ja .LBB1_2
101; AVX2-NEXT:  # %bb.1:
102; AVX2-NEXT:    vmovaps %xmm1, %xmm0
103; AVX2-NEXT:  .LBB1_2:
104; AVX2-NEXT:    vmovss %xmm0, -24(%rsp,%rcx,4)
105; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
106; AVX2-NEXT:    retq
107;
108; AVX512F-LABEL: test_compress_v4f32:
109; AVX512F:       # %bb.0:
110; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
111; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
112; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
113; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
114; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
115; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
116; AVX512F-NEXT:    vcompressps %zmm0, %zmm2 {%k1}
117; AVX512F-NEXT:    vmovdqa %xmm2, %xmm0
118; AVX512F-NEXT:    vzeroupper
119; AVX512F-NEXT:    retq
120;
121; AVX512VL-LABEL: test_compress_v4f32:
122; AVX512VL:       # %bb.0:
123; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
124; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
125; AVX512VL-NEXT:    vcompressps %xmm0, %xmm2 {%k1}
126; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
127; AVX512VL-NEXT:    retq
128    %out = call <4 x float> @llvm.experimental.vector.compress(<4 x float> %vec, <4 x i1> %mask, <4 x float> %passthru)
129    ret <4 x float> %out
130}
131
132define <2 x i64> @test_compress_v2i64(<2 x i64> %vec, <2 x i1> %mask, <2 x i64> %passthru) nounwind {
133; AVX2-LABEL: test_compress_v2i64:
134; AVX2:       # %bb.0:
135; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
136; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
137; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
138; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
139; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
140; AVX2-NEXT:    vmovq %xmm1, %rcx
141; AVX2-NEXT:    movl %ecx, %edx
142; AVX2-NEXT:    subl %eax, %edx
143; AVX2-NEXT:    andl $1, %edx
144; AVX2-NEXT:    andl $1, %eax
145; AVX2-NEXT:    andl $1, %ecx
146; AVX2-NEXT:    addq %rcx, %rax
147; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
148; AVX2-NEXT:    cmpq $2, %rax
149; AVX2-NEXT:    cmovbq -24(%rsp,%rdx,8), %rsi
150; AVX2-NEXT:    vmovq %xmm0, -{{[0-9]+}}(%rsp)
151; AVX2-NEXT:    movl %ecx, %ecx
152; AVX2-NEXT:    vpextrq $1, %xmm0, -24(%rsp,%rcx,8)
153; AVX2-NEXT:    cmpq $1, %rax
154; AVX2-NEXT:    movl $1, %ecx
155; AVX2-NEXT:    cmovbq %rax, %rcx
156; AVX2-NEXT:    movq %rsi, -24(%rsp,%rcx,8)
157; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
158; AVX2-NEXT:    retq
159;
160; AVX512F-LABEL: test_compress_v2i64:
161; AVX512F:       # %bb.0:
162; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
163; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
164; AVX512F-NEXT:    vpsllq $63, %xmm1, %xmm1
165; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k0
166; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
167; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
168; AVX512F-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
169; AVX512F-NEXT:    vmovdqa %xmm2, %xmm0
170; AVX512F-NEXT:    vzeroupper
171; AVX512F-NEXT:    retq
172;
173; AVX512VL-LABEL: test_compress_v2i64:
174; AVX512VL:       # %bb.0:
175; AVX512VL-NEXT:    vpsllq $63, %xmm1, %xmm1
176; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
177; AVX512VL-NEXT:    vpcompressq %xmm0, %xmm2 {%k1}
178; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
179; AVX512VL-NEXT:    retq
180    %out = call <2 x i64> @llvm.experimental.vector.compress(<2 x i64> %vec, <2 x i1> %mask, <2 x i64> %passthru)
181    ret <2 x i64> %out
182}
183
184define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> %passthru) nounwind {
185; AVX2-LABEL: test_compress_v2f64:
186; AVX2:       # %bb.0:
187; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
188; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
189; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
190; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
191; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
192; AVX2-NEXT:    vmovq %xmm1, %rcx
193; AVX2-NEXT:    movl %ecx, %edx
194; AVX2-NEXT:    subl %eax, %edx
195; AVX2-NEXT:    andl $1, %edx
196; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
197; AVX2-NEXT:    vmovlpd %xmm0, -{{[0-9]+}}(%rsp)
198; AVX2-NEXT:    andl $1, %ecx
199; AVX2-NEXT:    movl %ecx, %edx
200; AVX2-NEXT:    vmovhpd %xmm0, -24(%rsp,%rdx,8)
201; AVX2-NEXT:    andl $1, %eax
202; AVX2-NEXT:    addq %rcx, %rax
203; AVX2-NEXT:    cmpq $2, %rax
204; AVX2-NEXT:    jb .LBB3_2
205; AVX2-NEXT:  # %bb.1:
206; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
207; AVX2-NEXT:  .LBB3_2:
208; AVX2-NEXT:    cmpq $1, %rax
209; AVX2-NEXT:    movl $1, %ecx
210; AVX2-NEXT:    cmovbq %rax, %rcx
211; AVX2-NEXT:    vmovsd %xmm1, -24(%rsp,%rcx,8)
212; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
213; AVX2-NEXT:    retq
214;
215; AVX512F-LABEL: test_compress_v2f64:
216; AVX512F:       # %bb.0:
217; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
218; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
219; AVX512F-NEXT:    vpsllq $63, %xmm1, %xmm1
220; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k0
221; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
222; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
223; AVX512F-NEXT:    vcompresspd %zmm0, %zmm2 {%k1}
224; AVX512F-NEXT:    vmovdqa %xmm2, %xmm0
225; AVX512F-NEXT:    vzeroupper
226; AVX512F-NEXT:    retq
227;
228; AVX512VL-LABEL: test_compress_v2f64:
229; AVX512VL:       # %bb.0:
230; AVX512VL-NEXT:    vpsllq $63, %xmm1, %xmm1
231; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
232; AVX512VL-NEXT:    vcompresspd %xmm0, %xmm2 {%k1}
233; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
234; AVX512VL-NEXT:    retq
235    %out = call <2 x double> @llvm.experimental.vector.compress(<2 x double> %vec, <2 x i1> %mask, <2 x double> %passthru)
236    ret <2 x double> %out
237}
238
239define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> %passthru) nounwind {
240; AVX2-LABEL: test_compress_v8i32:
241; AVX2:       # %bb.0:
242; AVX2-NEXT:    pushq %rbp
243; AVX2-NEXT:    movq %rsp, %rbp
244; AVX2-NEXT:    pushq %rbx
245; AVX2-NEXT:    andq $-32, %rsp
246; AVX2-NEXT:    subq $64, %rsp
247; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
248; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
249; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm3
250; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
251; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm1
252; AVX2-NEXT:    vpackssdw %xmm1, %xmm3, %xmm2
253; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
254; AVX2-NEXT:    vpslld $31, %ymm2, %ymm2
255; AVX2-NEXT:    vpsrld $31, %ymm2, %ymm2
256; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
257; AVX2-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
258; AVX2-NEXT:    vpextrd $1, %xmm2, %eax
259; AVX2-NEXT:    vmovd %xmm2, %ecx
260; AVX2-NEXT:    addl %eax, %ecx
261; AVX2-NEXT:    vpextrd $2, %xmm2, %edx
262; AVX2-NEXT:    vpextrd $3, %xmm2, %eax
263; AVX2-NEXT:    addl %edx, %eax
264; AVX2-NEXT:    addl %ecx, %eax
265; AVX2-NEXT:    andl $7, %eax
266; AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
267; AVX2-NEXT:    andl $1, %ecx
268; AVX2-NEXT:    vmovd %xmm3, %edx
269; AVX2-NEXT:    andl $1, %edx
270; AVX2-NEXT:    addq %rdx, %rcx
271; AVX2-NEXT:    vpextrd $2, %xmm3, %esi
272; AVX2-NEXT:    andl $1, %esi
273; AVX2-NEXT:    addq %rcx, %rsi
274; AVX2-NEXT:    vpextrd $3, %xmm3, %edi
275; AVX2-NEXT:    andl $1, %edi
276; AVX2-NEXT:    addq %rsi, %rdi
277; AVX2-NEXT:    vmovd %xmm1, %r8d
278; AVX2-NEXT:    andl $1, %r8d
279; AVX2-NEXT:    addq %rdi, %r8
280; AVX2-NEXT:    vpextrd $1, %xmm1, %r9d
281; AVX2-NEXT:    andl $1, %r9d
282; AVX2-NEXT:    addq %r8, %r9
283; AVX2-NEXT:    vpextrd $2, %xmm1, %r10d
284; AVX2-NEXT:    andl $1, %r10d
285; AVX2-NEXT:    addq %r9, %r10
286; AVX2-NEXT:    vpextrd $3, %xmm1, %r11d
287; AVX2-NEXT:    andl $1, %r11d
288; AVX2-NEXT:    addq %r10, %r11
289; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
290; AVX2-NEXT:    vextractps $3, %xmm1, %ebx
291; AVX2-NEXT:    cmpq $8, %r11
292; AVX2-NEXT:    cmovbl (%rsp,%rax,4), %ebx
293; AVX2-NEXT:    vmovss %xmm0, (%rsp)
294; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
295; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
296; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rsi,4)
297; AVX2-NEXT:    andl $7, %edi
298; AVX2-NEXT:    vmovss %xmm1, (%rsp,%rdi,4)
299; AVX2-NEXT:    andl $7, %r8d
300; AVX2-NEXT:    vextractps $1, %xmm1, (%rsp,%r8,4)
301; AVX2-NEXT:    andl $7, %r9d
302; AVX2-NEXT:    vextractps $2, %xmm1, (%rsp,%r9,4)
303; AVX2-NEXT:    andl $7, %r10d
304; AVX2-NEXT:    vextractps $3, %xmm1, (%rsp,%r10,4)
305; AVX2-NEXT:    cmpq $7, %r11
306; AVX2-NEXT:    movl $7, %eax
307; AVX2-NEXT:    cmovbq %r11, %rax
308; AVX2-NEXT:    movl %eax, %eax
309; AVX2-NEXT:    movl %ebx, (%rsp,%rax,4)
310; AVX2-NEXT:    vmovaps (%rsp), %ymm0
311; AVX2-NEXT:    leaq -8(%rbp), %rsp
312; AVX2-NEXT:    popq %rbx
313; AVX2-NEXT:    popq %rbp
314; AVX2-NEXT:    retq
315;
316; AVX512F-LABEL: test_compress_v8i32:
317; AVX512F:       # %bb.0:
318; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
319; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
320; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
321; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
322; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
323; AVX512F-NEXT:    vpcompressd %zmm0, %zmm2 {%k1}
324; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
325; AVX512F-NEXT:    retq
326;
327; AVX512VL-LABEL: test_compress_v8i32:
328; AVX512VL:       # %bb.0:
329; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
330; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
331; AVX512VL-NEXT:    vpcompressd %ymm0, %ymm2 {%k1}
332; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
333; AVX512VL-NEXT:    retq
334    %out = call <8 x i32> @llvm.experimental.vector.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> %passthru)
335    ret <8 x i32> %out
336}
337
338define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x float> %passthru) nounwind {
339; AVX2-LABEL: test_compress_v8f32:
340; AVX2:       # %bb.0:
341; AVX2-NEXT:    pushq %rbp
342; AVX2-NEXT:    movq %rsp, %rbp
343; AVX2-NEXT:    andq $-32, %rsp
344; AVX2-NEXT:    subq $64, %rsp
345; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
346; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
347; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm3
348; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
349; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm1
350; AVX2-NEXT:    vpackssdw %xmm1, %xmm3, %xmm2
351; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
352; AVX2-NEXT:    vpslld $31, %ymm2, %ymm2
353; AVX2-NEXT:    vpsrld $31, %ymm2, %ymm2
354; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
355; AVX2-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
356; AVX2-NEXT:    vpextrd $1, %xmm2, %eax
357; AVX2-NEXT:    vmovd %xmm2, %ecx
358; AVX2-NEXT:    addl %eax, %ecx
359; AVX2-NEXT:    vpextrd $2, %xmm2, %eax
360; AVX2-NEXT:    vpextrd $3, %xmm2, %edx
361; AVX2-NEXT:    addl %eax, %edx
362; AVX2-NEXT:    addl %ecx, %edx
363; AVX2-NEXT:    andl $7, %edx
364; AVX2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
365; AVX2-NEXT:    vmovss %xmm0, (%rsp)
366; AVX2-NEXT:    vmovd %xmm3, %eax
367; AVX2-NEXT:    andl $1, %eax
368; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
369; AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
370; AVX2-NEXT:    andl $1, %ecx
371; AVX2-NEXT:    addq %rax, %rcx
372; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
373; AVX2-NEXT:    vpextrd $2, %xmm3, %eax
374; AVX2-NEXT:    andl $1, %eax
375; AVX2-NEXT:    addq %rcx, %rax
376; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rax,4)
377; AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
378; AVX2-NEXT:    andl $1, %ecx
379; AVX2-NEXT:    addq %rax, %rcx
380; AVX2-NEXT:    vmovd %xmm1, %eax
381; AVX2-NEXT:    andl $1, %eax
382; AVX2-NEXT:    addq %rcx, %rax
383; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
384; AVX2-NEXT:    andl $7, %ecx
385; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
386; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
387; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
388; AVX2-NEXT:    andl $1, %ecx
389; AVX2-NEXT:    addq %rax, %rcx
390; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
391; AVX2-NEXT:    andl $7, %eax
392; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
393; AVX2-NEXT:    vpextrd $2, %xmm1, %edx
394; AVX2-NEXT:    andl $1, %edx
395; AVX2-NEXT:    addq %rcx, %rdx
396; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
397; AVX2-NEXT:    andl $7, %ecx
398; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
399; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
400; AVX2-NEXT:    andl $1, %eax
401; AVX2-NEXT:    addq %rdx, %rax
402; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
403; AVX2-NEXT:    andl $7, %edx
404; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
405; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rdx,4)
406; AVX2-NEXT:    cmpq $8, %rax
407; AVX2-NEXT:    jae .LBB5_2
408; AVX2-NEXT:  # %bb.1:
409; AVX2-NEXT:    vmovaps %xmm2, %xmm0
410; AVX2-NEXT:  .LBB5_2:
411; AVX2-NEXT:    cmpq $7, %rax
412; AVX2-NEXT:    movl $7, %ecx
413; AVX2-NEXT:    cmovbq %rax, %rcx
414; AVX2-NEXT:    movl %ecx, %eax
415; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rax,4)
416; AVX2-NEXT:    vmovaps (%rsp), %ymm0
417; AVX2-NEXT:    movq %rbp, %rsp
418; AVX2-NEXT:    popq %rbp
419; AVX2-NEXT:    retq
420;
421; AVX512F-LABEL: test_compress_v8f32:
422; AVX512F:       # %bb.0:
423; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
424; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
425; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
426; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
427; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
428; AVX512F-NEXT:    vcompressps %zmm0, %zmm2 {%k1}
429; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
430; AVX512F-NEXT:    retq
431;
432; AVX512VL-LABEL: test_compress_v8f32:
433; AVX512VL:       # %bb.0:
434; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
435; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
436; AVX512VL-NEXT:    vcompressps %ymm0, %ymm2 {%k1}
437; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
438; AVX512VL-NEXT:    retq
439    %out = call <8 x float> @llvm.experimental.vector.compress(<8 x float> %vec, <8 x i1> %mask, <8 x float> %passthru)
440    ret <8 x float> %out
441}
442
443define <4 x i64> @test_compress_v4i64(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> %passthru) nounwind {
444; AVX2-LABEL: test_compress_v4i64:
445; AVX2:       # %bb.0:
446; AVX2-NEXT:    pushq %rbp
447; AVX2-NEXT:    movq %rsp, %rbp
448; AVX2-NEXT:    andq $-32, %rsp
449; AVX2-NEXT:    subq $64, %rsp
450; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
451; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
452; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
453; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
454; AVX2-NEXT:    vpsrlq $63, %ymm1, %ymm2
455; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
456; AVX2-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
457; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
458; AVX2-NEXT:    vmovq %xmm2, %rax
459; AVX2-NEXT:    addl %ecx, %eax
460; AVX2-NEXT:    andl $3, %eax
461; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
462; AVX2-NEXT:    vmovq %xmm1, %rdx
463; AVX2-NEXT:    andl $1, %edx
464; AVX2-NEXT:    movl %edx, %esi
465; AVX2-NEXT:    subq %rcx, %rdx
466; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
467; AVX2-NEXT:    vmovq %xmm1, %rcx
468; AVX2-NEXT:    movl %edx, %edi
469; AVX2-NEXT:    subq %rcx, %rdx
470; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
471; AVX2-NEXT:    movq %rdx, %r8
472; AVX2-NEXT:    subq %rcx, %r8
473; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
474; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
475; AVX2-NEXT:    cmpq $4, %r8
476; AVX2-NEXT:    cmovbq (%rsp,%rax,8), %rcx
477; AVX2-NEXT:    vmovq %xmm0, (%rsp)
478; AVX2-NEXT:    vpextrq $1, %xmm0, (%rsp,%rsi,8)
479; AVX2-NEXT:    vmovq %xmm1, (%rsp,%rdi,8)
480; AVX2-NEXT:    andl $3, %edx
481; AVX2-NEXT:    vpextrq $1, %xmm1, (%rsp,%rdx,8)
482; AVX2-NEXT:    cmpq $3, %r8
483; AVX2-NEXT:    movl $3, %eax
484; AVX2-NEXT:    cmovbq %r8, %rax
485; AVX2-NEXT:    movl %eax, %eax
486; AVX2-NEXT:    movq %rcx, (%rsp,%rax,8)
487; AVX2-NEXT:    vmovaps (%rsp), %ymm0
488; AVX2-NEXT:    movq %rbp, %rsp
489; AVX2-NEXT:    popq %rbp
490; AVX2-NEXT:    retq
491;
492; AVX512F-LABEL: test_compress_v4i64:
493; AVX512F:       # %bb.0:
494; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
495; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
496; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
497; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
498; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
499; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
500; AVX512F-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
501; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
502; AVX512F-NEXT:    retq
503;
504; AVX512VL-LABEL: test_compress_v4i64:
505; AVX512VL:       # %bb.0:
506; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
507; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
508; AVX512VL-NEXT:    vpcompressq %ymm0, %ymm2 {%k1}
509; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
510; AVX512VL-NEXT:    retq
511    %out = call <4 x i64> @llvm.experimental.vector.compress(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> %passthru)
512    ret <4 x i64> %out
513}
514
515define <4 x double> @test_compress_v4f64(<4 x double> %vec, <4 x i1> %mask, <4 x double> %passthru) nounwind {
516; AVX2-LABEL: test_compress_v4f64:
517; AVX2:       # %bb.0:
518; AVX2-NEXT:    pushq %rbp
519; AVX2-NEXT:    movq %rsp, %rbp
520; AVX2-NEXT:    andq $-32, %rsp
521; AVX2-NEXT:    subq $64, %rsp
522; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
523; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
524; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm3
525; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
526; AVX2-NEXT:    vpsrlq $63, %ymm3, %ymm1
527; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
528; AVX2-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
529; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
530; AVX2-NEXT:    vmovq %xmm1, %rcx
531; AVX2-NEXT:    addl %eax, %ecx
532; AVX2-NEXT:    andl $3, %ecx
533; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
534; AVX2-NEXT:    vmovlpd %xmm0, (%rsp)
535; AVX2-NEXT:    vmovq %xmm3, %rax
536; AVX2-NEXT:    andl $1, %eax
537; AVX2-NEXT:    movl %eax, %ecx
538; AVX2-NEXT:    vmovhpd %xmm0, (%rsp,%rcx,8)
539; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
540; AVX2-NEXT:    subq %rcx, %rax
541; AVX2-NEXT:    movl %eax, %ecx
542; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
543; AVX2-NEXT:    vmovlpd %xmm0, (%rsp,%rcx,8)
544; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
545; AVX2-NEXT:    vmovq %xmm2, %rcx
546; AVX2-NEXT:    subq %rcx, %rax
547; AVX2-NEXT:    movl %eax, %ecx
548; AVX2-NEXT:    andl $3, %ecx
549; AVX2-NEXT:    vmovhpd %xmm0, (%rsp,%rcx,8)
550; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
551; AVX2-NEXT:    subq %rcx, %rax
552; AVX2-NEXT:    cmpq $4, %rax
553; AVX2-NEXT:    jb .LBB7_2
554; AVX2-NEXT:  # %bb.1:
555; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
556; AVX2-NEXT:  .LBB7_2:
557; AVX2-NEXT:    cmpq $3, %rax
558; AVX2-NEXT:    movl $3, %ecx
559; AVX2-NEXT:    cmovbq %rax, %rcx
560; AVX2-NEXT:    movl %ecx, %eax
561; AVX2-NEXT:    vmovsd %xmm1, (%rsp,%rax,8)
562; AVX2-NEXT:    vmovaps (%rsp), %ymm0
563; AVX2-NEXT:    movq %rbp, %rsp
564; AVX2-NEXT:    popq %rbp
565; AVX2-NEXT:    retq
566;
567; AVX512F-LABEL: test_compress_v4f64:
568; AVX512F:       # %bb.0:
569; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
570; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
571; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
572; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
573; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
574; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
575; AVX512F-NEXT:    vcompresspd %zmm0, %zmm2 {%k1}
576; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
577; AVX512F-NEXT:    retq
578;
579; AVX512VL-LABEL: test_compress_v4f64:
580; AVX512VL:       # %bb.0:
581; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
582; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
583; AVX512VL-NEXT:    vcompresspd %ymm0, %ymm2 {%k1}
584; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
585; AVX512VL-NEXT:    retq
586    %out = call <4 x double> @llvm.experimental.vector.compress(<4 x double> %vec, <4 x i1> %mask, <4 x double> %passthru)
587    ret <4 x double> %out
588}
589
590define <16 x i32> @test_compress_v16i32(<16 x i32> %vec, <16 x i1> %mask, <16 x i32> %passthru) nounwind {
591; AVX2-LABEL: test_compress_v16i32:
592; AVX2:       # %bb.0:
593; AVX2-NEXT:    pushq %rbp
594; AVX2-NEXT:    movq %rsp, %rbp
595; AVX2-NEXT:    pushq %r15
596; AVX2-NEXT:    pushq %r14
597; AVX2-NEXT:    pushq %r13
598; AVX2-NEXT:    pushq %r12
599; AVX2-NEXT:    pushq %rbx
600; AVX2-NEXT:    andq $-32, %rsp
601; AVX2-NEXT:    subq $128, %rsp
602; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
603; AVX2-NEXT:    vmovaps %ymm3, (%rsp)
604; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
605; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
606; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
607; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
608; AVX2-NEXT:    vpaddd %ymm3, %ymm4, %ymm3
609; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
610; AVX2-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
611; AVX2-NEXT:    vpextrd $1, %xmm3, %eax
612; AVX2-NEXT:    vmovd %xmm3, %ecx
613; AVX2-NEXT:    addl %eax, %ecx
614; AVX2-NEXT:    vpextrd $2, %xmm3, %eax
615; AVX2-NEXT:    vpextrd $3, %xmm3, %edx
616; AVX2-NEXT:    addl %eax, %edx
617; AVX2-NEXT:    addl %ecx, %edx
618; AVX2-NEXT:    andl $15, %edx
619; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
620; AVX2-NEXT:    vpextrb $1, %xmm2, %eax
621; AVX2-NEXT:    andl $1, %eax
622; AVX2-NEXT:    vmovd %xmm2, %ecx
623; AVX2-NEXT:    andl $1, %ecx
624; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
625; AVX2-NEXT:    addq %rcx, %rax
626; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
627; AVX2-NEXT:    vpextrb $2, %xmm2, %ecx
628; AVX2-NEXT:    andl $1, %ecx
629; AVX2-NEXT:    addq %rax, %rcx
630; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
631; AVX2-NEXT:    vpextrb $3, %xmm2, %eax
632; AVX2-NEXT:    andl $1, %eax
633; AVX2-NEXT:    addq %rcx, %rax
634; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
635; AVX2-NEXT:    vpextrb $4, %xmm2, %r8d
636; AVX2-NEXT:    andl $1, %r8d
637; AVX2-NEXT:    addq %rax, %r8
638; AVX2-NEXT:    vpextrb $5, %xmm2, %r9d
639; AVX2-NEXT:    andl $1, %r9d
640; AVX2-NEXT:    addq %r8, %r9
641; AVX2-NEXT:    vpextrb $6, %xmm2, %r10d
642; AVX2-NEXT:    andl $1, %r10d
643; AVX2-NEXT:    addq %r9, %r10
644; AVX2-NEXT:    vpextrb $7, %xmm2, %r11d
645; AVX2-NEXT:    andl $1, %r11d
646; AVX2-NEXT:    addq %r10, %r11
647; AVX2-NEXT:    vpextrb $8, %xmm2, %ebx
648; AVX2-NEXT:    andl $1, %ebx
649; AVX2-NEXT:    addq %r11, %rbx
650; AVX2-NEXT:    vpextrb $9, %xmm2, %r14d
651; AVX2-NEXT:    andl $1, %r14d
652; AVX2-NEXT:    addq %rbx, %r14
653; AVX2-NEXT:    vpextrb $10, %xmm2, %r15d
654; AVX2-NEXT:    andl $1, %r15d
655; AVX2-NEXT:    addq %r14, %r15
656; AVX2-NEXT:    vpextrb $11, %xmm2, %r12d
657; AVX2-NEXT:    andl $1, %r12d
658; AVX2-NEXT:    addq %r15, %r12
659; AVX2-NEXT:    vpextrb $12, %xmm2, %r13d
660; AVX2-NEXT:    andl $1, %r13d
661; AVX2-NEXT:    addq %r12, %r13
662; AVX2-NEXT:    vpextrb $13, %xmm2, %ecx
663; AVX2-NEXT:    andl $1, %ecx
664; AVX2-NEXT:    addq %r13, %rcx
665; AVX2-NEXT:    vpextrb $14, %xmm2, %eax
666; AVX2-NEXT:    andl $1, %eax
667; AVX2-NEXT:    addq %rcx, %rax
668; AVX2-NEXT:    vpextrb $15, %xmm2, %edx
669; AVX2-NEXT:    andl $1, %edx
670; AVX2-NEXT:    addq %rax, %rdx
671; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
672; AVX2-NEXT:    cmpq $16, %rdx
673; AVX2-NEXT:    vextractps $3, %xmm2, %esi
674; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
675; AVX2-NEXT:    cmovbl (%rsp,%rdi,4), %esi
676; AVX2-NEXT:    movl %esi, %edi
677; AVX2-NEXT:    vmovss %xmm0, (%rsp)
678; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
679; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rsi,4)
680; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
681; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rsi,4)
682; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
683; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rsi,4)
684; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
685; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
686; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rsi,4)
687; AVX2-NEXT:    andl $15, %r8d
688; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%r8,4)
689; AVX2-NEXT:    andl $15, %r9d
690; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%r9,4)
691; AVX2-NEXT:    andl $15, %r10d
692; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%r10,4)
693; AVX2-NEXT:    andl $15, %r11d
694; AVX2-NEXT:    vmovss %xmm1, (%rsp,%r11,4)
695; AVX2-NEXT:    andl $15, %ebx
696; AVX2-NEXT:    vextractps $1, %xmm1, (%rsp,%rbx,4)
697; AVX2-NEXT:    andl $15, %r14d
698; AVX2-NEXT:    vextractps $2, %xmm1, (%rsp,%r14,4)
699; AVX2-NEXT:    andl $15, %r15d
700; AVX2-NEXT:    vextractps $3, %xmm1, (%rsp,%r15,4)
701; AVX2-NEXT:    andl $15, %r12d
702; AVX2-NEXT:    vmovss %xmm2, (%rsp,%r12,4)
703; AVX2-NEXT:    andl $15, %r13d
704; AVX2-NEXT:    vextractps $1, %xmm2, (%rsp,%r13,4)
705; AVX2-NEXT:    andl $15, %ecx
706; AVX2-NEXT:    vextractps $2, %xmm2, (%rsp,%rcx,4)
707; AVX2-NEXT:    andl $15, %eax
708; AVX2-NEXT:    vextractps $3, %xmm2, (%rsp,%rax,4)
709; AVX2-NEXT:    cmpq $15, %rdx
710; AVX2-NEXT:    movl $15, %eax
711; AVX2-NEXT:    cmovbq %rdx, %rax
712; AVX2-NEXT:    movl %eax, %eax
713; AVX2-NEXT:    movl %edi, (%rsp,%rax,4)
714; AVX2-NEXT:    vmovaps (%rsp), %ymm0
715; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
716; AVX2-NEXT:    leaq -40(%rbp), %rsp
717; AVX2-NEXT:    popq %rbx
718; AVX2-NEXT:    popq %r12
719; AVX2-NEXT:    popq %r13
720; AVX2-NEXT:    popq %r14
721; AVX2-NEXT:    popq %r15
722; AVX2-NEXT:    popq %rbp
723; AVX2-NEXT:    retq
724;
725; AVX512F-LABEL: test_compress_v16i32:
726; AVX512F:       # %bb.0:
727; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
728; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
729; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
730; AVX512F-NEXT:    vpcompressd %zmm0, %zmm2 {%k1}
731; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
732; AVX512F-NEXT:    retq
733;
734; AVX512VL-LABEL: test_compress_v16i32:
735; AVX512VL:       # %bb.0:
736; AVX512VL-NEXT:    vpsllw $7, %xmm1, %xmm1
737; AVX512VL-NEXT:    vpmovb2m %xmm1, %k1
738; AVX512VL-NEXT:    vpcompressd %zmm0, %zmm2 {%k1}
739; AVX512VL-NEXT:    vmovdqa64 %zmm2, %zmm0
740; AVX512VL-NEXT:    retq
741    %out = call <16 x i32> @llvm.experimental.vector.compress(<16 x i32> %vec, <16 x i1> %mask, <16 x i32> %passthru)
742    ret <16 x i32> %out
743}
744
745define <16 x float> @test_compress_v16f32(<16 x float> %vec, <16 x i1> %mask, <16 x float> %passthru) nounwind {
746; AVX2-LABEL: test_compress_v16f32:
747; AVX2:       # %bb.0:
748; AVX2-NEXT:    pushq %rbp
749; AVX2-NEXT:    movq %rsp, %rbp
750; AVX2-NEXT:    andq $-32, %rsp
751; AVX2-NEXT:    subq $96, %rsp
752; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
753; AVX2-NEXT:    vmovaps %ymm3, (%rsp)
754; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
755; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
756; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
757; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
758; AVX2-NEXT:    vpaddd %ymm3, %ymm4, %ymm3
759; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
760; AVX2-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
761; AVX2-NEXT:    vpextrd $1, %xmm3, %eax
762; AVX2-NEXT:    vmovd %xmm3, %ecx
763; AVX2-NEXT:    addl %eax, %ecx
764; AVX2-NEXT:    vpextrd $2, %xmm3, %eax
765; AVX2-NEXT:    vpextrd $3, %xmm3, %edx
766; AVX2-NEXT:    addl %eax, %edx
767; AVX2-NEXT:    addl %ecx, %edx
768; AVX2-NEXT:    andl $15, %edx
769; AVX2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
770; AVX2-NEXT:    vmovss %xmm0, (%rsp)
771; AVX2-NEXT:    vmovd %xmm2, %eax
772; AVX2-NEXT:    andl $1, %eax
773; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
774; AVX2-NEXT:    vpextrb $1, %xmm2, %ecx
775; AVX2-NEXT:    andl $1, %ecx
776; AVX2-NEXT:    addq %rax, %rcx
777; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
778; AVX2-NEXT:    vpextrb $2, %xmm2, %eax
779; AVX2-NEXT:    andl $1, %eax
780; AVX2-NEXT:    addq %rcx, %rax
781; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rax,4)
782; AVX2-NEXT:    vpextrb $3, %xmm2, %ecx
783; AVX2-NEXT:    andl $1, %ecx
784; AVX2-NEXT:    addq %rax, %rcx
785; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
786; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
787; AVX2-NEXT:    vpextrb $4, %xmm2, %eax
788; AVX2-NEXT:    andl $1, %eax
789; AVX2-NEXT:    addq %rcx, %rax
790; AVX2-NEXT:    vpextrb $5, %xmm2, %ecx
791; AVX2-NEXT:    andl $1, %ecx
792; AVX2-NEXT:    addq %rax, %rcx
793; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
794; AVX2-NEXT:    andl $15, %eax
795; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
796; AVX2-NEXT:    vpextrb $6, %xmm2, %eax
797; AVX2-NEXT:    andl $1, %eax
798; AVX2-NEXT:    addq %rcx, %rax
799; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
800; AVX2-NEXT:    andl $15, %ecx
801; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
802; AVX2-NEXT:    vpextrb $7, %xmm2, %ecx
803; AVX2-NEXT:    andl $1, %ecx
804; AVX2-NEXT:    addq %rax, %rcx
805; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
806; AVX2-NEXT:    andl $15, %eax
807; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rax,4)
808; AVX2-NEXT:    vpextrb $8, %xmm2, %eax
809; AVX2-NEXT:    andl $1, %eax
810; AVX2-NEXT:    addq %rcx, %rax
811; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
812; AVX2-NEXT:    andl $15, %ecx
813; AVX2-NEXT:    vmovss %xmm1, (%rsp,%rcx,4)
814; AVX2-NEXT:    vpextrb $9, %xmm2, %ecx
815; AVX2-NEXT:    andl $1, %ecx
816; AVX2-NEXT:    addq %rax, %rcx
817; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
818; AVX2-NEXT:    andl $15, %eax
819; AVX2-NEXT:    vextractps $1, %xmm1, (%rsp,%rax,4)
820; AVX2-NEXT:    vpextrb $10, %xmm2, %eax
821; AVX2-NEXT:    andl $1, %eax
822; AVX2-NEXT:    addq %rcx, %rax
823; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
824; AVX2-NEXT:    andl $15, %ecx
825; AVX2-NEXT:    vextractps $2, %xmm1, (%rsp,%rcx,4)
826; AVX2-NEXT:    vpextrb $11, %xmm2, %ecx
827; AVX2-NEXT:    andl $1, %ecx
828; AVX2-NEXT:    addq %rax, %rcx
829; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
830; AVX2-NEXT:    andl $15, %eax
831; AVX2-NEXT:    vextractps $3, %xmm1, (%rsp,%rax,4)
832; AVX2-NEXT:    vpextrb $12, %xmm2, %eax
833; AVX2-NEXT:    andl $1, %eax
834; AVX2-NEXT:    addq %rcx, %rax
835; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
836; AVX2-NEXT:    andl $15, %ecx
837; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
838; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
839; AVX2-NEXT:    vpextrb $13, %xmm2, %ecx
840; AVX2-NEXT:    andl $1, %ecx
841; AVX2-NEXT:    addq %rax, %rcx
842; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
843; AVX2-NEXT:    andl $15, %eax
844; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
845; AVX2-NEXT:    vpextrb $14, %xmm2, %edx
846; AVX2-NEXT:    andl $1, %edx
847; AVX2-NEXT:    addq %rcx, %rdx
848; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
849; AVX2-NEXT:    andl $15, %ecx
850; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
851; AVX2-NEXT:    vpextrb $15, %xmm2, %eax
852; AVX2-NEXT:    andl $1, %eax
853; AVX2-NEXT:    addq %rdx, %rax
854; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
855; AVX2-NEXT:    andl $15, %edx
856; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
857; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rdx,4)
858; AVX2-NEXT:    cmpq $16, %rax
859; AVX2-NEXT:    jae .LBB9_2
860; AVX2-NEXT:  # %bb.1:
861; AVX2-NEXT:    vmovaps %xmm3, %xmm0
862; AVX2-NEXT:  .LBB9_2:
863; AVX2-NEXT:    cmpq $15, %rax
864; AVX2-NEXT:    movl $15, %ecx
865; AVX2-NEXT:    cmovbq %rax, %rcx
866; AVX2-NEXT:    movl %ecx, %eax
867; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rax,4)
868; AVX2-NEXT:    vmovaps (%rsp), %ymm0
869; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
870; AVX2-NEXT:    movq %rbp, %rsp
871; AVX2-NEXT:    popq %rbp
872; AVX2-NEXT:    retq
873;
874; AVX512F-LABEL: test_compress_v16f32:
875; AVX512F:       # %bb.0:
876; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
877; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
878; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
879; AVX512F-NEXT:    vcompressps %zmm0, %zmm2 {%k1}
880; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
881; AVX512F-NEXT:    retq
882;
883; AVX512VL-LABEL: test_compress_v16f32:
884; AVX512VL:       # %bb.0:
885; AVX512VL-NEXT:    vpsllw $7, %xmm1, %xmm1
886; AVX512VL-NEXT:    vpmovb2m %xmm1, %k1
887; AVX512VL-NEXT:    vcompressps %zmm0, %zmm2 {%k1}
888; AVX512VL-NEXT:    vmovdqa64 %zmm2, %zmm0
889; AVX512VL-NEXT:    retq
890    %out = call <16 x float> @llvm.experimental.vector.compress(<16 x float> %vec, <16 x i1> %mask, <16 x float> %passthru)
891    ret <16 x float> %out
892}
893
894define <8 x i64> @test_compress_v8i64(<8 x i64> %vec, <8 x i1> %mask, <8 x i64> %passthru) nounwind {
895; AVX2-LABEL: test_compress_v8i64:
896; AVX2:       # %bb.0:
897; AVX2-NEXT:    pushq %rbp
898; AVX2-NEXT:    movq %rsp, %rbp
899; AVX2-NEXT:    pushq %rbx
900; AVX2-NEXT:    andq $-32, %rsp
901; AVX2-NEXT:    subq $96, %rsp
902; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
903; AVX2-NEXT:    vmovaps %ymm3, (%rsp)
904; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
905; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
906; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
907; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
908; AVX2-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
909; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
910; AVX2-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
911; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
912; AVX2-NEXT:    vmovq %xmm3, %rax
913; AVX2-NEXT:    addl %ecx, %eax
914; AVX2-NEXT:    andl $7, %eax
915; AVX2-NEXT:    vpextrw $1, %xmm2, %ecx
916; AVX2-NEXT:    andl $1, %ecx
917; AVX2-NEXT:    vmovd %xmm2, %edx
918; AVX2-NEXT:    andl $1, %edx
919; AVX2-NEXT:    addq %rdx, %rcx
920; AVX2-NEXT:    vpextrw $2, %xmm2, %esi
921; AVX2-NEXT:    andl $1, %esi
922; AVX2-NEXT:    addq %rcx, %rsi
923; AVX2-NEXT:    vpextrw $3, %xmm2, %edi
924; AVX2-NEXT:    andl $1, %edi
925; AVX2-NEXT:    addq %rsi, %rdi
926; AVX2-NEXT:    vpextrw $4, %xmm2, %r8d
927; AVX2-NEXT:    andl $1, %r8d
928; AVX2-NEXT:    addq %rdi, %r8
929; AVX2-NEXT:    vpextrw $5, %xmm2, %r9d
930; AVX2-NEXT:    andl $1, %r9d
931; AVX2-NEXT:    addq %r8, %r9
932; AVX2-NEXT:    vpextrw $6, %xmm2, %r10d
933; AVX2-NEXT:    andl $1, %r10d
934; AVX2-NEXT:    addq %r9, %r10
935; AVX2-NEXT:    vpextrw $7, %xmm2, %r11d
936; AVX2-NEXT:    andl $1, %r11d
937; AVX2-NEXT:    addq %r10, %r11
938; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
939; AVX2-NEXT:    vpextrq $1, %xmm2, %rbx
940; AVX2-NEXT:    cmpq $8, %r11
941; AVX2-NEXT:    cmovbq (%rsp,%rax,8), %rbx
942; AVX2-NEXT:    vmovq %xmm0, (%rsp)
943; AVX2-NEXT:    vpextrq $1, %xmm0, (%rsp,%rdx,8)
944; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
945; AVX2-NEXT:    vmovq %xmm0, (%rsp,%rcx,8)
946; AVX2-NEXT:    vpextrq $1, %xmm0, (%rsp,%rsi,8)
947; AVX2-NEXT:    andl $7, %edi
948; AVX2-NEXT:    vmovq %xmm1, (%rsp,%rdi,8)
949; AVX2-NEXT:    andl $7, %r8d
950; AVX2-NEXT:    vpextrq $1, %xmm1, (%rsp,%r8,8)
951; AVX2-NEXT:    andl $7, %r9d
952; AVX2-NEXT:    vmovq %xmm2, (%rsp,%r9,8)
953; AVX2-NEXT:    andl $7, %r10d
954; AVX2-NEXT:    vpextrq $1, %xmm2, (%rsp,%r10,8)
955; AVX2-NEXT:    cmpq $7, %r11
956; AVX2-NEXT:    movl $7, %eax
957; AVX2-NEXT:    cmovbq %r11, %rax
958; AVX2-NEXT:    movl %eax, %eax
959; AVX2-NEXT:    movq %rbx, (%rsp,%rax,8)
960; AVX2-NEXT:    vmovaps (%rsp), %ymm0
961; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
962; AVX2-NEXT:    leaq -8(%rbp), %rsp
963; AVX2-NEXT:    popq %rbx
964; AVX2-NEXT:    popq %rbp
965; AVX2-NEXT:    retq
966;
967; AVX512F-LABEL: test_compress_v8i64:
968; AVX512F:       # %bb.0:
969; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
970; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
971; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
972; AVX512F-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
973; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
974; AVX512F-NEXT:    retq
975;
976; AVX512VL-LABEL: test_compress_v8i64:
977; AVX512VL:       # %bb.0:
978; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
979; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
980; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
981; AVX512VL-NEXT:    vmovdqa64 %zmm2, %zmm0
982; AVX512VL-NEXT:    retq
983    %out = call <8 x i64> @llvm.experimental.vector.compress(<8 x i64> %vec, <8 x i1> %mask, <8 x i64> %passthru)
984    ret <8 x i64> %out
985}
986
987define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x double> %passthru) nounwind {
988; AVX2-LABEL: test_compress_v8f64:
989; AVX2:       # %bb.0:
990; AVX2-NEXT:    pushq %rbp
991; AVX2-NEXT:    movq %rsp, %rbp
992; AVX2-NEXT:    andq $-32, %rsp
993; AVX2-NEXT:    subq $96, %rsp
994; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
995; AVX2-NEXT:    vmovaps %ymm3, (%rsp)
996; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
997; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
998; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
999; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
1000; AVX2-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
1001; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
1002; AVX2-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
1003; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
1004; AVX2-NEXT:    vmovq %xmm3, %rcx
1005; AVX2-NEXT:    addl %eax, %ecx
1006; AVX2-NEXT:    andl $7, %ecx
1007; AVX2-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
1008; AVX2-NEXT:    vmovlps %xmm0, (%rsp)
1009; AVX2-NEXT:    vmovd %xmm2, %eax
1010; AVX2-NEXT:    andl $1, %eax
1011; AVX2-NEXT:    vmovhps %xmm0, (%rsp,%rax,8)
1012; AVX2-NEXT:    vpextrw $1, %xmm2, %ecx
1013; AVX2-NEXT:    andl $1, %ecx
1014; AVX2-NEXT:    addq %rax, %rcx
1015; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
1016; AVX2-NEXT:    vmovlps %xmm0, (%rsp,%rcx,8)
1017; AVX2-NEXT:    vpextrw $2, %xmm2, %eax
1018; AVX2-NEXT:    andl $1, %eax
1019; AVX2-NEXT:    addq %rcx, %rax
1020; AVX2-NEXT:    vmovhps %xmm0, (%rsp,%rax,8)
1021; AVX2-NEXT:    vpextrw $3, %xmm2, %ecx
1022; AVX2-NEXT:    andl $1, %ecx
1023; AVX2-NEXT:    addq %rax, %rcx
1024; AVX2-NEXT:    vpextrw $4, %xmm2, %eax
1025; AVX2-NEXT:    andl $1, %eax
1026; AVX2-NEXT:    addq %rcx, %rax
1027; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1028; AVX2-NEXT:    andl $7, %ecx
1029; AVX2-NEXT:    vmovlpd %xmm1, (%rsp,%rcx,8)
1030; AVX2-NEXT:    vpextrw $5, %xmm2, %ecx
1031; AVX2-NEXT:    andl $1, %ecx
1032; AVX2-NEXT:    addq %rax, %rcx
1033; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
1034; AVX2-NEXT:    andl $7, %eax
1035; AVX2-NEXT:    vmovhpd %xmm1, (%rsp,%rax,8)
1036; AVX2-NEXT:    vpextrw $6, %xmm2, %edx
1037; AVX2-NEXT:    andl $1, %edx
1038; AVX2-NEXT:    addq %rcx, %rdx
1039; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1040; AVX2-NEXT:    andl $7, %ecx
1041; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
1042; AVX2-NEXT:    vmovlpd %xmm0, (%rsp,%rcx,8)
1043; AVX2-NEXT:    vpextrw $7, %xmm2, %eax
1044; AVX2-NEXT:    andl $1, %eax
1045; AVX2-NEXT:    addq %rdx, %rax
1046; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1047; AVX2-NEXT:    andl $7, %edx
1048; AVX2-NEXT:    vmovhpd %xmm0, (%rsp,%rdx,8)
1049; AVX2-NEXT:    cmpq $8, %rax
1050; AVX2-NEXT:    jb .LBB11_2
1051; AVX2-NEXT:  # %bb.1:
1052; AVX2-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
1053; AVX2-NEXT:  .LBB11_2:
1054; AVX2-NEXT:    cmpq $7, %rax
1055; AVX2-NEXT:    movl $7, %ecx
1056; AVX2-NEXT:    cmovbq %rax, %rcx
1057; AVX2-NEXT:    movl %ecx, %eax
1058; AVX2-NEXT:    vmovsd %xmm3, (%rsp,%rax,8)
1059; AVX2-NEXT:    vmovaps (%rsp), %ymm0
1060; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
1061; AVX2-NEXT:    movq %rbp, %rsp
1062; AVX2-NEXT:    popq %rbp
1063; AVX2-NEXT:    retq
1064;
1065; AVX512F-LABEL: test_compress_v8f64:
1066; AVX512F:       # %bb.0:
1067; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
1068; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
1069; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
1070; AVX512F-NEXT:    vcompresspd %zmm0, %zmm2 {%k1}
1071; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
1072; AVX512F-NEXT:    retq
1073;
1074; AVX512VL-LABEL: test_compress_v8f64:
1075; AVX512VL:       # %bb.0:
1076; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
1077; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
1078; AVX512VL-NEXT:    vcompresspd %zmm0, %zmm2 {%k1}
1079; AVX512VL-NEXT:    vmovdqa64 %zmm2, %zmm0
1080; AVX512VL-NEXT:    retq
1081    %out = call <8 x double> @llvm.experimental.vector.compress(<8 x double> %vec, <8 x i1> %mask, <8 x double> %passthru)
1082    ret <8 x double> %out
1083}
1084
1085define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> %passthru) nounwind {
1086; AVX2-LABEL: test_compress_v16i8:
1087; AVX2:       # %bb.0:
1088; AVX2-NEXT:    pushq %rbp
1089; AVX2-NEXT:    pushq %r15
1090; AVX2-NEXT:    pushq %r14
1091; AVX2-NEXT:    pushq %r13
1092; AVX2-NEXT:    pushq %r12
1093; AVX2-NEXT:    pushq %rbx
1094; AVX2-NEXT:    vpsllw $7, %xmm1, %xmm1
1095; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1096; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1097; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm3, %xmm1
1098; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
1099; AVX2-NEXT:    vpextrb $1, %xmm1, %r11d
1100; AVX2-NEXT:    vmovd %xmm1, %eax
1101; AVX2-NEXT:    movzbl %al, %edx
1102; AVX2-NEXT:    # kill: def $al killed $al killed $eax
1103; AVX2-NEXT:    andb $1, %al
1104; AVX2-NEXT:    subb %r11b, %al
1105; AVX2-NEXT:    vpextrb $2, %xmm1, %esi
1106; AVX2-NEXT:    subb %sil, %al
1107; AVX2-NEXT:    vpextrb $3, %xmm1, %r13d
1108; AVX2-NEXT:    subb %r13b, %al
1109; AVX2-NEXT:    vpextrb $4, %xmm1, %r12d
1110; AVX2-NEXT:    subb %r12b, %al
1111; AVX2-NEXT:    vpextrb $5, %xmm1, %r15d
1112; AVX2-NEXT:    subb %r15b, %al
1113; AVX2-NEXT:    vpextrb $6, %xmm1, %r14d
1114; AVX2-NEXT:    subb %r14b, %al
1115; AVX2-NEXT:    vpextrb $7, %xmm1, %ebp
1116; AVX2-NEXT:    subb %bpl, %al
1117; AVX2-NEXT:    vpextrb $8, %xmm1, %ebx
1118; AVX2-NEXT:    subb %bl, %al
1119; AVX2-NEXT:    vpextrb $9, %xmm1, %r10d
1120; AVX2-NEXT:    subb %r10b, %al
1121; AVX2-NEXT:    vpextrb $10, %xmm1, %r9d
1122; AVX2-NEXT:    subb %r9b, %al
1123; AVX2-NEXT:    vpextrb $11, %xmm1, %r8d
1124; AVX2-NEXT:    subb %r8b, %al
1125; AVX2-NEXT:    vpextrb $12, %xmm1, %edi
1126; AVX2-NEXT:    subb %dil, %al
1127; AVX2-NEXT:    vpextrb $13, %xmm1, %ecx
1128; AVX2-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1129; AVX2-NEXT:    subb %cl, %al
1130; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
1131; AVX2-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1132; AVX2-NEXT:    subb %cl, %al
1133; AVX2-NEXT:    vpextrb $15, %xmm1, %ecx
1134; AVX2-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1135; AVX2-NEXT:    subb %cl, %al
1136; AVX2-NEXT:    movzbl %al, %eax
1137; AVX2-NEXT:    andl $15, %eax
1138; AVX2-NEXT:    movzbl -40(%rsp,%rax), %eax
1139; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1140; AVX2-NEXT:    vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
1141; AVX2-NEXT:    andl $1, %edx
1142; AVX2-NEXT:    vpextrb $1, %xmm0, -40(%rsp,%rdx)
1143; AVX2-NEXT:    movzbl %r11b, %eax
1144; AVX2-NEXT:    andl $1, %eax
1145; AVX2-NEXT:    addq %rdx, %rax
1146; AVX2-NEXT:    vpextrb $2, %xmm0, -40(%rsp,%rax)
1147; AVX2-NEXT:    movzbl %sil, %ecx
1148; AVX2-NEXT:    andl $1, %ecx
1149; AVX2-NEXT:    addq %rax, %rcx
1150; AVX2-NEXT:    vpextrb $3, %xmm0, -40(%rsp,%rcx)
1151; AVX2-NEXT:    movzbl %r13b, %eax
1152; AVX2-NEXT:    andl $1, %eax
1153; AVX2-NEXT:    addq %rcx, %rax
1154; AVX2-NEXT:    vpextrb $4, %xmm0, -40(%rsp,%rax)
1155; AVX2-NEXT:    movzbl %r12b, %ecx
1156; AVX2-NEXT:    andl $1, %ecx
1157; AVX2-NEXT:    addq %rax, %rcx
1158; AVX2-NEXT:    movzbl %r15b, %eax
1159; AVX2-NEXT:    andl $1, %eax
1160; AVX2-NEXT:    addq %rcx, %rax
1161; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1162; AVX2-NEXT:    andl $15, %ecx
1163; AVX2-NEXT:    vpextrb $5, %xmm0, -40(%rsp,%rcx)
1164; AVX2-NEXT:    movzbl %r14b, %ecx
1165; AVX2-NEXT:    andl $1, %ecx
1166; AVX2-NEXT:    addq %rax, %rcx
1167; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
1168; AVX2-NEXT:    andl $15, %eax
1169; AVX2-NEXT:    vpextrb $6, %xmm0, -40(%rsp,%rax)
1170; AVX2-NEXT:    movzbl %bpl, %eax
1171; AVX2-NEXT:    andl $1, %eax
1172; AVX2-NEXT:    addq %rcx, %rax
1173; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1174; AVX2-NEXT:    andl $15, %ecx
1175; AVX2-NEXT:    vpextrb $7, %xmm0, -40(%rsp,%rcx)
1176; AVX2-NEXT:    movzbl %bl, %ecx
1177; AVX2-NEXT:    andl $1, %ecx
1178; AVX2-NEXT:    addq %rax, %rcx
1179; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
1180; AVX2-NEXT:    andl $15, %eax
1181; AVX2-NEXT:    vpextrb $8, %xmm0, -40(%rsp,%rax)
1182; AVX2-NEXT:    movzbl %r10b, %eax
1183; AVX2-NEXT:    andl $1, %eax
1184; AVX2-NEXT:    addq %rcx, %rax
1185; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1186; AVX2-NEXT:    andl $15, %ecx
1187; AVX2-NEXT:    vpextrb $9, %xmm0, -40(%rsp,%rcx)
1188; AVX2-NEXT:    movzbl %r9b, %ecx
1189; AVX2-NEXT:    andl $1, %ecx
1190; AVX2-NEXT:    addq %rax, %rcx
1191; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
1192; AVX2-NEXT:    andl $15, %eax
1193; AVX2-NEXT:    vpextrb $10, %xmm0, -40(%rsp,%rax)
1194; AVX2-NEXT:    movzbl %r8b, %eax
1195; AVX2-NEXT:    andl $1, %eax
1196; AVX2-NEXT:    addq %rcx, %rax
1197; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1198; AVX2-NEXT:    andl $15, %ecx
1199; AVX2-NEXT:    vpextrb $11, %xmm0, -40(%rsp,%rcx)
1200; AVX2-NEXT:    movzbl %dil, %ecx
1201; AVX2-NEXT:    andl $1, %ecx
1202; AVX2-NEXT:    addq %rax, %rcx
1203; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
1204; AVX2-NEXT:    andl $15, %eax
1205; AVX2-NEXT:    vpextrb $12, %xmm0, -40(%rsp,%rax)
1206; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1207; AVX2-NEXT:    andl $1, %eax
1208; AVX2-NEXT:    addq %rcx, %rax
1209; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1210; AVX2-NEXT:    andl $15, %ecx
1211; AVX2-NEXT:    vpextrb $13, %xmm0, -40(%rsp,%rcx)
1212; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1213; AVX2-NEXT:    andl $1, %ecx
1214; AVX2-NEXT:    addq %rax, %rcx
1215; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
1216; AVX2-NEXT:    andl $15, %eax
1217; AVX2-NEXT:    vpextrb $14, %xmm0, -40(%rsp,%rax)
1218; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1219; AVX2-NEXT:    andl $1, %eax
1220; AVX2-NEXT:    addq %rcx, %rax
1221; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1222; AVX2-NEXT:    andl $15, %ecx
1223; AVX2-NEXT:    vpextrb $15, %xmm0, -40(%rsp,%rcx)
1224; AVX2-NEXT:    cmpq $15, %rax
1225; AVX2-NEXT:    movl $15, %ecx
1226; AVX2-NEXT:    cmovbq %rax, %rcx
1227; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
1228; AVX2-NEXT:    cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
1229; AVX2-NEXT:    movb %al, -40(%rsp,%rcx)
1230; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
1231; AVX2-NEXT:    popq %rbx
1232; AVX2-NEXT:    popq %r12
1233; AVX2-NEXT:    popq %r13
1234; AVX2-NEXT:    popq %r14
1235; AVX2-NEXT:    popq %r15
1236; AVX2-NEXT:    popq %rbp
1237; AVX2-NEXT:    retq
1238;
1239; AVX512F-LABEL: test_compress_v16i8:
1240; AVX512F:       # %bb.0:
1241; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
1242; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
1243; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
1244; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1245; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1246; AVX512F-NEXT:    vpcompressd %zmm0, %zmm1 {%k1}
1247; AVX512F-NEXT:    vpmovdb %zmm1, %xmm0
1248; AVX512F-NEXT:    vzeroupper
1249; AVX512F-NEXT:    retq
1250;
1251; AVX512VL-LABEL: test_compress_v16i8:
1252; AVX512VL:       # %bb.0:
1253; AVX512VL-NEXT:    vpsllw $7, %xmm1, %xmm1
1254; AVX512VL-NEXT:    vpmovb2m %xmm1, %k1
1255; AVX512VL-NEXT:    vpcompressb %xmm0, %xmm2 {%k1}
1256; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
1257; AVX512VL-NEXT:    retq
1258    %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> %passthru)
1259    ret <16 x i8> %out
1260}
1261
1262define <8 x i16> @test_compress_v8i16(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> %passthru) nounwind {
1263; AVX2-LABEL: test_compress_v8i16:
1264; AVX2:       # %bb.0:
1265; AVX2-NEXT:    pushq %rbx
1266; AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
1267; AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
1268; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
1269; AVX2-NEXT:    vpextrw $1, %xmm1, %eax
1270; AVX2-NEXT:    andl $1, %eax
1271; AVX2-NEXT:    vmovd %xmm1, %ecx
1272; AVX2-NEXT:    andl $1, %ecx
1273; AVX2-NEXT:    leal (%rcx,%rax), %esi
1274; AVX2-NEXT:    vpextrw $2, %xmm1, %edi
1275; AVX2-NEXT:    andl $1, %edi
1276; AVX2-NEXT:    vpextrw $3, %xmm1, %edx
1277; AVX2-NEXT:    andl $1, %edx
1278; AVX2-NEXT:    leal (%rdi,%rdx), %r10d
1279; AVX2-NEXT:    addl %esi, %r10d
1280; AVX2-NEXT:    vpextrw $4, %xmm1, %r9d
1281; AVX2-NEXT:    andl $1, %r9d
1282; AVX2-NEXT:    vpextrw $5, %xmm1, %esi
1283; AVX2-NEXT:    andl $1, %esi
1284; AVX2-NEXT:    leal (%r9,%rsi), %r11d
1285; AVX2-NEXT:    vpextrw $6, %xmm1, %r8d
1286; AVX2-NEXT:    andl $1, %r8d
1287; AVX2-NEXT:    addl %r8d, %r11d
1288; AVX2-NEXT:    addl %r10d, %r11d
1289; AVX2-NEXT:    vpextrw $7, %xmm1, %r10d
1290; AVX2-NEXT:    andl $1, %r10d
1291; AVX2-NEXT:    addl %r10d, %r11d
1292; AVX2-NEXT:    andl $7, %r11d
1293; AVX2-NEXT:    addq %rcx, %rax
1294; AVX2-NEXT:    addq %rax, %rdi
1295; AVX2-NEXT:    addq %rdi, %rdx
1296; AVX2-NEXT:    addq %rdx, %r9
1297; AVX2-NEXT:    addq %r9, %rsi
1298; AVX2-NEXT:    addq %rsi, %r8
1299; AVX2-NEXT:    addq %r8, %r10
1300; AVX2-NEXT:    vpextrw $7, %xmm0, %ebx
1301; AVX2-NEXT:    cmpq $8, %r10
1302; AVX2-NEXT:    cmovbw -16(%rsp,%r11,2), %bx
1303; AVX2-NEXT:    vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
1304; AVX2-NEXT:    vpextrw $1, %xmm0, -16(%rsp,%rcx,2)
1305; AVX2-NEXT:    vpextrw $2, %xmm0, -16(%rsp,%rax,2)
1306; AVX2-NEXT:    vpextrw $3, %xmm0, -16(%rsp,%rdi,2)
1307; AVX2-NEXT:    andl $7, %edx
1308; AVX2-NEXT:    vpextrw $4, %xmm0, -16(%rsp,%rdx,2)
1309; AVX2-NEXT:    andl $7, %r9d
1310; AVX2-NEXT:    vpextrw $5, %xmm0, -16(%rsp,%r9,2)
1311; AVX2-NEXT:    andl $7, %esi
1312; AVX2-NEXT:    vpextrw $6, %xmm0, -16(%rsp,%rsi,2)
1313; AVX2-NEXT:    andl $7, %r8d
1314; AVX2-NEXT:    vpextrw $7, %xmm0, -16(%rsp,%r8,2)
1315; AVX2-NEXT:    cmpq $7, %r10
1316; AVX2-NEXT:    movl $7, %eax
1317; AVX2-NEXT:    cmovbq %r10, %rax
1318; AVX2-NEXT:    movl %eax, %eax
1319; AVX2-NEXT:    movw %bx, -16(%rsp,%rax,2)
1320; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
1321; AVX2-NEXT:    popq %rbx
1322; AVX2-NEXT:    retq
1323;
1324; AVX512F-LABEL: test_compress_v8i16:
1325; AVX512F:       # %bb.0:
1326; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
1327; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
1328; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
1329; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1330; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
1331; AVX512F-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
1332; AVX512F-NEXT:    vpmovqw %zmm1, %xmm0
1333; AVX512F-NEXT:    vzeroupper
1334; AVX512F-NEXT:    retq
1335;
1336; AVX512VL-LABEL: test_compress_v8i16:
1337; AVX512VL:       # %bb.0:
1338; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
1339; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
1340; AVX512VL-NEXT:    vpcompressw %xmm0, %xmm2 {%k1}
1341; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
1342; AVX512VL-NEXT:    retq
1343    %out = call <8 x i16> @llvm.experimental.vector.compress(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> %passthru)
1344    ret <8 x i16> %out
1345}
1346
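; v32i8 on AVX2 is fully scalarized through a 32-byte stack slot, with the
; mask bits accumulated byte-wise to form each store index. AVX512F lacks
; VPCOMPRESSB, so it zero-extends each 16-byte half to <16 x i32>, compresses
; with VPCOMPRESSD, writes the second half at an offset derived from the
; low-half mask, and blends with the passthru. AVX512VL+VBMI2 uses a single
; VPCOMPRESSB.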
1347define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> %passthru) nounwind {
1348; AVX2-LABEL: test_compress_v32i8:
1349; AVX2:       # %bb.0:
1350; AVX2-NEXT:    pushq %rbp
1351; AVX2-NEXT:    movq %rsp, %rbp
1352; AVX2-NEXT:    andq $-32, %rsp
1353; AVX2-NEXT:    subq $64, %rsp
1354; AVX2-NEXT:    vpsllw $7, %ymm1, %ymm1
1355; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1356; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1357; AVX2-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm3
1358; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
1359; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm1
1360; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1361; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm4
1362; AVX2-NEXT:    vpand %xmm2, %xmm3, %xmm2
1363; AVX2-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
1364; AVX2-NEXT:    vpextrb $1, %xmm2, %eax
1365; AVX2-NEXT:    vmovd %xmm2, %ecx
1366; AVX2-NEXT:    addb %al, %cl
1367; AVX2-NEXT:    vpextrb $2, %xmm2, %eax
1368; AVX2-NEXT:    vpextrb $3, %xmm2, %edx
1369; AVX2-NEXT:    addb %al, %dl
1370; AVX2-NEXT:    addb %cl, %dl
1371; AVX2-NEXT:    vpextrb $4, %xmm2, %eax
1372; AVX2-NEXT:    vpextrb $5, %xmm2, %ecx
1373; AVX2-NEXT:    addb %al, %cl
1374; AVX2-NEXT:    vpextrb $6, %xmm2, %eax
1375; AVX2-NEXT:    addb %cl, %al
1376; AVX2-NEXT:    addb %dl, %al
1377; AVX2-NEXT:    vpextrb $7, %xmm2, %ecx
1378; AVX2-NEXT:    vpextrb $8, %xmm2, %edx
1379; AVX2-NEXT:    addb %cl, %dl
1380; AVX2-NEXT:    vpextrb $9, %xmm2, %ecx
1381; AVX2-NEXT:    addb %dl, %cl
1382; AVX2-NEXT:    vpextrb $10, %xmm2, %edx
1383; AVX2-NEXT:    addb %cl, %dl
1384; AVX2-NEXT:    addb %al, %dl
1385; AVX2-NEXT:    vpextrb $11, %xmm2, %eax
1386; AVX2-NEXT:    vpextrb $12, %xmm2, %ecx
1387; AVX2-NEXT:    addb %al, %cl
1388; AVX2-NEXT:    vpextrb $13, %xmm2, %eax
1389; AVX2-NEXT:    addb %cl, %al
1390; AVX2-NEXT:    vpextrb $14, %xmm2, %ecx
1391; AVX2-NEXT:    addb %al, %cl
1392; AVX2-NEXT:    vpextrb $15, %xmm2, %eax
1393; AVX2-NEXT:    addb %cl, %al
1394; AVX2-NEXT:    addb %dl, %al
1395; AVX2-NEXT:    movzbl %al, %eax
1396; AVX2-NEXT:    andl $31, %eax
1397; AVX2-NEXT:    movzbl (%rsp,%rax), %eax
1398; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp)
1399; AVX2-NEXT:    vmovd %xmm3, %ecx
1400; AVX2-NEXT:    andl $1, %ecx
1401; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rcx)
1402; AVX2-NEXT:    vpextrb $1, %xmm3, %edx
1403; AVX2-NEXT:    andl $1, %edx
1404; AVX2-NEXT:    addq %rcx, %rdx
1405; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rdx)
1406; AVX2-NEXT:    vpextrb $2, %xmm3, %ecx
1407; AVX2-NEXT:    andl $1, %ecx
1408; AVX2-NEXT:    addq %rdx, %rcx
1409; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rcx)
1410; AVX2-NEXT:    vpextrb $3, %xmm3, %edx
1411; AVX2-NEXT:    andl $1, %edx
1412; AVX2-NEXT:    addq %rcx, %rdx
1413; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rdx)
1414; AVX2-NEXT:    vpextrb $4, %xmm3, %ecx
1415; AVX2-NEXT:    andl $1, %ecx
1416; AVX2-NEXT:    addq %rdx, %rcx
1417; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rcx)
1418; AVX2-NEXT:    vpextrb $5, %xmm3, %edx
1419; AVX2-NEXT:    andl $1, %edx
1420; AVX2-NEXT:    addq %rcx, %rdx
1421; AVX2-NEXT:    vpextrb $6, %xmm3, %ecx
1422; AVX2-NEXT:    andl $1, %ecx
1423; AVX2-NEXT:    addq %rdx, %rcx
1424; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1425; AVX2-NEXT:    andl $31, %edx
1426; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rdx)
1427; AVX2-NEXT:    vpextrb $7, %xmm3, %edx
1428; AVX2-NEXT:    andl $1, %edx
1429; AVX2-NEXT:    addq %rcx, %rdx
1430; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1431; AVX2-NEXT:    andl $31, %ecx
1432; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rcx)
1433; AVX2-NEXT:    vpextrb $8, %xmm3, %ecx
1434; AVX2-NEXT:    andl $1, %ecx
1435; AVX2-NEXT:    addq %rdx, %rcx
1436; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1437; AVX2-NEXT:    andl $31, %edx
1438; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rdx)
1439; AVX2-NEXT:    vpextrb $9, %xmm3, %edx
1440; AVX2-NEXT:    andl $1, %edx
1441; AVX2-NEXT:    addq %rcx, %rdx
1442; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1443; AVX2-NEXT:    andl $31, %ecx
1444; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rcx)
1445; AVX2-NEXT:    vpextrb $10, %xmm3, %ecx
1446; AVX2-NEXT:    andl $1, %ecx
1447; AVX2-NEXT:    addq %rdx, %rcx
1448; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1449; AVX2-NEXT:    andl $31, %edx
1450; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rdx)
1451; AVX2-NEXT:    vpextrb $11, %xmm3, %edx
1452; AVX2-NEXT:    andl $1, %edx
1453; AVX2-NEXT:    addq %rcx, %rdx
1454; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1455; AVX2-NEXT:    andl $31, %ecx
1456; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rcx)
1457; AVX2-NEXT:    vpextrb $12, %xmm3, %ecx
1458; AVX2-NEXT:    andl $1, %ecx
1459; AVX2-NEXT:    addq %rdx, %rcx
1460; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1461; AVX2-NEXT:    andl $31, %edx
1462; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rdx)
1463; AVX2-NEXT:    vpextrb $13, %xmm3, %edx
1464; AVX2-NEXT:    andl $1, %edx
1465; AVX2-NEXT:    addq %rcx, %rdx
1466; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1467; AVX2-NEXT:    andl $31, %ecx
1468; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rcx)
1469; AVX2-NEXT:    vpextrb $14, %xmm3, %ecx
1470; AVX2-NEXT:    andl $1, %ecx
1471; AVX2-NEXT:    addq %rdx, %rcx
1472; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1473; AVX2-NEXT:    andl $31, %edx
1474; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rdx)
1475; AVX2-NEXT:    vpextrb $15, %xmm3, %edx
1476; AVX2-NEXT:    andl $1, %edx
1477; AVX2-NEXT:    addq %rcx, %rdx
1478; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1479; AVX2-NEXT:    andl $31, %ecx
1480; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rcx)
1481; AVX2-NEXT:    vmovd %xmm1, %ecx
1482; AVX2-NEXT:    andl $1, %ecx
1483; AVX2-NEXT:    addq %rdx, %rcx
1484; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1485; AVX2-NEXT:    andl $31, %edx
1486; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1487; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rdx)
1488; AVX2-NEXT:    vpextrb $1, %xmm1, %edx
1489; AVX2-NEXT:    andl $1, %edx
1490; AVX2-NEXT:    addq %rcx, %rdx
1491; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1492; AVX2-NEXT:    andl $31, %ecx
1493; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rcx)
1494; AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
1495; AVX2-NEXT:    andl $1, %ecx
1496; AVX2-NEXT:    addq %rdx, %rcx
1497; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1498; AVX2-NEXT:    andl $31, %edx
1499; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rdx)
1500; AVX2-NEXT:    vpextrb $3, %xmm1, %edx
1501; AVX2-NEXT:    andl $1, %edx
1502; AVX2-NEXT:    addq %rcx, %rdx
1503; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1504; AVX2-NEXT:    andl $31, %ecx
1505; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rcx)
1506; AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
1507; AVX2-NEXT:    andl $1, %ecx
1508; AVX2-NEXT:    addq %rdx, %rcx
1509; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1510; AVX2-NEXT:    andl $31, %edx
1511; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rdx)
1512; AVX2-NEXT:    vpextrb $5, %xmm1, %edx
1513; AVX2-NEXT:    andl $1, %edx
1514; AVX2-NEXT:    addq %rcx, %rdx
1515; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1516; AVX2-NEXT:    andl $31, %ecx
1517; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rcx)
1518; AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
1519; AVX2-NEXT:    andl $1, %ecx
1520; AVX2-NEXT:    addq %rdx, %rcx
1521; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1522; AVX2-NEXT:    andl $31, %edx
1523; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rdx)
1524; AVX2-NEXT:    vpextrb $7, %xmm1, %edx
1525; AVX2-NEXT:    andl $1, %edx
1526; AVX2-NEXT:    addq %rcx, %rdx
1527; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1528; AVX2-NEXT:    andl $31, %ecx
1529; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rcx)
1530; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
1531; AVX2-NEXT:    andl $1, %ecx
1532; AVX2-NEXT:    addq %rdx, %rcx
1533; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1534; AVX2-NEXT:    andl $31, %edx
1535; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rdx)
1536; AVX2-NEXT:    vpextrb $9, %xmm1, %edx
1537; AVX2-NEXT:    andl $1, %edx
1538; AVX2-NEXT:    addq %rcx, %rdx
1539; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1540; AVX2-NEXT:    andl $31, %ecx
1541; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rcx)
1542; AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
1543; AVX2-NEXT:    andl $1, %ecx
1544; AVX2-NEXT:    addq %rdx, %rcx
1545; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1546; AVX2-NEXT:    andl $31, %edx
1547; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rdx)
1548; AVX2-NEXT:    vpextrb $11, %xmm1, %edx
1549; AVX2-NEXT:    andl $1, %edx
1550; AVX2-NEXT:    addq %rcx, %rdx
1551; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1552; AVX2-NEXT:    andl $31, %ecx
1553; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rcx)
1554; AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
1555; AVX2-NEXT:    andl $1, %ecx
1556; AVX2-NEXT:    addq %rdx, %rcx
1557; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1558; AVX2-NEXT:    andl $31, %edx
1559; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rdx)
1560; AVX2-NEXT:    vpextrb $13, %xmm1, %edx
1561; AVX2-NEXT:    andl $1, %edx
1562; AVX2-NEXT:    addq %rcx, %rdx
1563; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1564; AVX2-NEXT:    andl $31, %ecx
1565; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rcx)
1566; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
1567; AVX2-NEXT:    andl $1, %ecx
1568; AVX2-NEXT:    addq %rdx, %rcx
1569; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
1570; AVX2-NEXT:    andl $31, %edx
1571; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rdx)
1572; AVX2-NEXT:    vpextrb $15, %xmm1, %edx
1573; AVX2-NEXT:    andl $1, %edx
1574; AVX2-NEXT:    addq %rcx, %rdx
1575; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
1576; AVX2-NEXT:    andl $31, %ecx
1577; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rcx)
1578; AVX2-NEXT:    cmpq $31, %rdx
1579; AVX2-NEXT:    movl $31, %ecx
1580; AVX2-NEXT:    cmovbq %rdx, %rcx
1581; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
1582; AVX2-NEXT:    cmovbel %eax, %edx
1583; AVX2-NEXT:    movb %dl, (%rsp,%rcx)
1584; AVX2-NEXT:    vmovaps (%rsp), %ymm0
1585; AVX2-NEXT:    movq %rbp, %rsp
1586; AVX2-NEXT:    popq %rbp
1587; AVX2-NEXT:    retq
1588;
1589; AVX512F-LABEL: test_compress_v32i8:
1590; AVX512F:       # %bb.0:
1591; AVX512F-NEXT:    pushq %rbp
1592; AVX512F-NEXT:    movq %rsp, %rbp
1593; AVX512F-NEXT:    andq $-32, %rsp
1594; AVX512F-NEXT:    subq $64, %rsp
1595; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm3
1596; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
1597; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm3
1598; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
1599; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm3
1600; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm3
1601; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k2
1602; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1603; AVX512F-NEXT:    vpcompressd %zmm3, %zmm3 {%k2} {z}
1604; AVX512F-NEXT:    vpmovdb %zmm3, (%rsp)
1605; AVX512F-NEXT:    kshiftrw $8, %k2, %k0
1606; AVX512F-NEXT:    kxorw %k0, %k2, %k0
1607; AVX512F-NEXT:    kshiftrw $4, %k0, %k2
1608; AVX512F-NEXT:    kxorw %k2, %k0, %k0
1609; AVX512F-NEXT:    kshiftrw $2, %k0, %k2
1610; AVX512F-NEXT:    kxorw %k2, %k0, %k0
1611; AVX512F-NEXT:    kshiftrw $1, %k0, %k2
1612; AVX512F-NEXT:    kxorw %k2, %k0, %k0
1613; AVX512F-NEXT:    kmovw %k0, %eax
1614; AVX512F-NEXT:    andl $31, %eax
1615; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
1616; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1617; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
1618; AVX512F-NEXT:    vpmovdb %zmm0, (%rsp,%rax)
1619; AVX512F-NEXT:    vpsllw $7, %ymm1, %ymm0
1620; AVX512F-NEXT:    vpblendvb %ymm0, (%rsp), %ymm2, %ymm0
1621; AVX512F-NEXT:    movq %rbp, %rsp
1622; AVX512F-NEXT:    popq %rbp
1623; AVX512F-NEXT:    retq
1624;
1625; AVX512VL-LABEL: test_compress_v32i8:
1626; AVX512VL:       # %bb.0:
1627; AVX512VL-NEXT:    vpsllw $7, %ymm1, %ymm1
1628; AVX512VL-NEXT:    vpmovb2m %ymm1, %k1
1629; AVX512VL-NEXT:    vpcompressb %ymm0, %ymm2 {%k1}
1630; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
1631; AVX512VL-NEXT:    retq
1632    %out = call <32 x i8> @llvm.experimental.vector.compress(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> %passthru)
1633    ret <32 x i8> %out
1634}
1635
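; v16i16 on AVX2 goes through the same scalarized stack expansion, spilling
; the running indices once the sequence runs out of registers. AVX512F
; zero-extends to <16 x i32>, compresses with VPCOMPRESSD, and truncates back
; with VPMOVDW; AVX512VL compresses the ymm directly with VPCOMPRESSW.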
1636define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x i16> %passthru) nounwind {
1637; AVX2-LABEL: test_compress_v16i16:
1638; AVX2:       # %bb.0:
1639; AVX2-NEXT:    pushq %rbp
1640; AVX2-NEXT:    movq %rsp, %rbp
1641; AVX2-NEXT:    pushq %r15
1642; AVX2-NEXT:    pushq %r14
1643; AVX2-NEXT:    pushq %r13
1644; AVX2-NEXT:    pushq %r12
1645; AVX2-NEXT:    pushq %rbx
1646; AVX2-NEXT:    andq $-32, %rsp
1647; AVX2-NEXT:    subq $96, %rsp
1648; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1649; AVX2-NEXT:    vpsllw $15, %ymm1, %ymm3
1650; AVX2-NEXT:    vpsraw $15, %ymm3, %ymm1
1651; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
1652; AVX2-NEXT:    vpsrlw $15, %ymm3, %ymm2
1653; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
1654; AVX2-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
1655; AVX2-NEXT:    vpextrw $1, %xmm2, %eax
1656; AVX2-NEXT:    vmovd %xmm2, %ecx
1657; AVX2-NEXT:    addl %eax, %ecx
1658; AVX2-NEXT:    vpextrw $2, %xmm2, %eax
1659; AVX2-NEXT:    vpextrw $3, %xmm2, %edx
1660; AVX2-NEXT:    addl %eax, %edx
1661; AVX2-NEXT:    addl %ecx, %edx
1662; AVX2-NEXT:    vpextrw $4, %xmm2, %eax
1663; AVX2-NEXT:    vpextrw $5, %xmm2, %ecx
1664; AVX2-NEXT:    addl %eax, %ecx
1665; AVX2-NEXT:    vpextrw $6, %xmm2, %eax
1666; AVX2-NEXT:    addl %ecx, %eax
1667; AVX2-NEXT:    addl %edx, %eax
1668; AVX2-NEXT:    vpextrw $7, %xmm2, %ecx
1669; AVX2-NEXT:    addl %eax, %ecx
1670; AVX2-NEXT:    andl $15, %ecx
1671; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1672; AVX2-NEXT:    vpextrw $1, %xmm1, %eax
1673; AVX2-NEXT:    andl $1, %eax
1674; AVX2-NEXT:    vmovd %xmm1, %ecx
1675; AVX2-NEXT:    andl $1, %ecx
1676; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1677; AVX2-NEXT:    addq %rcx, %rax
1678; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1679; AVX2-NEXT:    vpextrw $2, %xmm1, %ecx
1680; AVX2-NEXT:    andl $1, %ecx
1681; AVX2-NEXT:    addq %rax, %rcx
1682; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1683; AVX2-NEXT:    vpextrw $3, %xmm1, %eax
1684; AVX2-NEXT:    andl $1, %eax
1685; AVX2-NEXT:    addq %rcx, %rax
1686; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1687; AVX2-NEXT:    vpextrw $4, %xmm1, %r8d
1688; AVX2-NEXT:    andl $1, %r8d
1689; AVX2-NEXT:    addq %rax, %r8
1690; AVX2-NEXT:    vpextrw $5, %xmm1, %r9d
1691; AVX2-NEXT:    andl $1, %r9d
1692; AVX2-NEXT:    addq %r8, %r9
1693; AVX2-NEXT:    vpextrw $6, %xmm1, %r10d
1694; AVX2-NEXT:    andl $1, %r10d
1695; AVX2-NEXT:    addq %r9, %r10
1696; AVX2-NEXT:    vpextrw $7, %xmm1, %r11d
1697; AVX2-NEXT:    andl $1, %r11d
1698; AVX2-NEXT:    addq %r10, %r11
1699; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1700; AVX2-NEXT:    vmovd %xmm1, %ebx
1701; AVX2-NEXT:    andl $1, %ebx
1702; AVX2-NEXT:    addq %r11, %rbx
1703; AVX2-NEXT:    vpextrw $1, %xmm1, %r14d
1704; AVX2-NEXT:    andl $1, %r14d
1705; AVX2-NEXT:    addq %rbx, %r14
1706; AVX2-NEXT:    vpextrw $2, %xmm1, %r15d
1707; AVX2-NEXT:    andl $1, %r15d
1708; AVX2-NEXT:    addq %r14, %r15
1709; AVX2-NEXT:    vpextrw $3, %xmm1, %r12d
1710; AVX2-NEXT:    andl $1, %r12d
1711; AVX2-NEXT:    addq %r15, %r12
1712; AVX2-NEXT:    vpextrw $4, %xmm1, %r13d
1713; AVX2-NEXT:    andl $1, %r13d
1714; AVX2-NEXT:    addq %r12, %r13
1715; AVX2-NEXT:    vpextrw $5, %xmm1, %edx
1716; AVX2-NEXT:    andl $1, %edx
1717; AVX2-NEXT:    addq %r13, %rdx
1718; AVX2-NEXT:    vpextrw $6, %xmm1, %ecx
1719; AVX2-NEXT:    andl $1, %ecx
1720; AVX2-NEXT:    addq %rdx, %rcx
1721; AVX2-NEXT:    vpextrw $7, %xmm1, %edi
1722; AVX2-NEXT:    andl $1, %edi
1723; AVX2-NEXT:    addq %rcx, %rdi
1724; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1725; AVX2-NEXT:    cmpq $16, %rdi
1726; AVX2-NEXT:    vpextrw $7, %xmm1, %eax
1727; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
1728; AVX2-NEXT:    cmovbw (%rsp,%rsi,2), %ax
1729; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1730; AVX2-NEXT:    vpextrw $0, %xmm0, (%rsp)
1731; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
1732; AVX2-NEXT:    vpextrw $1, %xmm0, (%rsp,%rsi,2)
1733; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
1734; AVX2-NEXT:    vpextrw $2, %xmm0, (%rsp,%rsi,2)
1735; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
1736; AVX2-NEXT:    vpextrw $3, %xmm0, (%rsp,%rsi,2)
1737; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1738; AVX2-NEXT:    vpextrw $4, %xmm0, (%rsp,%rax,2)
1739; AVX2-NEXT:    andl $15, %r8d
1740; AVX2-NEXT:    vpextrw $5, %xmm0, (%rsp,%r8,2)
1741; AVX2-NEXT:    andl $15, %r9d
1742; AVX2-NEXT:    vpextrw $6, %xmm0, (%rsp,%r9,2)
1743; AVX2-NEXT:    andl $15, %r10d
1744; AVX2-NEXT:    vpextrw $7, %xmm0, (%rsp,%r10,2)
1745; AVX2-NEXT:    andl $15, %r11d
1746; AVX2-NEXT:    vpextrw $0, %xmm1, (%rsp,%r11,2)
1747; AVX2-NEXT:    andl $15, %ebx
1748; AVX2-NEXT:    vpextrw $1, %xmm1, (%rsp,%rbx,2)
1749; AVX2-NEXT:    andl $15, %r14d
1750; AVX2-NEXT:    vpextrw $2, %xmm1, (%rsp,%r14,2)
1751; AVX2-NEXT:    andl $15, %r15d
1752; AVX2-NEXT:    vpextrw $3, %xmm1, (%rsp,%r15,2)
1753; AVX2-NEXT:    andl $15, %r12d
1754; AVX2-NEXT:    vpextrw $4, %xmm1, (%rsp,%r12,2)
1755; AVX2-NEXT:    andl $15, %r13d
1756; AVX2-NEXT:    vpextrw $5, %xmm1, (%rsp,%r13,2)
1757; AVX2-NEXT:    andl $15, %edx
1758; AVX2-NEXT:    vpextrw $6, %xmm1, (%rsp,%rdx,2)
1759; AVX2-NEXT:    andl $15, %ecx
1760; AVX2-NEXT:    vpextrw $7, %xmm1, (%rsp,%rcx,2)
1761; AVX2-NEXT:    cmpq $15, %rdi
1762; AVX2-NEXT:    movl $15, %eax
1763; AVX2-NEXT:    cmovbq %rdi, %rax
1764; AVX2-NEXT:    movl %eax, %eax
1765; AVX2-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
1766; AVX2-NEXT:    movw %cx, (%rsp,%rax,2)
1767; AVX2-NEXT:    vmovaps (%rsp), %ymm0
1768; AVX2-NEXT:    leaq -40(%rbp), %rsp
1769; AVX2-NEXT:    popq %rbx
1770; AVX2-NEXT:    popq %r12
1771; AVX2-NEXT:    popq %r13
1772; AVX2-NEXT:    popq %r14
1773; AVX2-NEXT:    popq %r15
1774; AVX2-NEXT:    popq %rbp
1775; AVX2-NEXT:    retq
1776;
1777; AVX512F-LABEL: test_compress_v16i16:
1778; AVX512F:       # %bb.0:
1779; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
1780; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
1781; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
1782; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1783; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1784; AVX512F-NEXT:    vpcompressd %zmm0, %zmm1 {%k1}
1785; AVX512F-NEXT:    vpmovdw %zmm1, %ymm0
1786; AVX512F-NEXT:    retq
1787;
1788; AVX512VL-LABEL: test_compress_v16i16:
1789; AVX512VL:       # %bb.0:
1790; AVX512VL-NEXT:    vpsllw $7, %xmm1, %xmm1
1791; AVX512VL-NEXT:    vpmovb2m %xmm1, %k1
1792; AVX512VL-NEXT:    vpcompressw %ymm0, %ymm2 {%k1}
1793; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
1794; AVX512VL-NEXT:    retq
1795    %out = call <16 x i16> @llvm.experimental.vector.compress(<16 x i16> %vec, <16 x i1> %mask, <16 x i16> %passthru)
1796    ret <16 x i16> %out
1797}
1798
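; v64i8 passes the <64 x i1> mask partly in GPRs and partly on the stack, so
; the AVX2 path first rebuilds the mask vector with a long VPINSRB chain
; before running the scalarized stack expansion. AVX512F (no VPCOMPRESSB)
; instead assembles 16-bit k-register masks bit by bit from the incoming
; arguments with KSHIFT/KOR/KAND.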
1799define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> %passthru) nounwind {
1800; AVX2-LABEL: test_compress_v64i8:
1801; AVX2:       # %bb.0:
1802; AVX2-NEXT:    pushq %rbp
1803; AVX2-NEXT:    movq %rsp, %rbp
1804; AVX2-NEXT:    pushq %r15
1805; AVX2-NEXT:    pushq %r14
1806; AVX2-NEXT:    pushq %r13
1807; AVX2-NEXT:    pushq %r12
1808; AVX2-NEXT:    pushq %rbx
1809; AVX2-NEXT:    andq $-32, %rsp
1810; AVX2-NEXT:    subq $128, %rsp
1811; AVX2-NEXT:    # kill: def $r9d killed $r9d def $r9
1812; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1813; AVX2-NEXT:    # kill: def $r8d killed $r8d def $r8
1814; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1815; AVX2-NEXT:    movl %ecx, %r13d
1816; AVX2-NEXT:    movl %edx, %r15d
1817; AVX2-NEXT:    movl %esi, %ebx
1818; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
1819; AVX2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1820; AVX2-NEXT:    movl 360(%rbp), %eax
1821; AVX2-NEXT:    movl 352(%rbp), %ecx
1822; AVX2-NEXT:    vmovd %ecx, %xmm4
1823; AVX2-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
1824; AVX2-NEXT:    movl 368(%rbp), %eax
1825; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
1826; AVX2-NEXT:    movl 376(%rbp), %eax
1827; AVX2-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
1828; AVX2-NEXT:    movl 384(%rbp), %eax
1829; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
1830; AVX2-NEXT:    movl 392(%rbp), %eax
1831; AVX2-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
1832; AVX2-NEXT:    movl 400(%rbp), %eax
1833; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
1834; AVX2-NEXT:    movl 408(%rbp), %eax
1835; AVX2-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
1836; AVX2-NEXT:    movl 416(%rbp), %eax
1837; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
1838; AVX2-NEXT:    movl 424(%rbp), %eax
1839; AVX2-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
1840; AVX2-NEXT:    movl 432(%rbp), %eax
1841; AVX2-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
1842; AVX2-NEXT:    movl 440(%rbp), %eax
1843; AVX2-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
1844; AVX2-NEXT:    movl 448(%rbp), %eax
1845; AVX2-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
1846; AVX2-NEXT:    movl 456(%rbp), %eax
1847; AVX2-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
1848; AVX2-NEXT:    movl 464(%rbp), %eax
1849; AVX2-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
1850; AVX2-NEXT:    movl 472(%rbp), %eax
1851; AVX2-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
1852; AVX2-NEXT:    movl 224(%rbp), %eax
1853; AVX2-NEXT:    vmovd %eax, %xmm5
1854; AVX2-NEXT:    movl 232(%rbp), %eax
1855; AVX2-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
1856; AVX2-NEXT:    movl 240(%rbp), %eax
1857; AVX2-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
1858; AVX2-NEXT:    movl 248(%rbp), %eax
1859; AVX2-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
1860; AVX2-NEXT:    movl 256(%rbp), %eax
1861; AVX2-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
1862; AVX2-NEXT:    movl 264(%rbp), %eax
1863; AVX2-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
1864; AVX2-NEXT:    movl 272(%rbp), %eax
1865; AVX2-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
1866; AVX2-NEXT:    movl 280(%rbp), %eax
1867; AVX2-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
1868; AVX2-NEXT:    movl 288(%rbp), %eax
1869; AVX2-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
1870; AVX2-NEXT:    movl 296(%rbp), %eax
1871; AVX2-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
1872; AVX2-NEXT:    movl 304(%rbp), %eax
1873; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
1874; AVX2-NEXT:    movl 312(%rbp), %eax
1875; AVX2-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
1876; AVX2-NEXT:    movl 320(%rbp), %eax
1877; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
1878; AVX2-NEXT:    movl 328(%rbp), %eax
1879; AVX2-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
1880; AVX2-NEXT:    movl 336(%rbp), %eax
1881; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
1882; AVX2-NEXT:    movl 344(%rbp), %eax
1883; AVX2-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
1884; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
1885; AVX2-NEXT:    movl 96(%rbp), %eax
1886; AVX2-NEXT:    vmovd %eax, %xmm5
1887; AVX2-NEXT:    movl 104(%rbp), %eax
1888; AVX2-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
1889; AVX2-NEXT:    movl 112(%rbp), %eax
1890; AVX2-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
1891; AVX2-NEXT:    movl 120(%rbp), %eax
1892; AVX2-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
1893; AVX2-NEXT:    movl 128(%rbp), %eax
1894; AVX2-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
1895; AVX2-NEXT:    movl 136(%rbp), %eax
1896; AVX2-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
1897; AVX2-NEXT:    movl 144(%rbp), %eax
1898; AVX2-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
1899; AVX2-NEXT:    movl 152(%rbp), %eax
1900; AVX2-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
1901; AVX2-NEXT:    movl 160(%rbp), %eax
1902; AVX2-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
1903; AVX2-NEXT:    movl 168(%rbp), %eax
1904; AVX2-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
1905; AVX2-NEXT:    movl 176(%rbp), %eax
1906; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
1907; AVX2-NEXT:    movl 184(%rbp), %eax
1908; AVX2-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
1909; AVX2-NEXT:    movl 192(%rbp), %eax
1910; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
1911; AVX2-NEXT:    movl 200(%rbp), %eax
1912; AVX2-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
1913; AVX2-NEXT:    movl 208(%rbp), %eax
1914; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
1915; AVX2-NEXT:    movl 216(%rbp), %eax
1916; AVX2-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
1917; AVX2-NEXT:    vmovd %edi, %xmm6
1918; AVX2-NEXT:    vpinsrb $1, %esi, %xmm6, %xmm6
1919; AVX2-NEXT:    vpinsrb $2, %edx, %xmm6, %xmm6
1920; AVX2-NEXT:    vpinsrb $3, %r13d, %xmm6, %xmm6
1921; AVX2-NEXT:    vpinsrb $4, %r8d, %xmm6, %xmm6
1922; AVX2-NEXT:    vpinsrb $5, %r9d, %xmm6, %xmm6
1923; AVX2-NEXT:    movl 16(%rbp), %esi
1924; AVX2-NEXT:    vpinsrb $6, %esi, %xmm6, %xmm6
1925; AVX2-NEXT:    movl 24(%rbp), %edi
1926; AVX2-NEXT:    vpinsrb $7, %edi, %xmm6, %xmm6
1927; AVX2-NEXT:    movl 32(%rbp), %r8d
1928; AVX2-NEXT:    vpinsrb $8, %r8d, %xmm6, %xmm6
1929; AVX2-NEXT:    movl 40(%rbp), %r9d
1930; AVX2-NEXT:    vpinsrb $9, %r9d, %xmm6, %xmm6
1931; AVX2-NEXT:    movl 48(%rbp), %r10d
1932; AVX2-NEXT:    vpinsrb $10, %r10d, %xmm6, %xmm6
1933; AVX2-NEXT:    movl 56(%rbp), %r11d
1934; AVX2-NEXT:    vpinsrb $11, %r11d, %xmm6, %xmm6
1935; AVX2-NEXT:    movl 64(%rbp), %r14d
1936; AVX2-NEXT:    vpinsrb $12, %r14d, %xmm6, %xmm6
1937; AVX2-NEXT:    movl 72(%rbp), %r12d
1938; AVX2-NEXT:    vpinsrb $13, %r12d, %xmm6, %xmm6
1939; AVX2-NEXT:    movl 80(%rbp), %eax
1940; AVX2-NEXT:    vpinsrb $14, %eax, %xmm6, %xmm6
1941; AVX2-NEXT:    movl 88(%rbp), %eax
1942; AVX2-NEXT:    vpinsrb $15, %eax, %xmm6, %xmm6
1943; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm5
1944; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1945; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
1946; AVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
1947; AVX2-NEXT:    vpaddb %ymm4, %ymm5, %ymm4
1948; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
1949; AVX2-NEXT:    vpaddb %xmm5, %xmm4, %xmm4
1950; AVX2-NEXT:    vpextrb $1, %xmm4, %eax
1951; AVX2-NEXT:    vmovd %xmm4, %ecx
1952; AVX2-NEXT:    addb %al, %cl
1953; AVX2-NEXT:    vpextrb $2, %xmm4, %edx
1954; AVX2-NEXT:    vpextrb $3, %xmm4, %eax
1955; AVX2-NEXT:    addb %dl, %al
1956; AVX2-NEXT:    addb %cl, %al
1957; AVX2-NEXT:    vpextrb $4, %xmm4, %ecx
1958; AVX2-NEXT:    vpextrb $5, %xmm4, %edx
1959; AVX2-NEXT:    addb %cl, %dl
1960; AVX2-NEXT:    vpextrb $6, %xmm4, %ecx
1961; AVX2-NEXT:    addb %dl, %cl
1962; AVX2-NEXT:    addb %al, %cl
1963; AVX2-NEXT:    vpextrb $7, %xmm4, %eax
1964; AVX2-NEXT:    vpextrb $8, %xmm4, %edx
1965; AVX2-NEXT:    addb %al, %dl
1966; AVX2-NEXT:    vpextrb $9, %xmm4, %eax
1967; AVX2-NEXT:    addb %dl, %al
1968; AVX2-NEXT:    vpextrb $10, %xmm4, %edx
1969; AVX2-NEXT:    addb %al, %dl
1970; AVX2-NEXT:    addb %cl, %dl
1971; AVX2-NEXT:    vpextrb $11, %xmm4, %eax
1972; AVX2-NEXT:    vpextrb $12, %xmm4, %ecx
1973; AVX2-NEXT:    addb %al, %cl
1974; AVX2-NEXT:    vpextrb $13, %xmm4, %eax
1975; AVX2-NEXT:    addb %cl, %al
1976; AVX2-NEXT:    vpextrb $14, %xmm4, %ecx
1977; AVX2-NEXT:    addb %al, %cl
1978; AVX2-NEXT:    vpextrb $15, %xmm4, %eax
1979; AVX2-NEXT:    addb %cl, %al
1980; AVX2-NEXT:    addb %dl, %al
1981; AVX2-NEXT:    vmovaps %ymm3, {{[0-9]+}}(%rsp)
1982; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
1983; AVX2-NEXT:    movzbl %al, %eax
1984; AVX2-NEXT:    andl $63, %eax
1985; AVX2-NEXT:    movzbl (%rsp,%rax), %eax
1986; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1987; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp)
1988; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1989; AVX2-NEXT:    andl $1, %eax
1990; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
1991; AVX2-NEXT:    andl $1, %ebx
1992; AVX2-NEXT:    addq %rax, %rbx
1993; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rbx)
1994; AVX2-NEXT:    andl $1, %r15d
1995; AVX2-NEXT:    addq %rbx, %r15
1996; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%r15)
1997; AVX2-NEXT:    andl $1, %r13d
1998; AVX2-NEXT:    addq %r15, %r13
1999; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%r13)
2000; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2001; AVX2-NEXT:    andl $1, %ecx
2002; AVX2-NEXT:    addq %r13, %rcx
2003; AVX2-NEXT:    movl %ecx, %eax
2004; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
2005; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2006; AVX2-NEXT:    andl $1, %eax
2007; AVX2-NEXT:    addq %rcx, %rax
2008; AVX2-NEXT:    andl $1, %esi
2009; AVX2-NEXT:    addq %rax, %rsi
2010; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
2011; AVX2-NEXT:    andl $63, %eax
2012; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
2013; AVX2-NEXT:    andl $1, %edi
2014; AVX2-NEXT:    addq %rsi, %rdi
2015; AVX2-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
2016; AVX2-NEXT:    andl $63, %esi
2017; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rsi)
2018; AVX2-NEXT:    andl $1, %r8d
2019; AVX2-NEXT:    addq %rdi, %r8
2020; AVX2-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
2021; AVX2-NEXT:    andl $63, %edi
2022; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rdi)
2023; AVX2-NEXT:    andl $1, %r9d
2024; AVX2-NEXT:    addq %r8, %r9
2025; AVX2-NEXT:    # kill: def $r8d killed $r8d killed $r8 def $r8
2026; AVX2-NEXT:    andl $63, %r8d
2027; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%r8)
2028; AVX2-NEXT:    andl $1, %r10d
2029; AVX2-NEXT:    addq %r9, %r10
2030; AVX2-NEXT:    # kill: def $r9d killed $r9d killed $r9 def $r9
2031; AVX2-NEXT:    andl $63, %r9d
2032; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%r9)
2033; AVX2-NEXT:    andl $1, %r11d
2034; AVX2-NEXT:    addq %r10, %r11
2035; AVX2-NEXT:    # kill: def $r10d killed $r10d killed $r10 def $r10
2036; AVX2-NEXT:    andl $63, %r10d
2037; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%r10)
2038; AVX2-NEXT:    andl $1, %r14d
2039; AVX2-NEXT:    addq %r11, %r14
2040; AVX2-NEXT:    # kill: def $r11d killed $r11d killed $r11 def $r11
2041; AVX2-NEXT:    andl $63, %r11d
2042; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%r11)
2043; AVX2-NEXT:    andl $1, %r12d
2044; AVX2-NEXT:    addq %r14, %r12
2045; AVX2-NEXT:    # kill: def $r14d killed $r14d killed $r14 def $r14
2046; AVX2-NEXT:    andl $63, %r14d
2047; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%r14)
2048; AVX2-NEXT:    movl 80(%rbp), %eax
2049; AVX2-NEXT:    andl $1, %eax
2050; AVX2-NEXT:    addq %r12, %rax
2051; AVX2-NEXT:    # kill: def $r12d killed $r12d killed $r12 def $r12
2052; AVX2-NEXT:    andl $63, %r12d
2053; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%r12)
2054; AVX2-NEXT:    movl 88(%rbp), %ecx
2055; AVX2-NEXT:    andl $1, %ecx
2056; AVX2-NEXT:    addq %rax, %rcx
2057; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
2058; AVX2-NEXT:    andl $63, %eax
2059; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
2060; AVX2-NEXT:    movl 96(%rbp), %edx
2061; AVX2-NEXT:    andl $1, %edx
2062; AVX2-NEXT:    addq %rcx, %rdx
2063; AVX2-NEXT:    movl %ecx, %eax
2064; AVX2-NEXT:    andl $63, %eax
2065; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
2066; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rax)
2067; AVX2-NEXT:    movl 104(%rbp), %ecx
2068; AVX2-NEXT:    andl $1, %ecx
2069; AVX2-NEXT:    addq %rdx, %rcx
2070; AVX2-NEXT:    movl %edx, %eax
2071; AVX2-NEXT:    andl $63, %eax
2072; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
2073; AVX2-NEXT:    movl 112(%rbp), %edx
2074; AVX2-NEXT:    andl $1, %edx
2075; AVX2-NEXT:    addq %rcx, %rdx
2076; AVX2-NEXT:    movl %ecx, %eax
2077; AVX2-NEXT:    andl $63, %eax
2078; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rax)
2079; AVX2-NEXT:    movl 120(%rbp), %ecx
2080; AVX2-NEXT:    andl $1, %ecx
2081; AVX2-NEXT:    addq %rdx, %rcx
2082; AVX2-NEXT:    movl %edx, %eax
2083; AVX2-NEXT:    andl $63, %eax
2084; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rax)
2085; AVX2-NEXT:    movl 128(%rbp), %edx
2086; AVX2-NEXT:    andl $1, %edx
2087; AVX2-NEXT:    addq %rcx, %rdx
2088; AVX2-NEXT:    movl %ecx, %eax
2089; AVX2-NEXT:    andl $63, %eax
2090; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rax)
2091; AVX2-NEXT:    movl 136(%rbp), %ecx
2092; AVX2-NEXT:    andl $1, %ecx
2093; AVX2-NEXT:    addq %rdx, %rcx
2094; AVX2-NEXT:    movl %edx, %eax
2095; AVX2-NEXT:    andl $63, %eax
2096; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
2097; AVX2-NEXT:    movl 144(%rbp), %edx
2098; AVX2-NEXT:    andl $1, %edx
2099; AVX2-NEXT:    addq %rcx, %rdx
2100; AVX2-NEXT:    movl %ecx, %eax
2101; AVX2-NEXT:    andl $63, %eax
2102; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
2103; AVX2-NEXT:    movl 152(%rbp), %ecx
2104; AVX2-NEXT:    andl $1, %ecx
2105; AVX2-NEXT:    addq %rdx, %rcx
2106; AVX2-NEXT:    movl %edx, %eax
2107; AVX2-NEXT:    andl $63, %eax
2108; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
2109; AVX2-NEXT:    movl 160(%rbp), %edx
2110; AVX2-NEXT:    andl $1, %edx
2111; AVX2-NEXT:    addq %rcx, %rdx
2112; AVX2-NEXT:    movl %ecx, %eax
2113; AVX2-NEXT:    andl $63, %eax
2114; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rax)
2115; AVX2-NEXT:    movl 168(%rbp), %ecx
2116; AVX2-NEXT:    andl $1, %ecx
2117; AVX2-NEXT:    addq %rdx, %rcx
2118; AVX2-NEXT:    movl %edx, %eax
2119; AVX2-NEXT:    andl $63, %eax
2120; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
2121; AVX2-NEXT:    movl 176(%rbp), %edx
2122; AVX2-NEXT:    andl $1, %edx
2123; AVX2-NEXT:    addq %rcx, %rdx
2124; AVX2-NEXT:    movl %ecx, %eax
2125; AVX2-NEXT:    andl $63, %eax
2126; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rax)
2127; AVX2-NEXT:    movl 184(%rbp), %ecx
2128; AVX2-NEXT:    andl $1, %ecx
2129; AVX2-NEXT:    addq %rdx, %rcx
2130; AVX2-NEXT:    movl %edx, %eax
2131; AVX2-NEXT:    andl $63, %eax
2132; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
2133; AVX2-NEXT:    movl 192(%rbp), %edx
2134; AVX2-NEXT:    andl $1, %edx
2135; AVX2-NEXT:    addq %rcx, %rdx
2136; AVX2-NEXT:    movl %ecx, %eax
2137; AVX2-NEXT:    andl $63, %eax
2138; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rax)
2139; AVX2-NEXT:    movl 200(%rbp), %ecx
2140; AVX2-NEXT:    andl $1, %ecx
2141; AVX2-NEXT:    addq %rdx, %rcx
2142; AVX2-NEXT:    movl %edx, %eax
2143; AVX2-NEXT:    andl $63, %eax
2144; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
2145; AVX2-NEXT:    movl 208(%rbp), %edx
2146; AVX2-NEXT:    andl $1, %edx
2147; AVX2-NEXT:    addq %rcx, %rdx
2148; AVX2-NEXT:    movl %ecx, %eax
2149; AVX2-NEXT:    andl $63, %eax
2150; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rax)
2151; AVX2-NEXT:    movl 216(%rbp), %ecx
2152; AVX2-NEXT:    andl $1, %ecx
2153; AVX2-NEXT:    addq %rdx, %rcx
2154; AVX2-NEXT:    movl %edx, %eax
2155; AVX2-NEXT:    andl $63, %eax
2156; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
2157; AVX2-NEXT:    movl 224(%rbp), %edx
2158; AVX2-NEXT:    andl $1, %edx
2159; AVX2-NEXT:    addq %rcx, %rdx
2160; AVX2-NEXT:    movl %ecx, %eax
2161; AVX2-NEXT:    andl $63, %eax
2162; AVX2-NEXT:    vpextrb $0, %xmm1, (%rsp,%rax)
2163; AVX2-NEXT:    movl 232(%rbp), %ecx
2164; AVX2-NEXT:    andl $1, %ecx
2165; AVX2-NEXT:    addq %rdx, %rcx
2166; AVX2-NEXT:    movl %edx, %eax
2167; AVX2-NEXT:    andl $63, %eax
2168; AVX2-NEXT:    vpextrb $1, %xmm1, (%rsp,%rax)
2169; AVX2-NEXT:    movl 240(%rbp), %edx
2170; AVX2-NEXT:    andl $1, %edx
2171; AVX2-NEXT:    addq %rcx, %rdx
2172; AVX2-NEXT:    movl %ecx, %eax
2173; AVX2-NEXT:    andl $63, %eax
2174; AVX2-NEXT:    vpextrb $2, %xmm1, (%rsp,%rax)
2175; AVX2-NEXT:    movl 248(%rbp), %ecx
2176; AVX2-NEXT:    andl $1, %ecx
2177; AVX2-NEXT:    addq %rdx, %rcx
2178; AVX2-NEXT:    movl %edx, %eax
2179; AVX2-NEXT:    andl $63, %eax
2180; AVX2-NEXT:    vpextrb $3, %xmm1, (%rsp,%rax)
2181; AVX2-NEXT:    movl 256(%rbp), %edx
2182; AVX2-NEXT:    andl $1, %edx
2183; AVX2-NEXT:    addq %rcx, %rdx
2184; AVX2-NEXT:    movl %ecx, %eax
2185; AVX2-NEXT:    andl $63, %eax
2186; AVX2-NEXT:    vpextrb $4, %xmm1, (%rsp,%rax)
2187; AVX2-NEXT:    movl 264(%rbp), %ecx
2188; AVX2-NEXT:    andl $1, %ecx
2189; AVX2-NEXT:    addq %rdx, %rcx
2190; AVX2-NEXT:    movl %edx, %eax
2191; AVX2-NEXT:    andl $63, %eax
2192; AVX2-NEXT:    vpextrb $5, %xmm1, (%rsp,%rax)
2193; AVX2-NEXT:    movl 272(%rbp), %edx
2194; AVX2-NEXT:    andl $1, %edx
2195; AVX2-NEXT:    addq %rcx, %rdx
2196; AVX2-NEXT:    movl %ecx, %eax
2197; AVX2-NEXT:    andl $63, %eax
2198; AVX2-NEXT:    vpextrb $6, %xmm1, (%rsp,%rax)
2199; AVX2-NEXT:    movl 280(%rbp), %ecx
2200; AVX2-NEXT:    andl $1, %ecx
2201; AVX2-NEXT:    addq %rdx, %rcx
2202; AVX2-NEXT:    movl %edx, %eax
2203; AVX2-NEXT:    andl $63, %eax
2204; AVX2-NEXT:    vpextrb $7, %xmm1, (%rsp,%rax)
2205; AVX2-NEXT:    movl 288(%rbp), %edx
2206; AVX2-NEXT:    andl $1, %edx
2207; AVX2-NEXT:    addq %rcx, %rdx
2208; AVX2-NEXT:    movl %ecx, %eax
2209; AVX2-NEXT:    andl $63, %eax
2210; AVX2-NEXT:    vpextrb $8, %xmm1, (%rsp,%rax)
2211; AVX2-NEXT:    movl 296(%rbp), %ecx
2212; AVX2-NEXT:    andl $1, %ecx
2213; AVX2-NEXT:    addq %rdx, %rcx
2214; AVX2-NEXT:    movl %edx, %eax
2215; AVX2-NEXT:    andl $63, %eax
2216; AVX2-NEXT:    vpextrb $9, %xmm1, (%rsp,%rax)
2217; AVX2-NEXT:    movl 304(%rbp), %edx
2218; AVX2-NEXT:    andl $1, %edx
2219; AVX2-NEXT:    addq %rcx, %rdx
2220; AVX2-NEXT:    movl %ecx, %eax
2221; AVX2-NEXT:    andl $63, %eax
2222; AVX2-NEXT:    vpextrb $10, %xmm1, (%rsp,%rax)
2223; AVX2-NEXT:    movl 312(%rbp), %ecx
2224; AVX2-NEXT:    andl $1, %ecx
2225; AVX2-NEXT:    addq %rdx, %rcx
2226; AVX2-NEXT:    movl %edx, %eax
2227; AVX2-NEXT:    andl $63, %eax
2228; AVX2-NEXT:    vpextrb $11, %xmm1, (%rsp,%rax)
2229; AVX2-NEXT:    movl 320(%rbp), %edx
2230; AVX2-NEXT:    andl $1, %edx
2231; AVX2-NEXT:    addq %rcx, %rdx
2232; AVX2-NEXT:    movl %ecx, %eax
2233; AVX2-NEXT:    andl $63, %eax
2234; AVX2-NEXT:    vpextrb $12, %xmm1, (%rsp,%rax)
2235; AVX2-NEXT:    movl 328(%rbp), %ecx
2236; AVX2-NEXT:    andl $1, %ecx
2237; AVX2-NEXT:    addq %rdx, %rcx
2238; AVX2-NEXT:    movl %edx, %eax
2239; AVX2-NEXT:    andl $63, %eax
2240; AVX2-NEXT:    vpextrb $13, %xmm1, (%rsp,%rax)
2241; AVX2-NEXT:    movl 336(%rbp), %edx
2242; AVX2-NEXT:    andl $1, %edx
2243; AVX2-NEXT:    addq %rcx, %rdx
2244; AVX2-NEXT:    movl %ecx, %eax
2245; AVX2-NEXT:    andl $63, %eax
2246; AVX2-NEXT:    vpextrb $14, %xmm1, (%rsp,%rax)
2247; AVX2-NEXT:    movl 344(%rbp), %ecx
2248; AVX2-NEXT:    andl $1, %ecx
2249; AVX2-NEXT:    addq %rdx, %rcx
2250; AVX2-NEXT:    movl %edx, %eax
2251; AVX2-NEXT:    andl $63, %eax
2252; AVX2-NEXT:    vpextrb $15, %xmm1, (%rsp,%rax)
2253; AVX2-NEXT:    movl 352(%rbp), %edx
2254; AVX2-NEXT:    andl $1, %edx
2255; AVX2-NEXT:    addq %rcx, %rdx
2256; AVX2-NEXT:    movl %ecx, %eax
2257; AVX2-NEXT:    andl $63, %eax
2258; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
2259; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rax)
2260; AVX2-NEXT:    movl 360(%rbp), %ecx
2261; AVX2-NEXT:    andl $1, %ecx
2262; AVX2-NEXT:    addq %rdx, %rcx
2263; AVX2-NEXT:    movl %edx, %eax
2264; AVX2-NEXT:    andl $63, %eax
2265; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
2266; AVX2-NEXT:    movl 368(%rbp), %edx
2267; AVX2-NEXT:    andl $1, %edx
2268; AVX2-NEXT:    addq %rcx, %rdx
2269; AVX2-NEXT:    movl %ecx, %eax
2270; AVX2-NEXT:    andl $63, %eax
2271; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rax)
2272; AVX2-NEXT:    movl 376(%rbp), %ecx
2273; AVX2-NEXT:    andl $1, %ecx
2274; AVX2-NEXT:    addq %rdx, %rcx
2275; AVX2-NEXT:    movl %edx, %eax
2276; AVX2-NEXT:    andl $63, %eax
2277; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rax)
2278; AVX2-NEXT:    movl 384(%rbp), %edx
2279; AVX2-NEXT:    andl $1, %edx
2280; AVX2-NEXT:    addq %rcx, %rdx
2281; AVX2-NEXT:    movl %ecx, %eax
2282; AVX2-NEXT:    andl $63, %eax
2283; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rax)
2284; AVX2-NEXT:    movl 392(%rbp), %ecx
2285; AVX2-NEXT:    andl $1, %ecx
2286; AVX2-NEXT:    addq %rdx, %rcx
2287; AVX2-NEXT:    movl %edx, %eax
2288; AVX2-NEXT:    andl $63, %eax
2289; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
2290; AVX2-NEXT:    movl 400(%rbp), %edx
2291; AVX2-NEXT:    andl $1, %edx
2292; AVX2-NEXT:    addq %rcx, %rdx
2293; AVX2-NEXT:    movl %ecx, %eax
2294; AVX2-NEXT:    andl $63, %eax
2295; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
2296; AVX2-NEXT:    movl 408(%rbp), %ecx
2297; AVX2-NEXT:    andl $1, %ecx
2298; AVX2-NEXT:    addq %rdx, %rcx
2299; AVX2-NEXT:    movl %edx, %eax
2300; AVX2-NEXT:    andl $63, %eax
2301; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
2302; AVX2-NEXT:    movl 416(%rbp), %edx
2303; AVX2-NEXT:    andl $1, %edx
2304; AVX2-NEXT:    addq %rcx, %rdx
2305; AVX2-NEXT:    movl %ecx, %eax
2306; AVX2-NEXT:    andl $63, %eax
2307; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rax)
2308; AVX2-NEXT:    movl 424(%rbp), %ecx
2309; AVX2-NEXT:    andl $1, %ecx
2310; AVX2-NEXT:    addq %rdx, %rcx
2311; AVX2-NEXT:    movl %edx, %eax
2312; AVX2-NEXT:    andl $63, %eax
2313; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
2314; AVX2-NEXT:    movl 432(%rbp), %edx
2315; AVX2-NEXT:    andl $1, %edx
2316; AVX2-NEXT:    addq %rcx, %rdx
2317; AVX2-NEXT:    movl %ecx, %eax
2318; AVX2-NEXT:    andl $63, %eax
2319; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rax)
2320; AVX2-NEXT:    movl 440(%rbp), %ecx
2321; AVX2-NEXT:    andl $1, %ecx
2322; AVX2-NEXT:    addq %rdx, %rcx
2323; AVX2-NEXT:    movl %edx, %eax
2324; AVX2-NEXT:    andl $63, %eax
2325; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
2326; AVX2-NEXT:    movl 448(%rbp), %edx
2327; AVX2-NEXT:    andl $1, %edx
2328; AVX2-NEXT:    addq %rcx, %rdx
2329; AVX2-NEXT:    movl %ecx, %eax
2330; AVX2-NEXT:    andl $63, %eax
2331; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rax)
2332; AVX2-NEXT:    movl 456(%rbp), %ecx
2333; AVX2-NEXT:    andl $1, %ecx
2334; AVX2-NEXT:    addq %rdx, %rcx
2335; AVX2-NEXT:    movl %edx, %eax
2336; AVX2-NEXT:    andl $63, %eax
2337; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
2338; AVX2-NEXT:    movl 464(%rbp), %edx
2339; AVX2-NEXT:    andl $1, %edx
2340; AVX2-NEXT:    addq %rcx, %rdx
2341; AVX2-NEXT:    movl %ecx, %eax
2342; AVX2-NEXT:    andl $63, %eax
2343; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rax)
2344; AVX2-NEXT:    movl 472(%rbp), %ecx
2345; AVX2-NEXT:    andl $1, %ecx
2346; AVX2-NEXT:    addq %rdx, %rcx
2347; AVX2-NEXT:    movl %edx, %eax
2348; AVX2-NEXT:    andl $63, %eax
2349; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
2350; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
2351; AVX2-NEXT:    cmpq $64, %rcx
2352; AVX2-NEXT:    cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
2353; AVX2-NEXT:    cmpq $63, %rcx
2354; AVX2-NEXT:    movq %rcx, %rdx
2355; AVX2-NEXT:    movl $63, %ecx
2356; AVX2-NEXT:    cmovbq %rdx, %rcx
2357; AVX2-NEXT:    movb %al, (%rsp,%rcx)
2358; AVX2-NEXT:    vmovaps (%rsp), %ymm0
2359; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
2360; AVX2-NEXT:    leaq -40(%rbp), %rsp
2361; AVX2-NEXT:    popq %rbx
2362; AVX2-NEXT:    popq %r12
2363; AVX2-NEXT:    popq %r13
2364; AVX2-NEXT:    popq %r14
2365; AVX2-NEXT:    popq %r15
2366; AVX2-NEXT:    popq %rbp
2367; AVX2-NEXT:    retq
2368;
2369; AVX512F-LABEL: test_compress_v64i8:
2370; AVX512F:       # %bb.0:
2371; AVX512F-NEXT:    pushq %rbp
2372; AVX512F-NEXT:    movq %rsp, %rbp
2373; AVX512F-NEXT:    andq $-64, %rsp
2374; AVX512F-NEXT:    subq $256, %rsp # imm = 0x100
2375; AVX512F-NEXT:    movzbl 352(%rbp), %eax
2376; AVX512F-NEXT:    andl $1, %eax
2377; AVX512F-NEXT:    kmovw %eax, %k0
2378; AVX512F-NEXT:    movzbl 360(%rbp), %eax
2379; AVX512F-NEXT:    kmovw %eax, %k1
2380; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2381; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
2382; AVX512F-NEXT:    korw %k1, %k0, %k0
2383; AVX512F-NEXT:    movw $-5, %ax
2384; AVX512F-NEXT:    kmovw %eax, %k1
2385; AVX512F-NEXT:    kandw %k1, %k0, %k0
2386; AVX512F-NEXT:    kmovw %k1, %k3
2387; AVX512F-NEXT:    movzbl 368(%rbp), %eax
2388; AVX512F-NEXT:    kmovw %eax, %k1
2389; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2390; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
2391; AVX512F-NEXT:    korw %k1, %k0, %k0
2392; AVX512F-NEXT:    movw $-9, %ax
2393; AVX512F-NEXT:    kmovw %eax, %k7
2394; AVX512F-NEXT:    kandw %k7, %k0, %k0
2395; AVX512F-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2396; AVX512F-NEXT:    movzbl 376(%rbp), %eax
2397; AVX512F-NEXT:    kmovw %eax, %k1
2398; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2399; AVX512F-NEXT:    kshiftrw $12, %k1, %k1
2400; AVX512F-NEXT:    korw %k1, %k0, %k0
2401; AVX512F-NEXT:    movw $-17, %ax
2402; AVX512F-NEXT:    kmovw %eax, %k5
2403; AVX512F-NEXT:    kandw %k5, %k0, %k0
2404; AVX512F-NEXT:    movzbl 384(%rbp), %eax
2405; AVX512F-NEXT:    kmovw %eax, %k1
2406; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2407; AVX512F-NEXT:    kshiftrw $11, %k1, %k1
2408; AVX512F-NEXT:    korw %k1, %k0, %k0
2409; AVX512F-NEXT:    movw $-33, %ax
2410; AVX512F-NEXT:    kmovw %eax, %k6
2411; AVX512F-NEXT:    kandw %k6, %k0, %k0
2412; AVX512F-NEXT:    movzbl 392(%rbp), %eax
2413; AVX512F-NEXT:    kmovw %eax, %k1
2414; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2415; AVX512F-NEXT:    kshiftrw $10, %k1, %k1
2416; AVX512F-NEXT:    korw %k1, %k0, %k0
2417; AVX512F-NEXT:    movw $-65, %ax
2418; AVX512F-NEXT:    kmovw %eax, %k1
2419; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2420; AVX512F-NEXT:    kandw %k1, %k0, %k0
2421; AVX512F-NEXT:    movzbl 400(%rbp), %eax
2422; AVX512F-NEXT:    kmovw %eax, %k1
2423; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2424; AVX512F-NEXT:    kshiftrw $9, %k1, %k1
2425; AVX512F-NEXT:    korw %k1, %k0, %k0
2426; AVX512F-NEXT:    movw $-129, %ax
2427; AVX512F-NEXT:    kmovw %eax, %k1
2428; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2429; AVX512F-NEXT:    kandw %k1, %k0, %k0
2430; AVX512F-NEXT:    movzbl 408(%rbp), %eax
2431; AVX512F-NEXT:    kmovw %eax, %k1
2432; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2433; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
2434; AVX512F-NEXT:    korw %k1, %k0, %k0
2435; AVX512F-NEXT:    movw $-257, %ax # imm = 0xFEFF
2436; AVX512F-NEXT:    kmovw %eax, %k1
2437; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2438; AVX512F-NEXT:    kandw %k1, %k0, %k0
2439; AVX512F-NEXT:    movzbl 416(%rbp), %eax
2440; AVX512F-NEXT:    kmovw %eax, %k1
2441; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2442; AVX512F-NEXT:    kshiftrw $7, %k1, %k1
2443; AVX512F-NEXT:    korw %k1, %k0, %k0
2444; AVX512F-NEXT:    movw $-513, %ax # imm = 0xFDFF
2445; AVX512F-NEXT:    kmovw %eax, %k1
2446; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2447; AVX512F-NEXT:    kandw %k1, %k0, %k0
2448; AVX512F-NEXT:    movzbl 424(%rbp), %eax
2449; AVX512F-NEXT:    kmovw %eax, %k1
2450; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2451; AVX512F-NEXT:    kshiftrw $6, %k1, %k1
2452; AVX512F-NEXT:    korw %k1, %k0, %k0
2453; AVX512F-NEXT:    movw $-1025, %ax # imm = 0xFBFF
2454; AVX512F-NEXT:    kmovw %eax, %k1
2455; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2456; AVX512F-NEXT:    kandw %k1, %k0, %k0
2457; AVX512F-NEXT:    movzbl 432(%rbp), %eax
2458; AVX512F-NEXT:    kmovw %eax, %k1
2459; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2460; AVX512F-NEXT:    kshiftrw $5, %k1, %k1
2461; AVX512F-NEXT:    korw %k1, %k0, %k0
2462; AVX512F-NEXT:    movw $-2049, %ax # imm = 0xF7FF
2463; AVX512F-NEXT:    kmovw %eax, %k1
2464; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2465; AVX512F-NEXT:    kandw %k1, %k0, %k0
2466; AVX512F-NEXT:    movzbl 440(%rbp), %eax
2467; AVX512F-NEXT:    kmovw %eax, %k1
2468; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2469; AVX512F-NEXT:    kshiftrw $4, %k1, %k1
2470; AVX512F-NEXT:    korw %k1, %k0, %k0
2471; AVX512F-NEXT:    movw $-4097, %ax # imm = 0xEFFF
2472; AVX512F-NEXT:    kmovw %eax, %k1
2473; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2474; AVX512F-NEXT:    kandw %k1, %k0, %k0
2475; AVX512F-NEXT:    movzbl 448(%rbp), %eax
2476; AVX512F-NEXT:    kmovw %eax, %k1
2477; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2478; AVX512F-NEXT:    kshiftrw $3, %k1, %k1
2479; AVX512F-NEXT:    korw %k1, %k0, %k0
2480; AVX512F-NEXT:    movw $-8193, %ax # imm = 0xDFFF
2481; AVX512F-NEXT:    kmovw %eax, %k1
2482; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2483; AVX512F-NEXT:    kandw %k1, %k0, %k0
2484; AVX512F-NEXT:    movzbl 456(%rbp), %eax
2485; AVX512F-NEXT:    kmovw %eax, %k1
2486; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2487; AVX512F-NEXT:    kshiftrw $2, %k1, %k1
2488; AVX512F-NEXT:    korw %k1, %k0, %k1
2489; AVX512F-NEXT:    movw $-16385, %ax # imm = 0xBFFF
2490; AVX512F-NEXT:    kmovw %eax, %k4
2491; AVX512F-NEXT:    kandw %k4, %k1, %k1
2492; AVX512F-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2493; AVX512F-NEXT:    movzbl 464(%rbp), %eax
2494; AVX512F-NEXT:    kmovw %eax, %k2
2495; AVX512F-NEXT:    kshiftlw $14, %k2, %k2
2496; AVX512F-NEXT:    korw %k2, %k1, %k1
2497; AVX512F-NEXT:    kshiftlw $1, %k1, %k1
2498; AVX512F-NEXT:    kshiftrw $1, %k1, %k1
2499; AVX512F-NEXT:    movzbl 472(%rbp), %eax
2500; AVX512F-NEXT:    kmovw %eax, %k2
2501; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2502; AVX512F-NEXT:    korw %k2, %k1, %k1
2503; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2504; AVX512F-NEXT:    movzbl 224(%rbp), %eax
2505; AVX512F-NEXT:    andl $1, %eax
2506; AVX512F-NEXT:    movzbl 232(%rbp), %r10d
2507; AVX512F-NEXT:    kmovw %r10d, %k1
2508; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2509; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
2510; AVX512F-NEXT:    kmovw %eax, %k2
2511; AVX512F-NEXT:    korw %k1, %k2, %k1
2512; AVX512F-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2513; AVX512F-NEXT:    kandw %k3, %k1, %k1
2514; AVX512F-NEXT:    movzbl 240(%rbp), %eax
2515; AVX512F-NEXT:    kmovw %eax, %k2
2516; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2517; AVX512F-NEXT:    kshiftrw $13, %k2, %k2
2518; AVX512F-NEXT:    korw %k2, %k1, %k1
2519; AVX512F-NEXT:    kandw %k7, %k1, %k1
2520; AVX512F-NEXT:    movzbl 248(%rbp), %eax
2521; AVX512F-NEXT:    kmovw %eax, %k2
2522; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2523; AVX512F-NEXT:    kshiftrw $12, %k2, %k2
2524; AVX512F-NEXT:    korw %k2, %k1, %k1
2525; AVX512F-NEXT:    kandw %k5, %k1, %k1
2526; AVX512F-NEXT:    movzbl 256(%rbp), %eax
2527; AVX512F-NEXT:    kmovw %eax, %k2
2528; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2529; AVX512F-NEXT:    kshiftrw $11, %k2, %k2
2530; AVX512F-NEXT:    korw %k2, %k1, %k1
2531; AVX512F-NEXT:    kandw %k6, %k1, %k1
2532; AVX512F-NEXT:    movzbl 264(%rbp), %eax
2533; AVX512F-NEXT:    kmovw %eax, %k2
2534; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2535; AVX512F-NEXT:    kshiftrw $10, %k2, %k2
2536; AVX512F-NEXT:    korw %k2, %k1, %k1
2537; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
2538; AVX512F-NEXT:    kandw %k7, %k1, %k1
2539; AVX512F-NEXT:    movzbl 272(%rbp), %eax
2540; AVX512F-NEXT:    kmovw %eax, %k2
2541; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2542; AVX512F-NEXT:    kshiftrw $9, %k2, %k2
2543; AVX512F-NEXT:    korw %k2, %k1, %k0
2544; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2545; AVX512F-NEXT:    movzbl 280(%rbp), %eax
2546; AVX512F-NEXT:    kmovw %eax, %k1
2547; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2548; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2549; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
2550; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2551; AVX512F-NEXT:    kandw %k2, %k0, %k2
2552; AVX512F-NEXT:    korw %k1, %k2, %k1
2553; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
2554; AVX512F-NEXT:    kandw %k0, %k1, %k1
2555; AVX512F-NEXT:    movzbl 288(%rbp), %eax
2556; AVX512F-NEXT:    kmovw %eax, %k0
2557; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2558; AVX512F-NEXT:    kshiftlw $15, %k0, %k2
2559; AVX512F-NEXT:    kshiftrw $7, %k2, %k2
2560; AVX512F-NEXT:    korw %k2, %k1, %k1
2561; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
2562; AVX512F-NEXT:    kandw %k0, %k1, %k1
2563; AVX512F-NEXT:    movzbl 296(%rbp), %eax
2564; AVX512F-NEXT:    kmovw %eax, %k2
2565; AVX512F-NEXT:    kshiftlw $15, %k2, %k0
2566; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2567; AVX512F-NEXT:    kshiftrw $6, %k0, %k2
2568; AVX512F-NEXT:    korw %k2, %k1, %k1
2569; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
2570; AVX512F-NEXT:    kandw %k0, %k1, %k1
2571; AVX512F-NEXT:    movzbl 304(%rbp), %eax
2572; AVX512F-NEXT:    kmovw %eax, %k2
2573; AVX512F-NEXT:    kshiftlw $15, %k2, %k0
2574; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2575; AVX512F-NEXT:    kshiftrw $5, %k0, %k2
2576; AVX512F-NEXT:    korw %k2, %k1, %k1
2577; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
2578; AVX512F-NEXT:    kandw %k0, %k1, %k1
2579; AVX512F-NEXT:    movzbl 312(%rbp), %eax
2580; AVX512F-NEXT:    kmovw %eax, %k2
2581; AVX512F-NEXT:    kshiftlw $15, %k2, %k0
2582; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2583; AVX512F-NEXT:    kshiftrw $4, %k0, %k2
2584; AVX512F-NEXT:    korw %k2, %k1, %k1
2585; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
2586; AVX512F-NEXT:    kandw %k0, %k1, %k1
2587; AVX512F-NEXT:    movzbl 320(%rbp), %eax
2588; AVX512F-NEXT:    kmovw %eax, %k2
2589; AVX512F-NEXT:    kshiftlw $15, %k2, %k0
2590; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2591; AVX512F-NEXT:    kshiftrw $3, %k0, %k2
2592; AVX512F-NEXT:    korw %k2, %k1, %k1
2593; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
2594; AVX512F-NEXT:    kandw %k0, %k1, %k1
2595; AVX512F-NEXT:    movzbl 328(%rbp), %eax
2596; AVX512F-NEXT:    kmovw %eax, %k2
2597; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2598; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2599; AVX512F-NEXT:    kshiftrw $2, %k2, %k2
2600; AVX512F-NEXT:    korw %k2, %k1, %k1
2601; AVX512F-NEXT:    kandw %k4, %k1, %k1
2602; AVX512F-NEXT:    movzbl 336(%rbp), %eax
2603; AVX512F-NEXT:    kmovw %eax, %k2
2604; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2605; AVX512F-NEXT:    kshiftlw $14, %k2, %k2
2606; AVX512F-NEXT:    korw %k2, %k1, %k1
2607; AVX512F-NEXT:    kshiftlw $1, %k1, %k1
2608; AVX512F-NEXT:    kshiftrw $1, %k1, %k1
2609; AVX512F-NEXT:    movzbl 344(%rbp), %eax
2610; AVX512F-NEXT:    kmovw %eax, %k2
2611; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2612; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2613; AVX512F-NEXT:    korw %k2, %k1, %k1
2614; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2615; AVX512F-NEXT:    movzbl 96(%rbp), %eax
2616; AVX512F-NEXT:    andl $1, %eax
2617; AVX512F-NEXT:    movzbl 104(%rbp), %r10d
2618; AVX512F-NEXT:    kmovw %r10d, %k1
2619; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2620; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
2621; AVX512F-NEXT:    kmovw %eax, %k2
2622; AVX512F-NEXT:    korw %k1, %k2, %k1
2623; AVX512F-NEXT:    kandw %k3, %k1, %k1
2624; AVX512F-NEXT:    movzbl 112(%rbp), %eax
2625; AVX512F-NEXT:    kmovw %eax, %k2
2626; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2627; AVX512F-NEXT:    kshiftrw $13, %k2, %k2
2628; AVX512F-NEXT:    korw %k2, %k1, %k1
2629; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
2630; AVX512F-NEXT:    kandw %k4, %k1, %k1
2631; AVX512F-NEXT:    movzbl 120(%rbp), %eax
2632; AVX512F-NEXT:    kmovw %eax, %k2
2633; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2634; AVX512F-NEXT:    kshiftrw $12, %k2, %k2
2635; AVX512F-NEXT:    korw %k2, %k1, %k1
2636; AVX512F-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2637; AVX512F-NEXT:    kandw %k5, %k1, %k1
2638; AVX512F-NEXT:    movzbl 128(%rbp), %eax
2639; AVX512F-NEXT:    kmovw %eax, %k2
2640; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2641; AVX512F-NEXT:    kshiftrw $11, %k2, %k2
2642; AVX512F-NEXT:    korw %k2, %k1, %k1
2643; AVX512F-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2644; AVX512F-NEXT:    kandw %k6, %k1, %k1
2645; AVX512F-NEXT:    movzbl 136(%rbp), %eax
2646; AVX512F-NEXT:    kmovw %eax, %k2
2647; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2648; AVX512F-NEXT:    kshiftrw $10, %k2, %k2
2649; AVX512F-NEXT:    korw %k2, %k1, %k1
2650; AVX512F-NEXT:    kandw %k7, %k1, %k1
2651; AVX512F-NEXT:    movzbl 144(%rbp), %eax
2652; AVX512F-NEXT:    kmovw %eax, %k2
2653; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2654; AVX512F-NEXT:    kshiftrw $9, %k2, %k2
2655; AVX512F-NEXT:    korw %k2, %k1, %k1
2656; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2657; AVX512F-NEXT:    kandw %k2, %k1, %k1
2658; AVX512F-NEXT:    movzbl 152(%rbp), %eax
2659; AVX512F-NEXT:    kmovw %eax, %k2
2660; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2661; AVX512F-NEXT:    kshiftrw $8, %k2, %k2
2662; AVX512F-NEXT:    korw %k2, %k1, %k1
2663; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
2664; AVX512F-NEXT:    kandw %k3, %k1, %k1
2665; AVX512F-NEXT:    movzbl 160(%rbp), %eax
2666; AVX512F-NEXT:    kmovw %eax, %k2
2667; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2668; AVX512F-NEXT:    kshiftrw $7, %k2, %k2
2669; AVX512F-NEXT:    korw %k2, %k1, %k1
2670; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
2671; AVX512F-NEXT:    kandw %k7, %k1, %k1
2672; AVX512F-NEXT:    movzbl 168(%rbp), %eax
2673; AVX512F-NEXT:    kmovw %eax, %k2
2674; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2675; AVX512F-NEXT:    kshiftrw $6, %k2, %k2
2676; AVX512F-NEXT:    korw %k2, %k1, %k1
2677; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2678; AVX512F-NEXT:    kandw %k2, %k1, %k1
2679; AVX512F-NEXT:    movzbl 176(%rbp), %eax
2680; AVX512F-NEXT:    kmovw %eax, %k2
2681; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2682; AVX512F-NEXT:    kshiftrw $5, %k2, %k2
2683; AVX512F-NEXT:    korw %k2, %k1, %k1
2684; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2685; AVX512F-NEXT:    kandw %k2, %k1, %k1
2686; AVX512F-NEXT:    movzbl 184(%rbp), %eax
2687; AVX512F-NEXT:    kmovw %eax, %k2
2688; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2689; AVX512F-NEXT:    kshiftrw $4, %k2, %k2
2690; AVX512F-NEXT:    korw %k2, %k1, %k1
2691; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2692; AVX512F-NEXT:    kandw %k2, %k1, %k1
2693; AVX512F-NEXT:    movzbl 192(%rbp), %eax
2694; AVX512F-NEXT:    kmovw %eax, %k2
2695; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2696; AVX512F-NEXT:    kshiftrw $3, %k2, %k2
2697; AVX512F-NEXT:    korw %k2, %k1, %k1
2698; AVX512F-NEXT:    kandw %k0, %k1, %k1
2699; AVX512F-NEXT:    movzbl 200(%rbp), %eax
2700; AVX512F-NEXT:    kmovw %eax, %k2
2701; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2702; AVX512F-NEXT:    kshiftrw $2, %k2, %k2
2703; AVX512F-NEXT:    korw %k2, %k1, %k1
2704; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2705; AVX512F-NEXT:    kandw %k2, %k1, %k1
2706; AVX512F-NEXT:    movzbl 208(%rbp), %eax
2707; AVX512F-NEXT:    kmovw %eax, %k2
2708; AVX512F-NEXT:    kshiftlw $14, %k2, %k2
2709; AVX512F-NEXT:    korw %k2, %k1, %k1
2710; AVX512F-NEXT:    kshiftlw $1, %k1, %k1
2711; AVX512F-NEXT:    kshiftrw $1, %k1, %k1
2712; AVX512F-NEXT:    movzbl 216(%rbp), %eax
2713; AVX512F-NEXT:    kmovw %eax, %k2
2714; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2715; AVX512F-NEXT:    korw %k2, %k1, %k1
2716; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2717; AVX512F-NEXT:    andl $1, %edi
2718; AVX512F-NEXT:    kmovw %esi, %k1
2719; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2720; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
2721; AVX512F-NEXT:    kmovw %edi, %k2
2722; AVX512F-NEXT:    korw %k1, %k2, %k1
2723; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2724; AVX512F-NEXT:    kandw %k2, %k1, %k1
2725; AVX512F-NEXT:    kmovw %edx, %k2
2726; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2727; AVX512F-NEXT:    kshiftrw $13, %k2, %k2
2728; AVX512F-NEXT:    korw %k2, %k1, %k1
2729; AVX512F-NEXT:    kandw %k4, %k1, %k1
2730; AVX512F-NEXT:    kmovw %ecx, %k2
2731; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2732; AVX512F-NEXT:    kshiftrw $12, %k2, %k2
2733; AVX512F-NEXT:    korw %k2, %k1, %k1
2734; AVX512F-NEXT:    kandw %k5, %k1, %k1
2735; AVX512F-NEXT:    kmovw %r8d, %k2
2736; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2737; AVX512F-NEXT:    kshiftrw $11, %k2, %k2
2738; AVX512F-NEXT:    korw %k2, %k1, %k1
2739; AVX512F-NEXT:    kandw %k6, %k1, %k1
2740; AVX512F-NEXT:    kmovw %r9d, %k2
2741; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2742; AVX512F-NEXT:    kshiftrw $10, %k2, %k2
2743; AVX512F-NEXT:    korw %k2, %k1, %k1
2744; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2745; AVX512F-NEXT:    kandw %k2, %k1, %k1
2746; AVX512F-NEXT:    movzbl 16(%rbp), %eax
2747; AVX512F-NEXT:    kmovw %eax, %k2
2748; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2749; AVX512F-NEXT:    kshiftrw $9, %k2, %k2
2750; AVX512F-NEXT:    korw %k2, %k1, %k2
2751; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2752; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2753; AVX512F-NEXT:    kandw %k1, %k2, %k1
2754; AVX512F-NEXT:    movzbl 24(%rbp), %eax
2755; AVX512F-NEXT:    kmovw %eax, %k2
2756; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2757; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2758; AVX512F-NEXT:    kshiftrw $8, %k2, %k2
2759; AVX512F-NEXT:    korw %k2, %k1, %k1
2760; AVX512F-NEXT:    kandw %k3, %k1, %k1
2761; AVX512F-NEXT:    movzbl 32(%rbp), %eax
2762; AVX512F-NEXT:    kmovw %eax, %k2
2763; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2764; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2765; AVX512F-NEXT:    kshiftrw $7, %k2, %k2
2766; AVX512F-NEXT:    korw %k2, %k1, %k1
2767; AVX512F-NEXT:    kandw %k7, %k1, %k1
2768; AVX512F-NEXT:    movzbl 40(%rbp), %eax
2769; AVX512F-NEXT:    kmovw %eax, %k2
2770; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2771; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2772; AVX512F-NEXT:    kshiftrw $6, %k2, %k2
2773; AVX512F-NEXT:    korw %k2, %k1, %k1
2774; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2775; AVX512F-NEXT:    kandw %k2, %k1, %k1
2776; AVX512F-NEXT:    movzbl 48(%rbp), %eax
2777; AVX512F-NEXT:    kmovw %eax, %k2
2778; AVX512F-NEXT:    kshiftlw $15, %k2, %k5
2779; AVX512F-NEXT:    kshiftrw $5, %k5, %k2
2780; AVX512F-NEXT:    korw %k2, %k1, %k1
2781; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2782; AVX512F-NEXT:    kandw %k2, %k1, %k1
2783; AVX512F-NEXT:    movzbl 56(%rbp), %eax
2784; AVX512F-NEXT:    kmovw %eax, %k2
2785; AVX512F-NEXT:    kshiftlw $15, %k2, %k4
2786; AVX512F-NEXT:    kshiftrw $4, %k4, %k2
2787; AVX512F-NEXT:    korw %k2, %k1, %k1
2788; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2789; AVX512F-NEXT:    kandw %k2, %k1, %k1
2790; AVX512F-NEXT:    movzbl 64(%rbp), %eax
2791; AVX512F-NEXT:    kmovw %eax, %k2
2792; AVX512F-NEXT:    kshiftlw $15, %k2, %k3
2793; AVX512F-NEXT:    kshiftrw $3, %k3, %k2
2794; AVX512F-NEXT:    korw %k2, %k1, %k1
2795; AVX512F-NEXT:    kandw %k0, %k1, %k1
2796; AVX512F-NEXT:    movzbl 72(%rbp), %eax
2797; AVX512F-NEXT:    kmovw %eax, %k2
2798; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
2799; AVX512F-NEXT:    kshiftrw $2, %k2, %k0
2800; AVX512F-NEXT:    korw %k0, %k1, %k0
2801; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2802; AVX512F-NEXT:    kandw %k1, %k0, %k0
2803; AVX512F-NEXT:    movzbl 80(%rbp), %eax
2804; AVX512F-NEXT:    kmovw %eax, %k1
2805; AVX512F-NEXT:    kshiftlw $14, %k1, %k7
2806; AVX512F-NEXT:    korw %k7, %k0, %k0
2807; AVX512F-NEXT:    kshiftlw $1, %k0, %k0
2808; AVX512F-NEXT:    kshiftrw $1, %k0, %k7
2809; AVX512F-NEXT:    movzbl 88(%rbp), %eax
2810; AVX512F-NEXT:    kmovw %eax, %k0
2811; AVX512F-NEXT:    kshiftlw $15, %k0, %k6
2812; AVX512F-NEXT:    korw %k6, %k7, %k6
2813; AVX512F-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2814; AVX512F-NEXT:    movw $-3, %ax
2815; AVX512F-NEXT:    kmovw %eax, %k6
2816; AVX512F-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2817; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
2818; AVX512F-NEXT:    kandw %k6, %k7, %k6
2819; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
2820; AVX512F-NEXT:    kshiftrw $14, %k7, %k7
2821; AVX512F-NEXT:    korw %k7, %k6, %k6
2822; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
2823; AVX512F-NEXT:    kandw %k7, %k6, %k6
2824; AVX512F-NEXT:    kshiftrw $13, %k5, %k5
2825; AVX512F-NEXT:    korw %k5, %k6, %k5
2826; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
2827; AVX512F-NEXT:    kandw %k6, %k5, %k5
2828; AVX512F-NEXT:    kshiftrw $12, %k4, %k4
2829; AVX512F-NEXT:    korw %k4, %k5, %k4
2830; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
2831; AVX512F-NEXT:    kandw %k5, %k4, %k4
2832; AVX512F-NEXT:    kshiftrw $11, %k3, %k3
2833; AVX512F-NEXT:    korw %k3, %k4, %k3
2834; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
2835; AVX512F-NEXT:    kandw %k4, %k3, %k3
2836; AVX512F-NEXT:    kshiftrw $10, %k2, %k2
2837; AVX512F-NEXT:    korw %k2, %k3, %k2
2838; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
2839; AVX512F-NEXT:    kandw %k3, %k2, %k2
2840; AVX512F-NEXT:    kshiftlw $6, %k1, %k1
2841; AVX512F-NEXT:    korw %k1, %k2, %k1
2842; AVX512F-NEXT:    kshiftlw $9, %k1, %k1
2843; AVX512F-NEXT:    kshiftrw $9, %k1, %k1
2844; AVX512F-NEXT:    kshiftlw $7, %k0, %k0
2845; AVX512F-NEXT:    korw %k0, %k1, %k0
2846; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2847; AVX512F-NEXT:    kshiftlw $9, %k1, %k1
2848; AVX512F-NEXT:    kshiftrw $9, %k1, %k1
2849; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2850; AVX512F-NEXT:    kshiftlw $7, %k2, %k2
2851; AVX512F-NEXT:    korw %k2, %k1, %k1
2852; AVX512F-NEXT:    kxorw %k0, %k1, %k0
2853; AVX512F-NEXT:    kshiftrw $4, %k0, %k1
2854; AVX512F-NEXT:    kxorw %k1, %k0, %k0
2855; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
2856; AVX512F-NEXT:    kxorw %k1, %k0, %k0
2857; AVX512F-NEXT:    kshiftrw $1, %k0, %k1
2858; AVX512F-NEXT:    kxorw %k1, %k0, %k0
2859; AVX512F-NEXT:    kmovw %k0, %eax
2860; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
2861; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2862; AVX512F-NEXT:    kandw %k1, %k0, %k0
2863; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2864; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
2865; AVX512F-NEXT:    korw %k1, %k0, %k0
2866; AVX512F-NEXT:    kandw %k7, %k0, %k0
2867; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2868; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
2869; AVX512F-NEXT:    korw %k1, %k0, %k0
2870; AVX512F-NEXT:    kandw %k6, %k0, %k0
2871; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2872; AVX512F-NEXT:    kshiftrw $12, %k1, %k1
2873; AVX512F-NEXT:    korw %k1, %k0, %k0
2874; AVX512F-NEXT:    kandw %k5, %k0, %k0
2875; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2876; AVX512F-NEXT:    kshiftrw $11, %k1, %k1
2877; AVX512F-NEXT:    korw %k1, %k0, %k0
2878; AVX512F-NEXT:    kandw %k4, %k0, %k0
2879; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2880; AVX512F-NEXT:    kshiftrw $10, %k1, %k1
2881; AVX512F-NEXT:    korw %k1, %k0, %k0
2882; AVX512F-NEXT:    kandw %k3, %k0, %k0
2883; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2884; AVX512F-NEXT:    kshiftlw $6, %k1, %k1
2885; AVX512F-NEXT:    korw %k1, %k0, %k0
2886; AVX512F-NEXT:    kshiftlw $9, %k0, %k0
2887; AVX512F-NEXT:    kshiftrw $9, %k0, %k0
2888; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2889; AVX512F-NEXT:    kshiftlw $7, %k1, %k1
2890; AVX512F-NEXT:    korw %k1, %k0, %k0
2891; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2892; AVX512F-NEXT:    kshiftlw $9, %k1, %k1
2893; AVX512F-NEXT:    kshiftrw $9, %k1, %k1
2894; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2895; AVX512F-NEXT:    kshiftlw $7, %k2, %k2
2896; AVX512F-NEXT:    korw %k2, %k1, %k1
2897; AVX512F-NEXT:    kxorw %k0, %k1, %k0
2898; AVX512F-NEXT:    kshiftrw $4, %k0, %k1
2899; AVX512F-NEXT:    kxorw %k1, %k0, %k0
2900; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
2901; AVX512F-NEXT:    kxorw %k1, %k0, %k0
2902; AVX512F-NEXT:    kshiftrw $1, %k0, %k1
2903; AVX512F-NEXT:    kxorw %k1, %k0, %k0
2904; AVX512F-NEXT:    kmovw %k0, %ecx
2905; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2906; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
2907; AVX512F-NEXT:    kxorw %k2, %k3, %k0
2908; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
2909; AVX512F-NEXT:    kxorw %k1, %k0, %k0
2910; AVX512F-NEXT:    kshiftrw $4, %k0, %k1
2911; AVX512F-NEXT:    kxorw %k1, %k0, %k0
2912; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
2913; AVX512F-NEXT:    kxorw %k1, %k0, %k0
2914; AVX512F-NEXT:    kshiftrw $1, %k0, %k1
2915; AVX512F-NEXT:    kxorw %k1, %k0, %k0
2916; AVX512F-NEXT:    kmovw %k0, %edx
2917; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
2918; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2919; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2920; AVX512F-NEXT:    vpcompressd %zmm3, %zmm3 {%k1} {z}
2921; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1
2922; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm2
2923; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2924; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2925; AVX512F-NEXT:    vpcompressd %zmm2, %zmm2 {%k1} {z}
2926; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1
2927; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2928; AVX512F-NEXT:    vpcompressd %zmm6, %zmm6 {%k3} {z}
2929; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm7 {%k3} {z} = -1
2930; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
2931; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2932; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k2} {z}
2933; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm8 {%k2} {z} = -1
2934; AVX512F-NEXT:    vpmovdb %zmm6, {{[0-9]+}}(%rsp)
2935; AVX512F-NEXT:    andl $31, %eax
2936; AVX512F-NEXT:    vpmovdb %zmm0, 64(%rsp,%rax)
2937; AVX512F-NEXT:    vpmovdb %zmm3, {{[0-9]+}}(%rsp)
2938; AVX512F-NEXT:    andl $31, %ecx
2939; AVX512F-NEXT:    vpmovdb %zmm2, 96(%rsp,%rcx)
2940; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
2941; AVX512F-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
2942; AVX512F-NEXT:    andl $63, %edx
2943; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
2944; AVX512F-NEXT:    vmovaps %ymm0, 128(%rsp,%rdx)
2945; AVX512F-NEXT:    vpmovdb %zmm4, %xmm0
2946; AVX512F-NEXT:    vpmovdb %zmm5, %xmm2
2947; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2948; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
2949; AVX512F-NEXT:    vpblendvb %ymm0, {{[0-9]+}}(%rsp), %ymm2, %ymm0
2950; AVX512F-NEXT:    vpmovdb %zmm7, %xmm2
2951; AVX512F-NEXT:    vpmovdb %zmm8, %xmm3
2952; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
2953; AVX512F-NEXT:    vpblendvb %ymm2, {{[0-9]+}}(%rsp), %ymm1, %ymm1
2954; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2955; AVX512F-NEXT:    movq %rbp, %rsp
2956; AVX512F-NEXT:    popq %rbp
2957; AVX512F-NEXT:    retq
2958;
2959; AVX512VL-LABEL: test_compress_v64i8:
2960; AVX512VL:       # %bb.0:
2961; AVX512VL-NEXT:    vpsllw $7, %zmm1, %zmm1
2962; AVX512VL-NEXT:    vpmovb2m %zmm1, %k1
2963; AVX512VL-NEXT:    vpcompressb %zmm0, %zmm2 {%k1}
2964; AVX512VL-NEXT:    vmovdqa64 %zmm2, %zmm0
2965; AVX512VL-NEXT:    retq
2966    %out = call <64 x i8> @llvm.experimental.vector.compress(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> %passthru)
2967    ret <64 x i8> %out
2968}
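
; Illustration (hand-written, not part of the autogenerated checks): the
; compress intrinsic packs the lanes of %vec whose mask bit is set into the
; low lanes of the result and takes the remaining lanes from %passthru. A
; minimal sketch, with a hypothetical function name:
;
;   define <4 x i32> @compress_example() {
;     %r = call <4 x i32> @llvm.experimental.vector.compress(
;              <4 x i32> <i32 10, i32 20, i32 30, i32 40>,
;              <4 x i1> <i1 true, i1 false, i1 true, i1 false>,
;              <4 x i32> <i32 7, i32 7, i32 7, i32 7>)
;     ret <4 x i32> %r
;   }
;
; With these constants the result would be <i32 10, i32 30, i32 7, i32 7>.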
2969
2970define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x i16> %passthru) nounwind {
2971; AVX2-LABEL: test_compress_v32i16:
2972; AVX2:       # %bb.0:
2973; AVX2-NEXT:    pushq %rbp
2974; AVX2-NEXT:    movq %rsp, %rbp
2975; AVX2-NEXT:    pushq %r15
2976; AVX2-NEXT:    pushq %r14
2977; AVX2-NEXT:    pushq %r13
2978; AVX2-NEXT:    pushq %r12
2979; AVX2-NEXT:    pushq %rbx
2980; AVX2-NEXT:    andq $-32, %rsp
2981; AVX2-NEXT:    subq $256, %rsp # imm = 0x100
2982; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
2983; AVX2-NEXT:    vmovaps %ymm3, (%rsp)
2984; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
2985; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
2986; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2987; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
2988; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2989; AVX2-NEXT:    vpand %ymm5, %ymm6, %ymm5
2990; AVX2-NEXT:    vpaddw %ymm4, %ymm5, %ymm4
2991; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
2992; AVX2-NEXT:    vpaddw %xmm5, %xmm4, %xmm4
2993; AVX2-NEXT:    vpextrw $1, %xmm4, %eax
2994; AVX2-NEXT:    vmovd %xmm4, %ecx
2995; AVX2-NEXT:    addl %eax, %ecx
2996; AVX2-NEXT:    vpextrw $2, %xmm4, %eax
2997; AVX2-NEXT:    vpextrw $3, %xmm4, %edx
2998; AVX2-NEXT:    addl %eax, %edx
2999; AVX2-NEXT:    addl %ecx, %edx
3000; AVX2-NEXT:    vpextrw $4, %xmm4, %eax
3001; AVX2-NEXT:    vpextrw $5, %xmm4, %ecx
3002; AVX2-NEXT:    addl %eax, %ecx
3003; AVX2-NEXT:    vpextrw $6, %xmm4, %eax
3004; AVX2-NEXT:    addl %ecx, %eax
3005; AVX2-NEXT:    addl %edx, %eax
3006; AVX2-NEXT:    vpextrw $7, %xmm4, %ecx
3007; AVX2-NEXT:    addl %eax, %ecx
3008; AVX2-NEXT:    andl $31, %ecx
3009; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3010; AVX2-NEXT:    vpextrb $1, %xmm2, %eax
3011; AVX2-NEXT:    andl $1, %eax
3012; AVX2-NEXT:    vmovd %xmm2, %ecx
3013; AVX2-NEXT:    andl $1, %ecx
3014; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3015; AVX2-NEXT:    addq %rcx, %rax
3016; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3017; AVX2-NEXT:    vpextrb $2, %xmm2, %ecx
3018; AVX2-NEXT:    andl $1, %ecx
3019; AVX2-NEXT:    addq %rax, %rcx
3020; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3021; AVX2-NEXT:    vpextrb $3, %xmm2, %eax
3022; AVX2-NEXT:    andl $1, %eax
3023; AVX2-NEXT:    addq %rcx, %rax
3024; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3025; AVX2-NEXT:    vpextrb $4, %xmm2, %ecx
3026; AVX2-NEXT:    andl $1, %ecx
3027; AVX2-NEXT:    addq %rax, %rcx
3028; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3029; AVX2-NEXT:    vpextrb $5, %xmm2, %eax
3030; AVX2-NEXT:    andl $1, %eax
3031; AVX2-NEXT:    addq %rcx, %rax
3032; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3033; AVX2-NEXT:    vpextrb $6, %xmm2, %ecx
3034; AVX2-NEXT:    andl $1, %ecx
3035; AVX2-NEXT:    addq %rax, %rcx
3036; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3037; AVX2-NEXT:    vpextrb $7, %xmm2, %eax
3038; AVX2-NEXT:    andl $1, %eax
3039; AVX2-NEXT:    addq %rcx, %rax
3040; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3041; AVX2-NEXT:    vpextrb $8, %xmm2, %ecx
3042; AVX2-NEXT:    andl $1, %ecx
3043; AVX2-NEXT:    addq %rax, %rcx
3044; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3045; AVX2-NEXT:    vpextrb $9, %xmm2, %eax
3046; AVX2-NEXT:    andl $1, %eax
3047; AVX2-NEXT:    addq %rcx, %rax
3048; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3049; AVX2-NEXT:    vpextrb $10, %xmm2, %ecx
3050; AVX2-NEXT:    andl $1, %ecx
3051; AVX2-NEXT:    addq %rax, %rcx
3052; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3053; AVX2-NEXT:    vpextrb $11, %xmm2, %eax
3054; AVX2-NEXT:    andl $1, %eax
3055; AVX2-NEXT:    addq %rcx, %rax
3056; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3057; AVX2-NEXT:    vpextrb $12, %xmm2, %ecx
3058; AVX2-NEXT:    andl $1, %ecx
3059; AVX2-NEXT:    addq %rax, %rcx
3060; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3061; AVX2-NEXT:    vpextrb $13, %xmm2, %eax
3062; AVX2-NEXT:    andl $1, %eax
3063; AVX2-NEXT:    addq %rcx, %rax
3064; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3065; AVX2-NEXT:    vpextrb $14, %xmm2, %ecx
3066; AVX2-NEXT:    andl $1, %ecx
3067; AVX2-NEXT:    addq %rax, %rcx
3068; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3069; AVX2-NEXT:    vpextrb $15, %xmm2, %eax
3070; AVX2-NEXT:    andl $1, %eax
3071; AVX2-NEXT:    addq %rcx, %rax
3072; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3073; AVX2-NEXT:    vmovd %xmm3, %ecx
3074; AVX2-NEXT:    andl $1, %ecx
3075; AVX2-NEXT:    addq %rax, %rcx
3076; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3077; AVX2-NEXT:    vpextrb $1, %xmm3, %eax
3078; AVX2-NEXT:    andl $1, %eax
3079; AVX2-NEXT:    addq %rcx, %rax
3080; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3081; AVX2-NEXT:    vpextrb $2, %xmm3, %ecx
3082; AVX2-NEXT:    andl $1, %ecx
3083; AVX2-NEXT:    addq %rax, %rcx
3084; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3085; AVX2-NEXT:    vpextrb $3, %xmm3, %r12d
3086; AVX2-NEXT:    andl $1, %r12d
3087; AVX2-NEXT:    addq %rcx, %r12
3088; AVX2-NEXT:    vpextrb $4, %xmm3, %r15d
3089; AVX2-NEXT:    andl $1, %r15d
3090; AVX2-NEXT:    addq %r12, %r15
3091; AVX2-NEXT:    vpextrb $5, %xmm3, %r14d
3092; AVX2-NEXT:    andl $1, %r14d
3093; AVX2-NEXT:    addq %r15, %r14
3094; AVX2-NEXT:    vpextrb $6, %xmm3, %ebx
3095; AVX2-NEXT:    andl $1, %ebx
3096; AVX2-NEXT:    addq %r14, %rbx
3097; AVX2-NEXT:    vpextrb $7, %xmm3, %r11d
3098; AVX2-NEXT:    andl $1, %r11d
3099; AVX2-NEXT:    addq %rbx, %r11
3100; AVX2-NEXT:    vpextrb $8, %xmm3, %r10d
3101; AVX2-NEXT:    andl $1, %r10d
3102; AVX2-NEXT:    addq %r11, %r10
3103; AVX2-NEXT:    vpextrb $9, %xmm3, %r9d
3104; AVX2-NEXT:    andl $1, %r9d
3105; AVX2-NEXT:    addq %r10, %r9
3106; AVX2-NEXT:    vpextrb $10, %xmm3, %r8d
3107; AVX2-NEXT:    andl $1, %r8d
3108; AVX2-NEXT:    addq %r9, %r8
3109; AVX2-NEXT:    vpextrb $11, %xmm3, %edi
3110; AVX2-NEXT:    andl $1, %edi
3111; AVX2-NEXT:    addq %r8, %rdi
3112; AVX2-NEXT:    vpextrb $12, %xmm3, %esi
3113; AVX2-NEXT:    andl $1, %esi
3114; AVX2-NEXT:    addq %rdi, %rsi
3115; AVX2-NEXT:    vpextrb $13, %xmm3, %edx
3116; AVX2-NEXT:    andl $1, %edx
3117; AVX2-NEXT:    addq %rsi, %rdx
3118; AVX2-NEXT:    vpextrb $14, %xmm3, %ecx
3119; AVX2-NEXT:    andl $1, %ecx
3120; AVX2-NEXT:    addq %rdx, %rcx
3121; AVX2-NEXT:    vpextrb $15, %xmm3, %eax
3122; AVX2-NEXT:    andl $1, %eax
3123; AVX2-NEXT:    addq %rcx, %rax
3124; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3125; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
3126; AVX2-NEXT:    cmpq $32, %rax
3127; AVX2-NEXT:    vpextrw $7, %xmm2, %eax
3128; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3129; AVX2-NEXT:    cmovbw (%rsp,%r13,2), %ax
3130; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3131; AVX2-NEXT:    vpextrw $0, %xmm0, (%rsp)
3132; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3133; AVX2-NEXT:    vpextrw $1, %xmm0, (%rsp,%r13,2)
3134; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3135; AVX2-NEXT:    vpextrw $2, %xmm0, (%rsp,%r13,2)
3136; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3137; AVX2-NEXT:    vpextrw $3, %xmm0, (%rsp,%r13,2)
3138; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3139; AVX2-NEXT:    vpextrw $4, %xmm0, (%rsp,%r13,2)
3140; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3141; AVX2-NEXT:    vpextrw $5, %xmm0, (%rsp,%r13,2)
3142; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3143; AVX2-NEXT:    andl $31, %r13d
3144; AVX2-NEXT:    vpextrw $6, %xmm0, (%rsp,%r13,2)
3145; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3146; AVX2-NEXT:    andl $31, %r13d
3147; AVX2-NEXT:    vpextrw $7, %xmm0, (%rsp,%r13,2)
3148; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3149; AVX2-NEXT:    andl $31, %r13d
3150; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
3151; AVX2-NEXT:    vpextrw $0, %xmm0, (%rsp,%r13,2)
3152; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3153; AVX2-NEXT:    andl $31, %r13d
3154; AVX2-NEXT:    vpextrw $1, %xmm0, (%rsp,%r13,2)
3155; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3156; AVX2-NEXT:    andl $31, %r13d
3157; AVX2-NEXT:    vpextrw $2, %xmm0, (%rsp,%r13,2)
3158; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3159; AVX2-NEXT:    andl $31, %r13d
3160; AVX2-NEXT:    vpextrw $3, %xmm0, (%rsp,%r13,2)
3161; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3162; AVX2-NEXT:    andl $31, %r13d
3163; AVX2-NEXT:    vpextrw $4, %xmm0, (%rsp,%r13,2)
3164; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3165; AVX2-NEXT:    andl $31, %r13d
3166; AVX2-NEXT:    vpextrw $5, %xmm0, (%rsp,%r13,2)
3167; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3168; AVX2-NEXT:    andl $31, %r13d
3169; AVX2-NEXT:    vpextrw $6, %xmm0, (%rsp,%r13,2)
3170; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3171; AVX2-NEXT:    andl $31, %r13d
3172; AVX2-NEXT:    vpextrw $7, %xmm0, (%rsp,%r13,2)
3173; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3174; AVX2-NEXT:    andl $31, %r13d
3175; AVX2-NEXT:    vpextrw $0, %xmm1, (%rsp,%r13,2)
3176; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3177; AVX2-NEXT:    andl $31, %r13d
3178; AVX2-NEXT:    vpextrw $1, %xmm1, (%rsp,%r13,2)
3179; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
3180; AVX2-NEXT:    andl $31, %r13d
3181; AVX2-NEXT:    vpextrw $2, %xmm1, (%rsp,%r13,2)
3182; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
3183; AVX2-NEXT:    andl $31, %eax
3184; AVX2-NEXT:    vpextrw $3, %xmm1, (%rsp,%rax,2)
3185; AVX2-NEXT:    andl $31, %r12d
3186; AVX2-NEXT:    vpextrw $4, %xmm1, (%rsp,%r12,2)
3187; AVX2-NEXT:    andl $31, %r15d
3188; AVX2-NEXT:    vpextrw $5, %xmm1, (%rsp,%r15,2)
3189; AVX2-NEXT:    andl $31, %r14d
3190; AVX2-NEXT:    vpextrw $6, %xmm1, (%rsp,%r14,2)
3191; AVX2-NEXT:    andl $31, %ebx
3192; AVX2-NEXT:    vpextrw $7, %xmm1, (%rsp,%rbx,2)
3193; AVX2-NEXT:    andl $31, %r11d
3194; AVX2-NEXT:    vpextrw $0, %xmm2, (%rsp,%r11,2)
3195; AVX2-NEXT:    andl $31, %r10d
3196; AVX2-NEXT:    vpextrw $1, %xmm2, (%rsp,%r10,2)
3197; AVX2-NEXT:    andl $31, %r9d
3198; AVX2-NEXT:    vpextrw $2, %xmm2, (%rsp,%r9,2)
3199; AVX2-NEXT:    andl $31, %r8d
3200; AVX2-NEXT:    vpextrw $3, %xmm2, (%rsp,%r8,2)
3201; AVX2-NEXT:    andl $31, %edi
3202; AVX2-NEXT:    vpextrw $4, %xmm2, (%rsp,%rdi,2)
3203; AVX2-NEXT:    andl $31, %esi
3204; AVX2-NEXT:    vpextrw $5, %xmm2, (%rsp,%rsi,2)
3205; AVX2-NEXT:    andl $31, %edx
3206; AVX2-NEXT:    vpextrw $6, %xmm2, (%rsp,%rdx,2)
3207; AVX2-NEXT:    andl $31, %ecx
3208; AVX2-NEXT:    vpextrw $7, %xmm2, (%rsp,%rcx,2)
3209; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
3210; AVX2-NEXT:    cmpq $31, %rcx
3211; AVX2-NEXT:    movl $31, %eax
3212; AVX2-NEXT:    cmovbq %rcx, %rax
3213; AVX2-NEXT:    movl %eax, %eax
3214; AVX2-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
3215; AVX2-NEXT:    movw %cx, (%rsp,%rax,2)
3216; AVX2-NEXT:    vmovaps (%rsp), %ymm0
3217; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
3218; AVX2-NEXT:    leaq -40(%rbp), %rsp
3219; AVX2-NEXT:    popq %rbx
3220; AVX2-NEXT:    popq %r12
3221; AVX2-NEXT:    popq %r13
3222; AVX2-NEXT:    popq %r14
3223; AVX2-NEXT:    popq %r15
3224; AVX2-NEXT:    popq %rbp
3225; AVX2-NEXT:    retq
3226;
3227; AVX512F-LABEL: test_compress_v32i16:
3228; AVX512F:       # %bb.0:
3229; AVX512F-NEXT:    pushq %rbp
3230; AVX512F-NEXT:    movq %rsp, %rbp
3231; AVX512F-NEXT:    andq $-64, %rsp
3232; AVX512F-NEXT:    subq $128, %rsp
3233; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
3234; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm5
3235; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
3236; AVX512F-NEXT:    vpmovsxbd %xmm5, %zmm5
3237; AVX512F-NEXT:    vpslld $31, %zmm5, %zmm5
3238; AVX512F-NEXT:    vptestmd %zmm5, %zmm5, %k1
3239; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
3240; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
3241; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k2
3242; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3243; AVX512F-NEXT:    vpcompressd %zmm1, %zmm1 {%k2} {z}
3244; AVX512F-NEXT:    vpmovdw %zmm1, (%rsp)
3245; AVX512F-NEXT:    kshiftrw $8, %k2, %k0
3246; AVX512F-NEXT:    kxorw %k0, %k2, %k0
3247; AVX512F-NEXT:    kshiftrw $4, %k0, %k2
3248; AVX512F-NEXT:    kxorw %k2, %k0, %k0
3249; AVX512F-NEXT:    kshiftrw $2, %k0, %k2
3250; AVX512F-NEXT:    kxorw %k2, %k0, %k0
3251; AVX512F-NEXT:    kshiftrw $1, %k0, %k2
3252; AVX512F-NEXT:    kxorw %k2, %k0, %k0
3253; AVX512F-NEXT:    kmovw %k0, %eax
3254; AVX512F-NEXT:    andl $31, %eax
3255; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
3256; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3257; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
3258; AVX512F-NEXT:    vpmovdw %zmm0, (%rsp,%rax,2)
3259; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
3260; AVX512F-NEXT:    vpsllw $15, %ymm4, %ymm1
3261; AVX512F-NEXT:    vpsraw $15, %ymm1, %ymm1
3262; AVX512F-NEXT:    vpblendvb %ymm1, {{[0-9]+}}(%rsp), %ymm0, %ymm0
3263; AVX512F-NEXT:    vpsllw $15, %ymm3, %ymm1
3264; AVX512F-NEXT:    vpsraw $15, %ymm1, %ymm1
3265; AVX512F-NEXT:    vpblendvb %ymm1, (%rsp), %ymm2, %ymm1
3266; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3267; AVX512F-NEXT:    movq %rbp, %rsp
3268; AVX512F-NEXT:    popq %rbp
3269; AVX512F-NEXT:    retq
3270;
3271; AVX512VL-LABEL: test_compress_v32i16:
3272; AVX512VL:       # %bb.0:
3273; AVX512VL-NEXT:    vpsllw $7, %ymm1, %ymm1
3274; AVX512VL-NEXT:    vpmovb2m %ymm1, %k1
3275; AVX512VL-NEXT:    vpcompressw %zmm0, %zmm2 {%k1}
3276; AVX512VL-NEXT:    vmovdqa64 %zmm2, %zmm0
3277; AVX512VL-NEXT:    retq
3278    %out = call <32 x i16> @llvm.experimental.vector.compress(<32 x i16> %vec, <32 x i1> %mask, <32 x i16> %passthru)
3279    ret <32 x i16> %out
3280}
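
; Note (hand-written, not part of the autogenerated checks): vpcompressw and
; vpcompressb are AVX512-VBMI2 instructions, which is why the AVX512VL run
; above lowers this to a single vpcompressw, while the AVX2 and AVX512F runs
; expand the operation element by element through a stack temporary.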
3281
3282define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i32> %passthru) nounwind {
3283; AVX2-LABEL: test_compress_large:
3284; AVX2:       # %bb.0:
3285; AVX2-NEXT:    pushq %rbp
3286; AVX2-NEXT:    movq %rsp, %rbp
3287; AVX2-NEXT:    andq $-32, %rsp
3288; AVX2-NEXT:    subq $288, %rsp # imm = 0x120
3289; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3290; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3291; AVX2-NEXT:    # kill: def $r9d killed $r9d def $r9
3292; AVX2-NEXT:    # kill: def $r8d killed $r8d def $r8
3293; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
3294; AVX2-NEXT:    movq %rdi, %rax
3295; AVX2-NEXT:    vmovss %xmm0, (%rsp)
3296; AVX2-NEXT:    andl $1, %esi
3297; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rsi,4)
3298; AVX2-NEXT:    andl $1, %edx
3299; AVX2-NEXT:    addl %esi, %edx
3300; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rdx,4)
3301; AVX2-NEXT:    andl $1, %ecx
3302; AVX2-NEXT:    addl %edx, %ecx
3303; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rcx,4)
3304; AVX2-NEXT:    andl $1, %r8d
3305; AVX2-NEXT:    addl %ecx, %r8d
3306; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3307; AVX2-NEXT:    vmovss %xmm0, (%rsp,%r8,4)
3308; AVX2-NEXT:    andl $1, %r9d
3309; AVX2-NEXT:    addl %r8d, %r9d
3310; AVX2-NEXT:    movzbl 16(%rbp), %ecx
3311; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%r9,4)
3312; AVX2-NEXT:    movzbl %cl, %ecx
3313; AVX2-NEXT:    andl $1, %ecx
3314; AVX2-NEXT:    addl %r9d, %ecx
3315; AVX2-NEXT:    movzbl 24(%rbp), %edx
3316; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
3317; AVX2-NEXT:    movzbl %dl, %edx
3318; AVX2-NEXT:    andl $1, %edx
3319; AVX2-NEXT:    addl %ecx, %edx
3320; AVX2-NEXT:    movzbl 32(%rbp), %ecx
3321; AVX2-NEXT:    movzbl %cl, %ecx
3322; AVX2-NEXT:    andl $1, %ecx
3323; AVX2-NEXT:    addl %edx, %ecx
3324; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3325; AVX2-NEXT:    andl $63, %edx
3326; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
3327; AVX2-NEXT:    movzbl 40(%rbp), %edx
3328; AVX2-NEXT:    movzbl %dl, %edx
3329; AVX2-NEXT:    andl $1, %edx
3330; AVX2-NEXT:    addl %ecx, %edx
3331; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3332; AVX2-NEXT:    andl $63, %ecx
3333; AVX2-NEXT:    vmovss %xmm1, (%rsp,%rcx,4)
3334; AVX2-NEXT:    movzbl 48(%rbp), %ecx
3335; AVX2-NEXT:    movzbl %cl, %ecx
3336; AVX2-NEXT:    andl $1, %ecx
3337; AVX2-NEXT:    addl %edx, %ecx
3338; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3339; AVX2-NEXT:    andl $63, %edx
3340; AVX2-NEXT:    vextractps $1, %xmm1, (%rsp,%rdx,4)
3341; AVX2-NEXT:    movzbl 56(%rbp), %edx
3342; AVX2-NEXT:    movzbl %dl, %edx
3343; AVX2-NEXT:    andl $1, %edx
3344; AVX2-NEXT:    addl %ecx, %edx
3345; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3346; AVX2-NEXT:    andl $63, %ecx
3347; AVX2-NEXT:    vextractps $2, %xmm1, (%rsp,%rcx,4)
3348; AVX2-NEXT:    movzbl 64(%rbp), %ecx
3349; AVX2-NEXT:    movzbl %cl, %ecx
3350; AVX2-NEXT:    andl $1, %ecx
3351; AVX2-NEXT:    addl %edx, %ecx
3352; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3353; AVX2-NEXT:    andl $63, %edx
3354; AVX2-NEXT:    vextractps $3, %xmm1, (%rsp,%rdx,4)
3355; AVX2-NEXT:    movzbl 72(%rbp), %edx
3356; AVX2-NEXT:    movzbl %dl, %edx
3357; AVX2-NEXT:    andl $1, %edx
3358; AVX2-NEXT:    addl %ecx, %edx
3359; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3360; AVX2-NEXT:    andl $63, %ecx
3361; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
3362; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
3363; AVX2-NEXT:    movzbl 80(%rbp), %ecx
3364; AVX2-NEXT:    movzbl %cl, %ecx
3365; AVX2-NEXT:    andl $1, %ecx
3366; AVX2-NEXT:    addl %edx, %ecx
3367; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3368; AVX2-NEXT:    andl $63, %edx
3369; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
3370; AVX2-NEXT:    movzbl 88(%rbp), %edx
3371; AVX2-NEXT:    movzbl %dl, %edx
3372; AVX2-NEXT:    andl $1, %edx
3373; AVX2-NEXT:    addl %ecx, %edx
3374; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3375; AVX2-NEXT:    andl $63, %ecx
3376; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
3377; AVX2-NEXT:    movzbl 96(%rbp), %ecx
3378; AVX2-NEXT:    movzbl %cl, %ecx
3379; AVX2-NEXT:    andl $1, %ecx
3380; AVX2-NEXT:    addl %edx, %ecx
3381; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3382; AVX2-NEXT:    andl $63, %edx
3383; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
3384; AVX2-NEXT:    movzbl 104(%rbp), %edx
3385; AVX2-NEXT:    movzbl %dl, %edx
3386; AVX2-NEXT:    andl $1, %edx
3387; AVX2-NEXT:    addl %ecx, %edx
3388; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3389; AVX2-NEXT:    andl $63, %ecx
3390; AVX2-NEXT:    vmovss %xmm2, (%rsp,%rcx,4)
3391; AVX2-NEXT:    movzbl 112(%rbp), %ecx
3392; AVX2-NEXT:    movzbl %cl, %ecx
3393; AVX2-NEXT:    andl $1, %ecx
3394; AVX2-NEXT:    addl %edx, %ecx
3395; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3396; AVX2-NEXT:    andl $63, %edx
3397; AVX2-NEXT:    vextractps $1, %xmm2, (%rsp,%rdx,4)
3398; AVX2-NEXT:    movzbl 120(%rbp), %edx
3399; AVX2-NEXT:    movzbl %dl, %edx
3400; AVX2-NEXT:    andl $1, %edx
3401; AVX2-NEXT:    addl %ecx, %edx
3402; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3403; AVX2-NEXT:    andl $63, %ecx
3404; AVX2-NEXT:    vextractps $2, %xmm2, (%rsp,%rcx,4)
3405; AVX2-NEXT:    movzbl 128(%rbp), %ecx
3406; AVX2-NEXT:    movzbl %cl, %ecx
3407; AVX2-NEXT:    andl $1, %ecx
3408; AVX2-NEXT:    addl %edx, %ecx
3409; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3410; AVX2-NEXT:    andl $63, %edx
3411; AVX2-NEXT:    vextractps $3, %xmm2, (%rsp,%rdx,4)
3412; AVX2-NEXT:    movzbl 136(%rbp), %edx
3413; AVX2-NEXT:    movzbl %dl, %edx
3414; AVX2-NEXT:    andl $1, %edx
3415; AVX2-NEXT:    addl %ecx, %edx
3416; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3417; AVX2-NEXT:    andl $63, %ecx
3418; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm0
3419; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
3420; AVX2-NEXT:    movzbl 144(%rbp), %ecx
3421; AVX2-NEXT:    movzbl %cl, %ecx
3422; AVX2-NEXT:    andl $1, %ecx
3423; AVX2-NEXT:    addl %edx, %ecx
3424; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3425; AVX2-NEXT:    andl $63, %edx
3426; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
3427; AVX2-NEXT:    movzbl 152(%rbp), %edx
3428; AVX2-NEXT:    movzbl %dl, %edx
3429; AVX2-NEXT:    andl $1, %edx
3430; AVX2-NEXT:    addl %ecx, %edx
3431; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3432; AVX2-NEXT:    andl $63, %ecx
3433; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
3434; AVX2-NEXT:    movzbl 160(%rbp), %ecx
3435; AVX2-NEXT:    movzbl %cl, %ecx
3436; AVX2-NEXT:    andl $1, %ecx
3437; AVX2-NEXT:    addl %edx, %ecx
3438; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3439; AVX2-NEXT:    andl $63, %edx
3440; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
3441; AVX2-NEXT:    movzbl 168(%rbp), %edx
3442; AVX2-NEXT:    movzbl %dl, %edx
3443; AVX2-NEXT:    andl $1, %edx
3444; AVX2-NEXT:    addl %ecx, %edx
3445; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3446; AVX2-NEXT:    andl $63, %ecx
3447; AVX2-NEXT:    vmovss %xmm3, (%rsp,%rcx,4)
3448; AVX2-NEXT:    movzbl 176(%rbp), %ecx
3449; AVX2-NEXT:    movzbl %cl, %ecx
3450; AVX2-NEXT:    andl $1, %ecx
3451; AVX2-NEXT:    addl %edx, %ecx
3452; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3453; AVX2-NEXT:    andl $63, %edx
3454; AVX2-NEXT:    vextractps $1, %xmm3, (%rsp,%rdx,4)
3455; AVX2-NEXT:    movzbl 184(%rbp), %edx
3456; AVX2-NEXT:    movzbl %dl, %edx
3457; AVX2-NEXT:    andl $1, %edx
3458; AVX2-NEXT:    addl %ecx, %edx
3459; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3460; AVX2-NEXT:    andl $63, %ecx
3461; AVX2-NEXT:    vextractps $2, %xmm3, (%rsp,%rcx,4)
3462; AVX2-NEXT:    movzbl 192(%rbp), %ecx
3463; AVX2-NEXT:    movzbl %cl, %ecx
3464; AVX2-NEXT:    andl $1, %ecx
3465; AVX2-NEXT:    addl %edx, %ecx
3466; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3467; AVX2-NEXT:    andl $63, %edx
3468; AVX2-NEXT:    vextractps $3, %xmm3, (%rsp,%rdx,4)
3469; AVX2-NEXT:    movzbl 200(%rbp), %edx
3470; AVX2-NEXT:    movzbl %dl, %edx
3471; AVX2-NEXT:    andl $1, %edx
3472; AVX2-NEXT:    addl %ecx, %edx
3473; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3474; AVX2-NEXT:    andl $63, %ecx
3475; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm0
3476; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
3477; AVX2-NEXT:    movzbl 208(%rbp), %ecx
3478; AVX2-NEXT:    movzbl %cl, %ecx
3479; AVX2-NEXT:    andl $1, %ecx
3480; AVX2-NEXT:    addl %edx, %ecx
3481; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3482; AVX2-NEXT:    andl $63, %edx
3483; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
3484; AVX2-NEXT:    movzbl 216(%rbp), %edx
3485; AVX2-NEXT:    movzbl %dl, %edx
3486; AVX2-NEXT:    andl $1, %edx
3487; AVX2-NEXT:    addl %ecx, %edx
3488; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3489; AVX2-NEXT:    andl $63, %ecx
3490; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
3491; AVX2-NEXT:    movzbl 224(%rbp), %ecx
3492; AVX2-NEXT:    movzbl %cl, %ecx
3493; AVX2-NEXT:    andl $1, %ecx
3494; AVX2-NEXT:    addl %edx, %ecx
3495; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3496; AVX2-NEXT:    andl $63, %edx
3497; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
3498; AVX2-NEXT:    movzbl 232(%rbp), %edx
3499; AVX2-NEXT:    movzbl %dl, %edx
3500; AVX2-NEXT:    andl $1, %edx
3501; AVX2-NEXT:    addl %ecx, %edx
3502; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3503; AVX2-NEXT:    andl $63, %ecx
3504; AVX2-NEXT:    vmovss %xmm4, (%rsp,%rcx,4)
3505; AVX2-NEXT:    movzbl 240(%rbp), %ecx
3506; AVX2-NEXT:    movzbl %cl, %ecx
3507; AVX2-NEXT:    andl $1, %ecx
3508; AVX2-NEXT:    addl %edx, %ecx
3509; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3510; AVX2-NEXT:    andl $63, %edx
3511; AVX2-NEXT:    vextractps $1, %xmm4, (%rsp,%rdx,4)
3512; AVX2-NEXT:    movzbl 248(%rbp), %edx
3513; AVX2-NEXT:    movzbl %dl, %edx
3514; AVX2-NEXT:    andl $1, %edx
3515; AVX2-NEXT:    addl %ecx, %edx
3516; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3517; AVX2-NEXT:    andl $63, %ecx
3518; AVX2-NEXT:    vextractps $2, %xmm4, (%rsp,%rcx,4)
3519; AVX2-NEXT:    movzbl 256(%rbp), %ecx
3520; AVX2-NEXT:    movzbl %cl, %ecx
3521; AVX2-NEXT:    andl $1, %ecx
3522; AVX2-NEXT:    addl %edx, %ecx
3523; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3524; AVX2-NEXT:    andl $63, %edx
3525; AVX2-NEXT:    vextractps $3, %xmm4, (%rsp,%rdx,4)
3526; AVX2-NEXT:    movzbl 264(%rbp), %edx
3527; AVX2-NEXT:    movzbl %dl, %edx
3528; AVX2-NEXT:    andl $1, %edx
3529; AVX2-NEXT:    addl %ecx, %edx
3530; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3531; AVX2-NEXT:    andl $63, %ecx
3532; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm0
3533; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
3534; AVX2-NEXT:    movzbl 272(%rbp), %ecx
3535; AVX2-NEXT:    movzbl %cl, %ecx
3536; AVX2-NEXT:    andl $1, %ecx
3537; AVX2-NEXT:    addl %edx, %ecx
3538; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3539; AVX2-NEXT:    andl $63, %edx
3540; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
3541; AVX2-NEXT:    movzbl 280(%rbp), %edx
3542; AVX2-NEXT:    movzbl %dl, %edx
3543; AVX2-NEXT:    andl $1, %edx
3544; AVX2-NEXT:    addl %ecx, %edx
3545; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3546; AVX2-NEXT:    andl $63, %ecx
3547; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
3548; AVX2-NEXT:    movzbl 288(%rbp), %ecx
3549; AVX2-NEXT:    movzbl %cl, %ecx
3550; AVX2-NEXT:    andl $1, %ecx
3551; AVX2-NEXT:    addl %edx, %ecx
3552; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3553; AVX2-NEXT:    andl $63, %edx
3554; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
3555; AVX2-NEXT:    movzbl 296(%rbp), %edx
3556; AVX2-NEXT:    movzbl %dl, %edx
3557; AVX2-NEXT:    andl $1, %edx
3558; AVX2-NEXT:    addl %ecx, %edx
3559; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3560; AVX2-NEXT:    andl $63, %ecx
3561; AVX2-NEXT:    vmovss %xmm5, (%rsp,%rcx,4)
3562; AVX2-NEXT:    movzbl 304(%rbp), %ecx
3563; AVX2-NEXT:    movzbl %cl, %ecx
3564; AVX2-NEXT:    andl $1, %ecx
3565; AVX2-NEXT:    addl %edx, %ecx
3566; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3567; AVX2-NEXT:    andl $63, %edx
3568; AVX2-NEXT:    vextractps $1, %xmm5, (%rsp,%rdx,4)
3569; AVX2-NEXT:    movzbl 312(%rbp), %edx
3570; AVX2-NEXT:    movzbl %dl, %edx
3571; AVX2-NEXT:    andl $1, %edx
3572; AVX2-NEXT:    addl %ecx, %edx
3573; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3574; AVX2-NEXT:    andl $63, %ecx
3575; AVX2-NEXT:    vextractps $2, %xmm5, (%rsp,%rcx,4)
3576; AVX2-NEXT:    movzbl 320(%rbp), %ecx
3577; AVX2-NEXT:    movzbl %cl, %ecx
3578; AVX2-NEXT:    andl $1, %ecx
3579; AVX2-NEXT:    addl %edx, %ecx
3580; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3581; AVX2-NEXT:    andl $63, %edx
3582; AVX2-NEXT:    vextractps $3, %xmm5, (%rsp,%rdx,4)
3583; AVX2-NEXT:    movzbl 328(%rbp), %edx
3584; AVX2-NEXT:    movzbl %dl, %edx
3585; AVX2-NEXT:    andl $1, %edx
3586; AVX2-NEXT:    addl %ecx, %edx
3587; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3588; AVX2-NEXT:    andl $63, %ecx
3589; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm0
3590; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
3591; AVX2-NEXT:    movzbl 336(%rbp), %ecx
3592; AVX2-NEXT:    movzbl %cl, %ecx
3593; AVX2-NEXT:    andl $1, %ecx
3594; AVX2-NEXT:    addl %edx, %ecx
3595; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3596; AVX2-NEXT:    andl $63, %edx
3597; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
3598; AVX2-NEXT:    movzbl 344(%rbp), %edx
3599; AVX2-NEXT:    movzbl %dl, %edx
3600; AVX2-NEXT:    andl $1, %edx
3601; AVX2-NEXT:    addl %ecx, %edx
3602; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3603; AVX2-NEXT:    andl $63, %ecx
3604; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
3605; AVX2-NEXT:    movzbl 352(%rbp), %ecx
3606; AVX2-NEXT:    movzbl %cl, %ecx
3607; AVX2-NEXT:    andl $1, %ecx
3608; AVX2-NEXT:    addl %edx, %ecx
3609; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3610; AVX2-NEXT:    andl $63, %edx
3611; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
3612; AVX2-NEXT:    movzbl 360(%rbp), %edx
3613; AVX2-NEXT:    movzbl %dl, %edx
3614; AVX2-NEXT:    andl $1, %edx
3615; AVX2-NEXT:    addl %ecx, %edx
3616; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3617; AVX2-NEXT:    andl $63, %ecx
3618; AVX2-NEXT:    vmovss %xmm6, (%rsp,%rcx,4)
3619; AVX2-NEXT:    movzbl 368(%rbp), %ecx
3620; AVX2-NEXT:    movzbl %cl, %ecx
3621; AVX2-NEXT:    andl $1, %ecx
3622; AVX2-NEXT:    addl %edx, %ecx
3623; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3624; AVX2-NEXT:    andl $63, %edx
3625; AVX2-NEXT:    vextractps $1, %xmm6, (%rsp,%rdx,4)
3626; AVX2-NEXT:    movzbl 376(%rbp), %edx
3627; AVX2-NEXT:    movzbl %dl, %edx
3628; AVX2-NEXT:    andl $1, %edx
3629; AVX2-NEXT:    addl %ecx, %edx
3630; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3631; AVX2-NEXT:    andl $63, %ecx
3632; AVX2-NEXT:    vextractps $2, %xmm6, (%rsp,%rcx,4)
3633; AVX2-NEXT:    movzbl 384(%rbp), %ecx
3634; AVX2-NEXT:    movzbl %cl, %ecx
3635; AVX2-NEXT:    andl $1, %ecx
3636; AVX2-NEXT:    addl %edx, %ecx
3637; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3638; AVX2-NEXT:    andl $63, %edx
3639; AVX2-NEXT:    vextractps $3, %xmm6, (%rsp,%rdx,4)
3640; AVX2-NEXT:    movzbl 392(%rbp), %edx
3641; AVX2-NEXT:    movzbl %dl, %edx
3642; AVX2-NEXT:    andl $1, %edx
3643; AVX2-NEXT:    addl %ecx, %edx
3644; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3645; AVX2-NEXT:    andl $63, %ecx
3646; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm0
3647; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
3648; AVX2-NEXT:    movzbl 400(%rbp), %ecx
3649; AVX2-NEXT:    movzbl %cl, %ecx
3650; AVX2-NEXT:    andl $1, %ecx
3651; AVX2-NEXT:    addl %edx, %ecx
3652; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3653; AVX2-NEXT:    andl $63, %edx
3654; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
3655; AVX2-NEXT:    movzbl 408(%rbp), %edx
3656; AVX2-NEXT:    movzbl %dl, %edx
3657; AVX2-NEXT:    andl $1, %edx
3658; AVX2-NEXT:    addl %ecx, %edx
3659; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3660; AVX2-NEXT:    andl $63, %ecx
3661; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
3662; AVX2-NEXT:    movzbl 416(%rbp), %ecx
3663; AVX2-NEXT:    movzbl %cl, %ecx
3664; AVX2-NEXT:    andl $1, %ecx
3665; AVX2-NEXT:    addl %edx, %ecx
3666; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3667; AVX2-NEXT:    andl $63, %edx
3668; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
3669; AVX2-NEXT:    movzbl 424(%rbp), %edx
3670; AVX2-NEXT:    movzbl %dl, %edx
3671; AVX2-NEXT:    andl $1, %edx
3672; AVX2-NEXT:    addl %ecx, %edx
3673; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3674; AVX2-NEXT:    andl $63, %ecx
3675; AVX2-NEXT:    vmovss %xmm7, (%rsp,%rcx,4)
3676; AVX2-NEXT:    movzbl 432(%rbp), %ecx
3677; AVX2-NEXT:    movzbl %cl, %ecx
3678; AVX2-NEXT:    andl $1, %ecx
3679; AVX2-NEXT:    addl %edx, %ecx
3680; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3681; AVX2-NEXT:    andl $63, %edx
3682; AVX2-NEXT:    vextractps $1, %xmm7, (%rsp,%rdx,4)
3683; AVX2-NEXT:    movzbl 440(%rbp), %edx
3684; AVX2-NEXT:    movzbl %dl, %edx
3685; AVX2-NEXT:    andl $1, %edx
3686; AVX2-NEXT:    addl %ecx, %edx
3687; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3688; AVX2-NEXT:    andl $63, %ecx
3689; AVX2-NEXT:    vextractps $2, %xmm7, (%rsp,%rcx,4)
3690; AVX2-NEXT:    movzbl 448(%rbp), %ecx
3691; AVX2-NEXT:    movzbl %cl, %ecx
3692; AVX2-NEXT:    andl $1, %ecx
3693; AVX2-NEXT:    addl %edx, %ecx
3694; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3695; AVX2-NEXT:    andl $63, %edx
3696; AVX2-NEXT:    vextractps $3, %xmm7, (%rsp,%rdx,4)
3697; AVX2-NEXT:    movzbl 456(%rbp), %edx
3698; AVX2-NEXT:    movzbl %dl, %edx
3699; AVX2-NEXT:    andl $1, %edx
3700; AVX2-NEXT:    addl %ecx, %edx
3701; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3702; AVX2-NEXT:    andl $63, %ecx
3703; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm0
3704; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
3705; AVX2-NEXT:    movzbl 464(%rbp), %ecx
3706; AVX2-NEXT:    movzbl %cl, %ecx
3707; AVX2-NEXT:    andl $1, %ecx
3708; AVX2-NEXT:    addl %edx, %ecx
3709; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
3710; AVX2-NEXT:    andl $63, %edx
3711; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
3712; AVX2-NEXT:    movzbl 472(%rbp), %edx
3713; AVX2-NEXT:    movzbl %dl, %edx
3714; AVX2-NEXT:    andl $1, %edx
3715; AVX2-NEXT:    addl %ecx, %edx
3716; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
3717; AVX2-NEXT:    andl $63, %ecx
3718; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
3719; AVX2-NEXT:    andl $63, %edx
3720; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
3721; AVX2-NEXT:    vmovaps (%rsp), %ymm0
3722; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
3723; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm2
3724; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm3
3725; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm4
3726; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm5
3727; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm6
3728; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm7
3729; AVX2-NEXT:    vmovaps %ymm7, 224(%rdi)
3730; AVX2-NEXT:    vmovaps %ymm6, 192(%rdi)
3731; AVX2-NEXT:    vmovaps %ymm5, 160(%rdi)
3732; AVX2-NEXT:    vmovaps %ymm4, 128(%rdi)
3733; AVX2-NEXT:    vmovaps %ymm3, 96(%rdi)
3734; AVX2-NEXT:    vmovaps %ymm2, 64(%rdi)
3735; AVX2-NEXT:    vmovaps %ymm1, 32(%rdi)
3736; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
3737; AVX2-NEXT:    movq %rbp, %rsp
3738; AVX2-NEXT:    popq %rbp
3739; AVX2-NEXT:    vzeroupper
3740; AVX2-NEXT:    retq
3741;
3742; AVX512F-LABEL: test_compress_large:
3743; AVX512F:       # %bb.0:
3744; AVX512F-NEXT:    pushq %rbp
3745; AVX512F-NEXT:    movq %rsp, %rbp
3746; AVX512F-NEXT:    andq $-64, %rsp
3747; AVX512F-NEXT:    subq $640, %rsp # imm = 0x280
3748; AVX512F-NEXT:    movzbl 352(%rbp), %eax
3749; AVX512F-NEXT:    andl $1, %eax
3750; AVX512F-NEXT:    kmovw %eax, %k0
3751; AVX512F-NEXT:    movzbl 360(%rbp), %eax
3752; AVX512F-NEXT:    kmovw %eax, %k1
3753; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3754; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
3755; AVX512F-NEXT:    korw %k1, %k0, %k0
3756; AVX512F-NEXT:    movw $-5, %ax
3757; AVX512F-NEXT:    kmovw %eax, %k1
3758; AVX512F-NEXT:    kandw %k1, %k0, %k0
3759; AVX512F-NEXT:    kmovw %k1, %k3
3760; AVX512F-NEXT:    movzbl 368(%rbp), %eax
3761; AVX512F-NEXT:    kmovw %eax, %k1
3762; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3763; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
3764; AVX512F-NEXT:    korw %k1, %k0, %k0
3765; AVX512F-NEXT:    movw $-9, %ax
3766; AVX512F-NEXT:    kmovw %eax, %k7
3767; AVX512F-NEXT:    kandw %k7, %k0, %k0
3768; AVX512F-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3769; AVX512F-NEXT:    movzbl 376(%rbp), %eax
3770; AVX512F-NEXT:    kmovw %eax, %k1
3771; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3772; AVX512F-NEXT:    kshiftrw $12, %k1, %k1
3773; AVX512F-NEXT:    korw %k1, %k0, %k0
3774; AVX512F-NEXT:    movw $-17, %ax
3775; AVX512F-NEXT:    kmovw %eax, %k5
3776; AVX512F-NEXT:    kandw %k5, %k0, %k0
3777; AVX512F-NEXT:    movzbl 384(%rbp), %eax
3778; AVX512F-NEXT:    kmovw %eax, %k1
3779; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3780; AVX512F-NEXT:    kshiftrw $11, %k1, %k1
3781; AVX512F-NEXT:    korw %k1, %k0, %k0
3782; AVX512F-NEXT:    movw $-33, %ax
3783; AVX512F-NEXT:    kmovw %eax, %k6
3784; AVX512F-NEXT:    kandw %k6, %k0, %k0
3785; AVX512F-NEXT:    movzbl 392(%rbp), %eax
3786; AVX512F-NEXT:    kmovw %eax, %k1
3787; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3788; AVX512F-NEXT:    kshiftrw $10, %k1, %k1
3789; AVX512F-NEXT:    korw %k1, %k0, %k0
3790; AVX512F-NEXT:    movw $-65, %ax
3791; AVX512F-NEXT:    kmovw %eax, %k1
3792; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3793; AVX512F-NEXT:    kandw %k1, %k0, %k0
3794; AVX512F-NEXT:    movzbl 400(%rbp), %eax
3795; AVX512F-NEXT:    kmovw %eax, %k1
3796; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3797; AVX512F-NEXT:    kshiftrw $9, %k1, %k1
3798; AVX512F-NEXT:    korw %k1, %k0, %k0
3799; AVX512F-NEXT:    movw $-129, %ax
3800; AVX512F-NEXT:    kmovw %eax, %k1
3801; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3802; AVX512F-NEXT:    kandw %k1, %k0, %k0
3803; AVX512F-NEXT:    movzbl 408(%rbp), %eax
3804; AVX512F-NEXT:    kmovw %eax, %k1
3805; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3806; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
3807; AVX512F-NEXT:    korw %k1, %k0, %k0
3808; AVX512F-NEXT:    movw $-257, %ax # imm = 0xFEFF
3809; AVX512F-NEXT:    kmovw %eax, %k1
3810; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3811; AVX512F-NEXT:    kandw %k1, %k0, %k0
3812; AVX512F-NEXT:    movzbl 416(%rbp), %eax
3813; AVX512F-NEXT:    kmovw %eax, %k1
3814; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3815; AVX512F-NEXT:    kshiftrw $7, %k1, %k1
3816; AVX512F-NEXT:    korw %k1, %k0, %k0
3817; AVX512F-NEXT:    movw $-513, %ax # imm = 0xFDFF
3818; AVX512F-NEXT:    kmovw %eax, %k1
3819; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3820; AVX512F-NEXT:    kandw %k1, %k0, %k0
3821; AVX512F-NEXT:    movzbl 424(%rbp), %eax
3822; AVX512F-NEXT:    kmovw %eax, %k1
3823; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3824; AVX512F-NEXT:    kshiftrw $6, %k1, %k1
3825; AVX512F-NEXT:    korw %k1, %k0, %k0
3826; AVX512F-NEXT:    movw $-1025, %ax # imm = 0xFBFF
3827; AVX512F-NEXT:    kmovw %eax, %k1
3828; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3829; AVX512F-NEXT:    kandw %k1, %k0, %k0
3830; AVX512F-NEXT:    movzbl 432(%rbp), %eax
3831; AVX512F-NEXT:    kmovw %eax, %k1
3832; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3833; AVX512F-NEXT:    kshiftrw $5, %k1, %k1
3834; AVX512F-NEXT:    korw %k1, %k0, %k0
3835; AVX512F-NEXT:    movw $-2049, %ax # imm = 0xF7FF
3836; AVX512F-NEXT:    kmovw %eax, %k1
3837; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3838; AVX512F-NEXT:    kandw %k1, %k0, %k0
3839; AVX512F-NEXT:    movzbl 440(%rbp), %eax
3840; AVX512F-NEXT:    kmovw %eax, %k1
3841; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3842; AVX512F-NEXT:    kshiftrw $4, %k1, %k1
3843; AVX512F-NEXT:    korw %k1, %k0, %k0
3844; AVX512F-NEXT:    movw $-4097, %ax # imm = 0xEFFF
3845; AVX512F-NEXT:    kmovw %eax, %k1
3846; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3847; AVX512F-NEXT:    kandw %k1, %k0, %k0
3848; AVX512F-NEXT:    movzbl 448(%rbp), %eax
3849; AVX512F-NEXT:    kmovw %eax, %k1
3850; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3851; AVX512F-NEXT:    kshiftrw $3, %k1, %k1
3852; AVX512F-NEXT:    korw %k1, %k0, %k0
3853; AVX512F-NEXT:    movw $-8193, %ax # imm = 0xDFFF
3854; AVX512F-NEXT:    kmovw %eax, %k1
3855; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3856; AVX512F-NEXT:    kandw %k1, %k0, %k0
3857; AVX512F-NEXT:    movzbl 456(%rbp), %eax
3858; AVX512F-NEXT:    kmovw %eax, %k1
3859; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3860; AVX512F-NEXT:    kshiftrw $2, %k1, %k1
3861; AVX512F-NEXT:    korw %k1, %k0, %k1
3862; AVX512F-NEXT:    movw $-16385, %ax # imm = 0xBFFF
3863; AVX512F-NEXT:    kmovw %eax, %k4
3864; AVX512F-NEXT:    kandw %k4, %k1, %k1
3865; AVX512F-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3866; AVX512F-NEXT:    movzbl 464(%rbp), %eax
3867; AVX512F-NEXT:    kmovw %eax, %k2
3868; AVX512F-NEXT:    kshiftlw $14, %k2, %k2
3869; AVX512F-NEXT:    korw %k2, %k1, %k1
3870; AVX512F-NEXT:    kshiftlw $1, %k1, %k1
3871; AVX512F-NEXT:    kshiftrw $1, %k1, %k1
3872; AVX512F-NEXT:    movzbl 472(%rbp), %eax
3873; AVX512F-NEXT:    kmovw %eax, %k2
3874; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
3875; AVX512F-NEXT:    korw %k2, %k1, %k1
3876; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3877; AVX512F-NEXT:    movzbl 224(%rbp), %eax
3878; AVX512F-NEXT:    andl $1, %eax
3879; AVX512F-NEXT:    movzbl 232(%rbp), %r10d
3880; AVX512F-NEXT:    kmovw %r10d, %k1
3881; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3882; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
3883; AVX512F-NEXT:    kmovw %eax, %k2
3884; AVX512F-NEXT:    korw %k1, %k2, %k1
3885; AVX512F-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3886; AVX512F-NEXT:    kandw %k3, %k1, %k1
3887; AVX512F-NEXT:    movzbl 240(%rbp), %eax
3888; AVX512F-NEXT:    kmovw %eax, %k2
3889; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
3890; AVX512F-NEXT:    kshiftrw $13, %k2, %k2
3891; AVX512F-NEXT:    korw %k2, %k1, %k1
3892; AVX512F-NEXT:    kandw %k7, %k1, %k1
3893; AVX512F-NEXT:    movzbl 248(%rbp), %eax
3894; AVX512F-NEXT:    kmovw %eax, %k2
3895; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
3896; AVX512F-NEXT:    kshiftrw $12, %k2, %k2
3897; AVX512F-NEXT:    korw %k2, %k1, %k1
3898; AVX512F-NEXT:    kandw %k5, %k1, %k1
3899; AVX512F-NEXT:    movzbl 256(%rbp), %eax
3900; AVX512F-NEXT:    kmovw %eax, %k2
3901; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
3902; AVX512F-NEXT:    kshiftrw $11, %k2, %k2
3903; AVX512F-NEXT:    korw %k2, %k1, %k1
3904; AVX512F-NEXT:    kandw %k6, %k1, %k1
3905; AVX512F-NEXT:    movzbl 264(%rbp), %eax
3906; AVX512F-NEXT:    kmovw %eax, %k2
3907; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
3908; AVX512F-NEXT:    kshiftrw $10, %k2, %k2
3909; AVX512F-NEXT:    korw %k2, %k1, %k1
3910; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
3911; AVX512F-NEXT:    kandw %k7, %k1, %k1
3912; AVX512F-NEXT:    movzbl 272(%rbp), %eax
3913; AVX512F-NEXT:    kmovw %eax, %k2
3914; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
3915; AVX512F-NEXT:    kshiftrw $9, %k2, %k2
3916; AVX512F-NEXT:    korw %k2, %k1, %k0
3917; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3918; AVX512F-NEXT:    movzbl 280(%rbp), %eax
3919; AVX512F-NEXT:    kmovw %eax, %k1
3920; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3921; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3922; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
3923; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3924; AVX512F-NEXT:    kandw %k2, %k0, %k2
3925; AVX512F-NEXT:    korw %k1, %k2, %k1
3926; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
3927; AVX512F-NEXT:    kandw %k0, %k1, %k1
3928; AVX512F-NEXT:    movzbl 288(%rbp), %eax
3929; AVX512F-NEXT:    kmovw %eax, %k0
3930; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3931; AVX512F-NEXT:    kshiftlw $15, %k0, %k2
3932; AVX512F-NEXT:    kshiftrw $7, %k2, %k2
3933; AVX512F-NEXT:    korw %k2, %k1, %k1
3934; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
3935; AVX512F-NEXT:    kandw %k0, %k1, %k1
3936; AVX512F-NEXT:    movzbl 296(%rbp), %eax
3937; AVX512F-NEXT:    kmovw %eax, %k2
3938; AVX512F-NEXT:    kshiftlw $15, %k2, %k0
3939; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3940; AVX512F-NEXT:    kshiftrw $6, %k0, %k2
3941; AVX512F-NEXT:    korw %k2, %k1, %k1
3942; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
3943; AVX512F-NEXT:    kandw %k0, %k1, %k1
3944; AVX512F-NEXT:    movzbl 304(%rbp), %eax
3945; AVX512F-NEXT:    kmovw %eax, %k2
3946; AVX512F-NEXT:    kshiftlw $15, %k2, %k0
3947; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3948; AVX512F-NEXT:    kshiftrw $5, %k0, %k2
3949; AVX512F-NEXT:    korw %k2, %k1, %k1
3950; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
3951; AVX512F-NEXT:    kandw %k0, %k1, %k1
3952; AVX512F-NEXT:    movzbl 312(%rbp), %eax
3953; AVX512F-NEXT:    kmovw %eax, %k2
3954; AVX512F-NEXT:    kshiftlw $15, %k2, %k0
3955; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3956; AVX512F-NEXT:    kshiftrw $4, %k0, %k2
3957; AVX512F-NEXT:    korw %k2, %k1, %k1
3958; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
3959; AVX512F-NEXT:    kandw %k0, %k1, %k1
3960; AVX512F-NEXT:    movzbl 320(%rbp), %eax
3961; AVX512F-NEXT:    kmovw %eax, %k2
3962; AVX512F-NEXT:    kshiftlw $15, %k2, %k0
3963; AVX512F-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3964; AVX512F-NEXT:    kshiftrw $3, %k0, %k2
3965; AVX512F-NEXT:    korw %k2, %k1, %k1
3966; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
3967; AVX512F-NEXT:    kandw %k0, %k1, %k1
3968; AVX512F-NEXT:    movzbl 328(%rbp), %eax
3969; AVX512F-NEXT:    kmovw %eax, %k2
3970; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
3971; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3972; AVX512F-NEXT:    kshiftrw $2, %k2, %k2
3973; AVX512F-NEXT:    korw %k2, %k1, %k1
3974; AVX512F-NEXT:    kandw %k4, %k1, %k1
3975; AVX512F-NEXT:    movzbl 336(%rbp), %eax
3976; AVX512F-NEXT:    kmovw %eax, %k2
3977; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3978; AVX512F-NEXT:    kshiftlw $14, %k2, %k2
3979; AVX512F-NEXT:    korw %k2, %k1, %k1
3980; AVX512F-NEXT:    kshiftlw $1, %k1, %k1
3981; AVX512F-NEXT:    kshiftrw $1, %k1, %k1
3982; AVX512F-NEXT:    movzbl 344(%rbp), %eax
3983; AVX512F-NEXT:    kmovw %eax, %k2
3984; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3985; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
3986; AVX512F-NEXT:    korw %k2, %k1, %k1
3987; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3988; AVX512F-NEXT:    movzbl 96(%rbp), %eax
3989; AVX512F-NEXT:    andl $1, %eax
3990; AVX512F-NEXT:    movzbl 104(%rbp), %r10d
3991; AVX512F-NEXT:    kmovw %r10d, %k1
3992; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
3993; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
3994; AVX512F-NEXT:    kmovw %eax, %k2
3995; AVX512F-NEXT:    korw %k1, %k2, %k1
3996; AVX512F-NEXT:    kandw %k3, %k1, %k1
3997; AVX512F-NEXT:    movzbl 112(%rbp), %eax
3998; AVX512F-NEXT:    kmovw %eax, %k2
3999; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4000; AVX512F-NEXT:    kshiftrw $13, %k2, %k2
4001; AVX512F-NEXT:    korw %k2, %k1, %k1
4002; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4003; AVX512F-NEXT:    kandw %k4, %k1, %k1
4004; AVX512F-NEXT:    movzbl 120(%rbp), %eax
4005; AVX512F-NEXT:    kmovw %eax, %k2
4006; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4007; AVX512F-NEXT:    kshiftrw $12, %k2, %k2
4008; AVX512F-NEXT:    korw %k2, %k1, %k1
4009; AVX512F-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4010; AVX512F-NEXT:    kandw %k5, %k1, %k1
4011; AVX512F-NEXT:    movzbl 128(%rbp), %eax
4012; AVX512F-NEXT:    kmovw %eax, %k2
4013; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4014; AVX512F-NEXT:    kshiftrw $11, %k2, %k2
4015; AVX512F-NEXT:    korw %k2, %k1, %k1
4016; AVX512F-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4017; AVX512F-NEXT:    kandw %k6, %k1, %k1
4018; AVX512F-NEXT:    movzbl 136(%rbp), %eax
4019; AVX512F-NEXT:    kmovw %eax, %k2
4020; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4021; AVX512F-NEXT:    kshiftrw $10, %k2, %k2
4022; AVX512F-NEXT:    korw %k2, %k1, %k1
4023; AVX512F-NEXT:    kandw %k7, %k1, %k1
4024; AVX512F-NEXT:    movzbl 144(%rbp), %eax
4025; AVX512F-NEXT:    kmovw %eax, %k2
4026; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4027; AVX512F-NEXT:    kshiftrw $9, %k2, %k2
4028; AVX512F-NEXT:    korw %k2, %k1, %k1
4029; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4030; AVX512F-NEXT:    kandw %k2, %k1, %k1
4031; AVX512F-NEXT:    movzbl 152(%rbp), %eax
4032; AVX512F-NEXT:    kmovw %eax, %k2
4033; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4034; AVX512F-NEXT:    kshiftrw $8, %k2, %k2
4035; AVX512F-NEXT:    korw %k2, %k1, %k1
4036; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4037; AVX512F-NEXT:    kandw %k3, %k1, %k1
4038; AVX512F-NEXT:    movzbl 160(%rbp), %eax
4039; AVX512F-NEXT:    kmovw %eax, %k2
4040; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4041; AVX512F-NEXT:    kshiftrw $7, %k2, %k2
4042; AVX512F-NEXT:    korw %k2, %k1, %k1
4043; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4044; AVX512F-NEXT:    kandw %k7, %k1, %k1
4045; AVX512F-NEXT:    movzbl 168(%rbp), %eax
4046; AVX512F-NEXT:    kmovw %eax, %k2
4047; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4048; AVX512F-NEXT:    kshiftrw $6, %k2, %k2
4049; AVX512F-NEXT:    korw %k2, %k1, %k1
4050; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4051; AVX512F-NEXT:    kandw %k2, %k1, %k1
4052; AVX512F-NEXT:    movzbl 176(%rbp), %eax
4053; AVX512F-NEXT:    kmovw %eax, %k2
4054; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4055; AVX512F-NEXT:    kshiftrw $5, %k2, %k2
4056; AVX512F-NEXT:    korw %k2, %k1, %k1
4057; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4058; AVX512F-NEXT:    kandw %k2, %k1, %k1
4059; AVX512F-NEXT:    movzbl 184(%rbp), %eax
4060; AVX512F-NEXT:    kmovw %eax, %k2
4061; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4062; AVX512F-NEXT:    kshiftrw $4, %k2, %k2
4063; AVX512F-NEXT:    korw %k2, %k1, %k1
4064; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4065; AVX512F-NEXT:    kandw %k2, %k1, %k1
4066; AVX512F-NEXT:    movzbl 192(%rbp), %eax
4067; AVX512F-NEXT:    kmovw %eax, %k2
4068; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4069; AVX512F-NEXT:    kshiftrw $3, %k2, %k2
4070; AVX512F-NEXT:    korw %k2, %k1, %k1
4071; AVX512F-NEXT:    kandw %k0, %k1, %k1
4072; AVX512F-NEXT:    movzbl 200(%rbp), %eax
4073; AVX512F-NEXT:    kmovw %eax, %k2
4074; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4075; AVX512F-NEXT:    kshiftrw $2, %k2, %k2
4076; AVX512F-NEXT:    korw %k2, %k1, %k1
4077; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4078; AVX512F-NEXT:    kandw %k2, %k1, %k1
4079; AVX512F-NEXT:    movzbl 208(%rbp), %eax
4080; AVX512F-NEXT:    kmovw %eax, %k2
4081; AVX512F-NEXT:    kshiftlw $14, %k2, %k2
4082; AVX512F-NEXT:    korw %k2, %k1, %k1
4083; AVX512F-NEXT:    kshiftlw $1, %k1, %k1
4084; AVX512F-NEXT:    kshiftrw $1, %k1, %k1
4085; AVX512F-NEXT:    movzbl 216(%rbp), %eax
4086; AVX512F-NEXT:    kmovw %eax, %k2
4087; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4088; AVX512F-NEXT:    korw %k2, %k1, %k1
4089; AVX512F-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4090; AVX512F-NEXT:    andl $1, %edi
4091; AVX512F-NEXT:    kmovw %esi, %k1
4092; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
4093; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
4094; AVX512F-NEXT:    kmovw %edi, %k2
4095; AVX512F-NEXT:    korw %k1, %k2, %k1
4096; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4097; AVX512F-NEXT:    kandw %k2, %k1, %k1
4098; AVX512F-NEXT:    kmovw %edx, %k2
4099; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4100; AVX512F-NEXT:    kshiftrw $13, %k2, %k2
4101; AVX512F-NEXT:    korw %k2, %k1, %k1
4102; AVX512F-NEXT:    kandw %k4, %k1, %k1
4103; AVX512F-NEXT:    kmovw %ecx, %k2
4104; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4105; AVX512F-NEXT:    kshiftrw $12, %k2, %k2
4106; AVX512F-NEXT:    korw %k2, %k1, %k1
4107; AVX512F-NEXT:    kandw %k5, %k1, %k1
4108; AVX512F-NEXT:    kmovw %r8d, %k2
4109; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4110; AVX512F-NEXT:    kshiftrw $11, %k2, %k2
4111; AVX512F-NEXT:    korw %k2, %k1, %k1
4112; AVX512F-NEXT:    kandw %k6, %k1, %k1
4113; AVX512F-NEXT:    kmovw %r9d, %k2
4114; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4115; AVX512F-NEXT:    kshiftrw $10, %k2, %k2
4116; AVX512F-NEXT:    korw %k2, %k1, %k1
4117; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4118; AVX512F-NEXT:    kandw %k2, %k1, %k1
4119; AVX512F-NEXT:    movzbl 16(%rbp), %eax
4120; AVX512F-NEXT:    kmovw %eax, %k2
4121; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4122; AVX512F-NEXT:    kshiftrw $9, %k2, %k2
4123; AVX512F-NEXT:    korw %k2, %k1, %k2
4124; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4125; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4126; AVX512F-NEXT:    kandw %k1, %k2, %k1
4127; AVX512F-NEXT:    movzbl 24(%rbp), %eax
4128; AVX512F-NEXT:    kmovw %eax, %k2
4129; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4130; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4131; AVX512F-NEXT:    kshiftrw $8, %k2, %k2
4132; AVX512F-NEXT:    korw %k2, %k1, %k1
4133; AVX512F-NEXT:    kandw %k3, %k1, %k1
4134; AVX512F-NEXT:    movzbl 32(%rbp), %eax
4135; AVX512F-NEXT:    kmovw %eax, %k2
4136; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4137; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4138; AVX512F-NEXT:    kshiftrw $7, %k2, %k2
4139; AVX512F-NEXT:    korw %k2, %k1, %k1
4140; AVX512F-NEXT:    kandw %k7, %k1, %k1
4141; AVX512F-NEXT:    movzbl 40(%rbp), %eax
4142; AVX512F-NEXT:    kmovw %eax, %k2
4143; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4144; AVX512F-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4145; AVX512F-NEXT:    kshiftrw $6, %k2, %k2
4146; AVX512F-NEXT:    korw %k2, %k1, %k1
4147; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4148; AVX512F-NEXT:    kandw %k2, %k1, %k1
4149; AVX512F-NEXT:    movzbl 48(%rbp), %eax
4150; AVX512F-NEXT:    kmovw %eax, %k2
4151; AVX512F-NEXT:    kshiftlw $15, %k2, %k5
4152; AVX512F-NEXT:    kshiftrw $5, %k5, %k2
4153; AVX512F-NEXT:    korw %k2, %k1, %k1
4154; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4155; AVX512F-NEXT:    kandw %k2, %k1, %k1
4156; AVX512F-NEXT:    movzbl 56(%rbp), %eax
4157; AVX512F-NEXT:    kmovw %eax, %k2
4158; AVX512F-NEXT:    kshiftlw $15, %k2, %k4
4159; AVX512F-NEXT:    kshiftrw $4, %k4, %k2
4160; AVX512F-NEXT:    korw %k2, %k1, %k1
4161; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4162; AVX512F-NEXT:    kandw %k2, %k1, %k1
4163; AVX512F-NEXT:    movzbl 64(%rbp), %eax
4164; AVX512F-NEXT:    kmovw %eax, %k2
4165; AVX512F-NEXT:    kshiftlw $15, %k2, %k3
4166; AVX512F-NEXT:    kshiftrw $3, %k3, %k2
4167; AVX512F-NEXT:    korw %k2, %k1, %k1
4168; AVX512F-NEXT:    kandw %k0, %k1, %k1
4169; AVX512F-NEXT:    movzbl 72(%rbp), %eax
4170; AVX512F-NEXT:    kmovw %eax, %k2
4171; AVX512F-NEXT:    kshiftlw $15, %k2, %k2
4172; AVX512F-NEXT:    kshiftrw $2, %k2, %k0
4173; AVX512F-NEXT:    korw %k0, %k1, %k0
4174; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4175; AVX512F-NEXT:    kandw %k1, %k0, %k0
4176; AVX512F-NEXT:    movzbl 80(%rbp), %eax
4177; AVX512F-NEXT:    kmovw %eax, %k1
4178; AVX512F-NEXT:    kshiftlw $14, %k1, %k7
4179; AVX512F-NEXT:    korw %k7, %k0, %k0
4180; AVX512F-NEXT:    kshiftlw $1, %k0, %k0
4181; AVX512F-NEXT:    kshiftrw $1, %k0, %k7
4182; AVX512F-NEXT:    movzbl 88(%rbp), %eax
4183; AVX512F-NEXT:    kmovw %eax, %k0
4184; AVX512F-NEXT:    kshiftlw $15, %k0, %k6
4185; AVX512F-NEXT:    korw %k6, %k7, %k6
4186; AVX512F-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4187; AVX512F-NEXT:    movw $-3, %ax
4188; AVX512F-NEXT:    kmovw %eax, %k6
4189; AVX512F-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4190; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4191; AVX512F-NEXT:    kandw %k6, %k7, %k6
4192; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4193; AVX512F-NEXT:    kshiftrw $14, %k7, %k7
4194; AVX512F-NEXT:    korw %k7, %k6, %k6
4195; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4196; AVX512F-NEXT:    kandw %k7, %k6, %k6
4197; AVX512F-NEXT:    kshiftrw $13, %k5, %k5
4198; AVX512F-NEXT:    korw %k5, %k6, %k5
4199; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4200; AVX512F-NEXT:    kandw %k6, %k5, %k5
4201; AVX512F-NEXT:    kshiftrw $12, %k4, %k4
4202; AVX512F-NEXT:    korw %k4, %k5, %k4
4203; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
4204; AVX512F-NEXT:    kandw %k5, %k4, %k4
4205; AVX512F-NEXT:    kshiftrw $11, %k3, %k3
4206; AVX512F-NEXT:    korw %k3, %k4, %k3
4207; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4208; AVX512F-NEXT:    kandw %k4, %k3, %k3
4209; AVX512F-NEXT:    kshiftrw $10, %k2, %k2
4210; AVX512F-NEXT:    korw %k2, %k3, %k2
4211; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4212; AVX512F-NEXT:    kandw %k3, %k2, %k2
4213; AVX512F-NEXT:    kshiftlw $6, %k1, %k1
4214; AVX512F-NEXT:    korw %k1, %k2, %k1
4215; AVX512F-NEXT:    kshiftlw $9, %k1, %k1
4216; AVX512F-NEXT:    kshiftrw $9, %k1, %k1
4217; AVX512F-NEXT:    kshiftlw $7, %k0, %k0
4218; AVX512F-NEXT:    korw %k0, %k1, %k0
4219; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4220; AVX512F-NEXT:    kshiftlw $9, %k1, %k1
4221; AVX512F-NEXT:    kshiftrw $9, %k1, %k1
4222; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4223; AVX512F-NEXT:    kshiftlw $7, %k2, %k2
4224; AVX512F-NEXT:    korw %k2, %k1, %k1
4225; AVX512F-NEXT:    kxorw %k0, %k1, %k0
4226; AVX512F-NEXT:    kshiftrw $4, %k0, %k1
4227; AVX512F-NEXT:    kxorw %k1, %k0, %k0
4228; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
4229; AVX512F-NEXT:    kxorw %k1, %k0, %k0
4230; AVX512F-NEXT:    kshiftrw $1, %k0, %k1
4231; AVX512F-NEXT:    kxorw %k1, %k0, %k0
4232; AVX512F-NEXT:    kmovw %k0, %eax
4233; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4234; AVX512F-NEXT:    vpcompressd %zmm2, %zmm2 {%k1} {z}
4235; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
4236; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4237; AVX512F-NEXT:    kandw %k1, %k0, %k0
4238; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4239; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
4240; AVX512F-NEXT:    korw %k1, %k0, %k0
4241; AVX512F-NEXT:    kandw %k7, %k0, %k0
4242; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4243; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
4244; AVX512F-NEXT:    korw %k1, %k0, %k0
4245; AVX512F-NEXT:    kandw %k6, %k0, %k0
4246; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4247; AVX512F-NEXT:    kshiftrw $12, %k1, %k1
4248; AVX512F-NEXT:    korw %k1, %k0, %k0
4249; AVX512F-NEXT:    kandw %k5, %k0, %k0
4250; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4251; AVX512F-NEXT:    kshiftrw $11, %k1, %k1
4252; AVX512F-NEXT:    korw %k1, %k0, %k0
4253; AVX512F-NEXT:    kandw %k4, %k0, %k0
4254; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4255; AVX512F-NEXT:    kshiftrw $10, %k1, %k1
4256; AVX512F-NEXT:    korw %k1, %k0, %k0
4257; AVX512F-NEXT:    kandw %k3, %k0, %k0
4258; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4259; AVX512F-NEXT:    kshiftlw $6, %k1, %k1
4260; AVX512F-NEXT:    korw %k1, %k0, %k0
4261; AVX512F-NEXT:    kshiftlw $9, %k0, %k0
4262; AVX512F-NEXT:    kshiftrw $9, %k0, %k0
4263; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4264; AVX512F-NEXT:    kshiftlw $7, %k1, %k1
4265; AVX512F-NEXT:    korw %k1, %k0, %k0
4266; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4267; AVX512F-NEXT:    kshiftlw $9, %k1, %k1
4268; AVX512F-NEXT:    kshiftrw $9, %k1, %k1
4269; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4270; AVX512F-NEXT:    kshiftlw $7, %k2, %k2
4271; AVX512F-NEXT:    korw %k2, %k1, %k1
4272; AVX512F-NEXT:    kxorw %k0, %k1, %k0
4273; AVX512F-NEXT:    kshiftrw $4, %k0, %k1
4274; AVX512F-NEXT:    kxorw %k1, %k0, %k0
4275; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
4276; AVX512F-NEXT:    kxorw %k1, %k0, %k0
4277; AVX512F-NEXT:    kshiftrw $1, %k0, %k1
4278; AVX512F-NEXT:    kxorw %k1, %k0, %k0
4279; AVX512F-NEXT:    kmovw %k0, %ecx
4280; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4281; AVX512F-NEXT:    vpcompressd %zmm3, %zmm3 {%k1} {z}
4282; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4283; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k2} {z}
4284; AVX512F-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4285; AVX512F-NEXT:    vpcompressd %zmm1, %zmm1 {%k1} {z}
4286; AVX512F-NEXT:    kxorw %k1, %k2, %k0
4287; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
4288; AVX512F-NEXT:    kxorw %k1, %k0, %k0
4289; AVX512F-NEXT:    kshiftrw $4, %k0, %k1
4290; AVX512F-NEXT:    kxorw %k1, %k0, %k0
4291; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
4292; AVX512F-NEXT:    kxorw %k1, %k0, %k0
4293; AVX512F-NEXT:    kshiftrw $1, %k0, %k1
4294; AVX512F-NEXT:    kxorw %k1, %k0, %k0
4295; AVX512F-NEXT:    kmovw %k0, %edx
4296; AVX512F-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
4297; AVX512F-NEXT:    andl $31, %eax
4298; AVX512F-NEXT:    vmovdqa64 %zmm1, 64(%rsp,%rax,4)
4299; AVX512F-NEXT:    vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
4300; AVX512F-NEXT:    andl $31, %ecx
4301; AVX512F-NEXT:    vmovdqa64 %zmm3, 192(%rsp,%rcx,4)
4302; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm0
4303; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm1
4304; AVX512F-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
4305; AVX512F-NEXT:    andl $63, %edx
4306; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm0
4307; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm2
4308; AVX512F-NEXT:    vmovaps %zmm0, 320(%rsp,%rdx,4)
4309; AVX512F-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
4310; AVX512F-NEXT:    vmovaps %zmm2, 384(%rsp,%rdx,4)
4311; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm0
4312; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm1
4313; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm2
4314; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm3
4315; AVX512F-NEXT:    movq %rbp, %rsp
4316; AVX512F-NEXT:    popq %rbp
4317; AVX512F-NEXT:    retq
4318;
4319; AVX512VL-LABEL: test_compress_large:
4320; AVX512VL:       # %bb.0:
4321; AVX512VL-NEXT:    pushq %rbp
4322; AVX512VL-NEXT:    movq %rsp, %rbp
4323; AVX512VL-NEXT:    andq $-64, %rsp
4324; AVX512VL-NEXT:    subq $576, %rsp # imm = 0x240
4325; AVX512VL-NEXT:    vpsllw $7, %zmm0, %zmm0
4326; AVX512VL-NEXT:    vpmovb2m %zmm0, %k1
4327; AVX512VL-NEXT:    kshiftrq $48, %k1, %k3
4328; AVX512VL-NEXT:    kshiftrq $32, %k1, %k4
4329; AVX512VL-NEXT:    kshiftrq $16, %k1, %k2
4330; AVX512VL-NEXT:    vpcompressd %zmm1, %zmm0 {%k1} {z}
4331; AVX512VL-NEXT:    vmovdqa64 %zmm0, (%rsp)
4332; AVX512VL-NEXT:    kshiftrq $8, %k1, %k0
4333; AVX512VL-NEXT:    kxorw %k0, %k1, %k0
4334; AVX512VL-NEXT:    kshiftrw $4, %k0, %k5
4335; AVX512VL-NEXT:    kxorw %k5, %k0, %k0
4336; AVX512VL-NEXT:    kshiftrw $2, %k0, %k5
4337; AVX512VL-NEXT:    kxorw %k5, %k0, %k0
4338; AVX512VL-NEXT:    kshiftrw $1, %k0, %k5
4339; AVX512VL-NEXT:    kxorw %k5, %k0, %k0
4340; AVX512VL-NEXT:    kmovd %k0, %eax
4341; AVX512VL-NEXT:    andl $31, %eax
4342; AVX512VL-NEXT:    vpcompressd %zmm2, %zmm0 {%k2} {z}
4343; AVX512VL-NEXT:    vmovdqa64 %zmm0, (%rsp,%rax,4)
4344; AVX512VL-NEXT:    vpcompressd %zmm3, %zmm0 {%k4} {z}
4345; AVX512VL-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
4346; AVX512VL-NEXT:    kshiftrq $40, %k1, %k0
4347; AVX512VL-NEXT:    kxorw %k0, %k4, %k0
4348; AVX512VL-NEXT:    kshiftrw $4, %k0, %k4
4349; AVX512VL-NEXT:    kxorw %k4, %k0, %k0
4350; AVX512VL-NEXT:    kshiftrw $2, %k0, %k4
4351; AVX512VL-NEXT:    kxorw %k4, %k0, %k0
4352; AVX512VL-NEXT:    kshiftrw $1, %k0, %k4
4353; AVX512VL-NEXT:    kxorw %k4, %k0, %k0
4354; AVX512VL-NEXT:    kmovd %k0, %eax
4355; AVX512VL-NEXT:    andl $31, %eax
4356; AVX512VL-NEXT:    vpcompressd %zmm4, %zmm0 {%k3} {z}
4357; AVX512VL-NEXT:    vmovdqa64 %zmm0, 128(%rsp,%rax,4)
4358; AVX512VL-NEXT:    vmovaps (%rsp), %zmm0
4359; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm1
4360; AVX512VL-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
4361; AVX512VL-NEXT:    kxorw %k2, %k1, %k0
4362; AVX512VL-NEXT:    kshiftrw $8, %k0, %k1
4363; AVX512VL-NEXT:    kxorw %k1, %k0, %k0
4364; AVX512VL-NEXT:    kshiftrw $4, %k0, %k1
4365; AVX512VL-NEXT:    kxorw %k1, %k0, %k0
4366; AVX512VL-NEXT:    kshiftrw $2, %k0, %k1
4367; AVX512VL-NEXT:    kxorw %k1, %k0, %k0
4368; AVX512VL-NEXT:    kshiftrw $1, %k0, %k1
4369; AVX512VL-NEXT:    kxorw %k1, %k0, %k0
4370; AVX512VL-NEXT:    kmovd %k0, %eax
4371; AVX512VL-NEXT:    andl $63, %eax
4372; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm0
4373; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm2
4374; AVX512VL-NEXT:    vmovaps %zmm0, 256(%rsp,%rax,4)
4375; AVX512VL-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
4376; AVX512VL-NEXT:    vmovaps %zmm2, 320(%rsp,%rax,4)
4377; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm0
4378; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm1
4379; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm2
4380; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm3
4381; AVX512VL-NEXT:    movq %rbp, %rsp
4382; AVX512VL-NEXT:    popq %rbp
4383; AVX512VL-NEXT:    retq
4384    %out = call <64 x i32> @llvm.experimental.vector.compress(<64 x i32> %vec, <64 x i1> %mask, <64 x i32> undef)
4385    ret <64 x i32> %out
4386}
4387
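; The next few tests use constant operands: with a constant vector, mask, and passthru the whole compress folds to a
; constant, and with just a constant mask it folds to a shuffle (plus inserts of any constant passthru elements).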
4388define <4 x i32> @test_compress_all_const() nounwind {
4389; AVX2-LABEL: test_compress_all_const:
4390; AVX2:       # %bb.0:
4391; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [5,9,0,0]
4392; AVX2-NEXT:    retq
4393;
4394; AVX512F-LABEL: test_compress_all_const:
4395; AVX512F:       # %bb.0:
4396; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [5,9,0,0]
4397; AVX512F-NEXT:    retq
4398;
4399; AVX512VL-LABEL: test_compress_all_const:
4400; AVX512VL:       # %bb.0:
4401; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [5,9,0,0]
4402; AVX512VL-NEXT:    retq
4403    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> <i32 3, i32 5, i32 7, i32 9>,
4404                                                <4 x i1>   <i1 0,  i1 1,  i1 0,  i1 1>,
4405                                                <4 x i32> undef)
4406    ret <4 x i32> %out
4407}
4408
4409define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) nounwind {
4410; CHECK-LABEL: test_compress_const_mask:
4411; CHECK:       # %bb.0:
4412; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
4413; CHECK-NEXT:    retq
4414    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 undef, i1 0, i1 1>, <4 x i32> undef)
4415    ret <4 x i32> %out
4416}
4417
4418define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) nounwind {
4419; CHECK-LABEL: test_compress_const_mask_passthrough:
4420; CHECK:       # %bb.0:
4421; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,3]
4422; CHECK-NEXT:    retq
4423    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 undef, i1 0, i1 1>, <4 x i32> %passthru)
4424    ret <4 x i32> %out
4425}
4426
4427define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) nounwind {
4428; CHECK-LABEL: test_compress_const_mask_const_passthrough:
4429; CHECK:       # %bb.0:
4430; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
4431; CHECK-NEXT:    movl $7, %eax
4432; CHECK-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
4433; CHECK-NEXT:    movl $8, %eax
4434; CHECK-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
4435; CHECK-NEXT:    retq
4436    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i32> <i32 5, i32 6, i32 7, i32 8>)
4437    ret <4 x i32> %out
4438}
4439
4440; We pass a placeholder first operand to the splat-/undef-mask tests below to check that they are converted into a
4441; no-op: either the second vector input register is simply copied to the return register, or nothing is emitted at all.
4442define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> %vec) nounwind {
4443; CHECK-LABEL: test_compress_const_splat1_mask:
4444; CHECK:       # %bb.0:
4445; CHECK-NEXT:    vmovaps %xmm1, %xmm0
4446; CHECK-NEXT:    retq
4447    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1), <4 x i32> undef)
4448    ret <4 x i32> %out
4449}
4450define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) nounwind {
4451; CHECK-LABEL: test_compress_const_splat0_mask:
4452; CHECK:       # %bb.0:
4453; CHECK-NEXT:    retq
4454    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef)
4455    ret <4 x i32> %out
4456}
4457define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) nounwind {
4458; CHECK-LABEL: test_compress_undef_mask:
4459; CHECK:       # %bb.0:
4460; CHECK-NEXT:    retq
4461    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> undef, <4 x i32> undef)
4462    ret <4 x i32> %out
4463}
4464define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignore, <4 x i32> %vec, <4 x i32> %passthru) nounwind {
4465; CHECK-LABEL: test_compress_const_splat0_mask_with_passthru:
4466; CHECK:       # %bb.0:
4467; CHECK-NEXT:    vmovaps %xmm2, %xmm0
4468; CHECK-NEXT:    retq
4469    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> %passthru)
4470    ret <4 x i32> %out
4471}
4472define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> %ignore, <4 x i32> %vec) nounwind {
4473; CHECK-LABEL: test_compress_const_splat0_mask_without_passthru:
4474; CHECK:       # %bb.0:
4475; CHECK-NEXT:    retq
4476    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef)
4477    ret <4 x i32> %out
4478}
4479
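; Compressing <4 x i8> is legalized on the widened <16 x i8> type: AVX2 scalarizes byte-by-byte through a stack slot,
; AVX512F zero-extends to <16 x i32>, compresses with vpcompressd, and truncates back, while AVX512VBMI2 can use a
; single vpcompressb.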
4480define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind {
4481; AVX2-LABEL: test_compress_small:
4482; AVX2:       # %bb.0:
4483; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4484; AVX2-NEXT:    vpsllw $7, %xmm1, %xmm1
4485; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
4486; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4487; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
4488; AVX2-NEXT:    vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
4489; AVX2-NEXT:    vmovd %xmm1, %eax
4490; AVX2-NEXT:    andl $1, %eax
4491; AVX2-NEXT:    vpextrb $1, %xmm0, -24(%rsp,%rax)
4492; AVX2-NEXT:    vpextrb $1, %xmm1, %ecx
4493; AVX2-NEXT:    andl $1, %ecx
4494; AVX2-NEXT:    addq %rax, %rcx
4495; AVX2-NEXT:    vpextrb $2, %xmm0, -24(%rsp,%rcx)
4496; AVX2-NEXT:    vpextrb $2, %xmm1, %eax
4497; AVX2-NEXT:    andl $1, %eax
4498; AVX2-NEXT:    addq %rcx, %rax
4499; AVX2-NEXT:    vpextrb $3, %xmm0, -24(%rsp,%rax)
4500; AVX2-NEXT:    vpextrb $3, %xmm1, %ecx
4501; AVX2-NEXT:    andl $1, %ecx
4502; AVX2-NEXT:    addq %rax, %rcx
4503; AVX2-NEXT:    vpextrb $4, %xmm0, -24(%rsp,%rcx)
4504; AVX2-NEXT:    vpextrb $4, %xmm1, %eax
4505; AVX2-NEXT:    andl $1, %eax
4506; AVX2-NEXT:    addq %rcx, %rax
4507; AVX2-NEXT:    vpextrb $5, %xmm1, %ecx
4508; AVX2-NEXT:    andl $1, %ecx
4509; AVX2-NEXT:    addq %rax, %rcx
4510; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
4511; AVX2-NEXT:    andl $15, %eax
4512; AVX2-NEXT:    vpextrb $5, %xmm0, -24(%rsp,%rax)
4513; AVX2-NEXT:    vpextrb $6, %xmm1, %eax
4514; AVX2-NEXT:    andl $1, %eax
4515; AVX2-NEXT:    addq %rcx, %rax
4516; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
4517; AVX2-NEXT:    andl $15, %ecx
4518; AVX2-NEXT:    vpextrb $6, %xmm0, -24(%rsp,%rcx)
4519; AVX2-NEXT:    vpextrb $7, %xmm1, %ecx
4520; AVX2-NEXT:    andl $1, %ecx
4521; AVX2-NEXT:    addq %rax, %rcx
4522; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
4523; AVX2-NEXT:    andl $15, %eax
4524; AVX2-NEXT:    vpextrb $7, %xmm0, -24(%rsp,%rax)
4525; AVX2-NEXT:    vpextrb $8, %xmm1, %eax
4526; AVX2-NEXT:    andl $1, %eax
4527; AVX2-NEXT:    addq %rcx, %rax
4528; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
4529; AVX2-NEXT:    andl $15, %ecx
4530; AVX2-NEXT:    vpextrb $8, %xmm0, -24(%rsp,%rcx)
4531; AVX2-NEXT:    vpextrb $9, %xmm1, %ecx
4532; AVX2-NEXT:    andl $1, %ecx
4533; AVX2-NEXT:    addq %rax, %rcx
4534; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
4535; AVX2-NEXT:    andl $15, %eax
4536; AVX2-NEXT:    vpextrb $9, %xmm0, -24(%rsp,%rax)
4537; AVX2-NEXT:    vpextrb $10, %xmm1, %eax
4538; AVX2-NEXT:    andl $1, %eax
4539; AVX2-NEXT:    addq %rcx, %rax
4540; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
4541; AVX2-NEXT:    andl $15, %ecx
4542; AVX2-NEXT:    vpextrb $10, %xmm0, -24(%rsp,%rcx)
4543; AVX2-NEXT:    vpextrb $11, %xmm1, %ecx
4544; AVX2-NEXT:    andl $1, %ecx
4545; AVX2-NEXT:    addq %rax, %rcx
4546; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
4547; AVX2-NEXT:    andl $15, %eax
4548; AVX2-NEXT:    vpextrb $11, %xmm0, -24(%rsp,%rax)
4549; AVX2-NEXT:    vpextrb $12, %xmm1, %eax
4550; AVX2-NEXT:    andl $1, %eax
4551; AVX2-NEXT:    addq %rcx, %rax
4552; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
4553; AVX2-NEXT:    andl $15, %ecx
4554; AVX2-NEXT:    vpextrb $12, %xmm0, -24(%rsp,%rcx)
4555; AVX2-NEXT:    vpextrb $13, %xmm1, %ecx
4556; AVX2-NEXT:    andl $1, %ecx
4557; AVX2-NEXT:    addq %rax, %rcx
4558; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
4559; AVX2-NEXT:    andl $15, %eax
4560; AVX2-NEXT:    vpextrb $13, %xmm0, -24(%rsp,%rax)
4561; AVX2-NEXT:    vpextrb $14, %xmm1, %eax
4562; AVX2-NEXT:    andl $1, %eax
4563; AVX2-NEXT:    addl %ecx, %eax
4564; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
4565; AVX2-NEXT:    andl $15, %ecx
4566; AVX2-NEXT:    vpextrb $14, %xmm0, -24(%rsp,%rcx)
4567; AVX2-NEXT:    andl $15, %eax
4568; AVX2-NEXT:    vpextrb $15, %xmm0, -24(%rsp,%rax)
4569; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
4570; AVX2-NEXT:    retq
4571;
4572; AVX512F-LABEL: test_compress_small:
4573; AVX512F:       # %bb.0:
4574; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
4575; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
4576; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
4577; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
4578; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
4579; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
4580; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4581; AVX512F-NEXT:    vzeroupper
4582; AVX512F-NEXT:    retq
4583;
4584; AVX512VL-LABEL: test_compress_small:
4585; AVX512VL:       # %bb.0:
4586; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
4587; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
4588; AVX512VL-NEXT:    vpcompressb %xmm0, %xmm0 {%k1} {z}
4589; AVX512VL-NEXT:    retq
4590    %out = call <4 x i8> @llvm.experimental.vector.compress(<4 x i8> %vec, <4 x i1> %mask, <4 x i8> undef)
4591    ret <4 x i8> %out
4592}
4593
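; The illegal i4 element type is promoted to i32, so this lowers exactly like the v4i32 case; the result is returned
; without masking the lanes back down to four bits.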
4594define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mask) nounwind {
4595; AVX2-LABEL: test_compress_illegal_element_type:
4596; AVX2:       # %bb.0:
4597; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
4598; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
4599; AVX2-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
4600; AVX2-NEXT:    vmovd %xmm1, %eax
4601; AVX2-NEXT:    andl $1, %eax
4602; AVX2-NEXT:    vextractps $1, %xmm0, -24(%rsp,%rax,4)
4603; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
4604; AVX2-NEXT:    subl %ecx, %eax
4605; AVX2-NEXT:    leal (,%rax,4), %ecx
4606; AVX2-NEXT:    vextractps $2, %xmm0, -24(%rsp,%rcx)
4607; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
4608; AVX2-NEXT:    subl %ecx, %eax
4609; AVX2-NEXT:    andl $3, %eax
4610; AVX2-NEXT:    vextractps $3, %xmm0, -24(%rsp,%rax,4)
4611; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
4612; AVX2-NEXT:    retq
4613;
4614; AVX512F-LABEL: test_compress_illegal_element_type:
4615; AVX512F:       # %bb.0:
4616; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
4617; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
4618; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
4619; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
4620; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
4621; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
4622; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
4623; AVX512F-NEXT:    vzeroupper
4624; AVX512F-NEXT:    retq
4625;
4626; AVX512VL-LABEL: test_compress_illegal_element_type:
4627; AVX512VL:       # %bb.0:
4628; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
4629; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
4630; AVX512VL-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z}
4631; AVX512VL-NEXT:    retq
4632    %out = call <4 x i4> @llvm.experimental.vector.compress(<4 x i4> %vec, <4 x i1> %mask, <4 x i4> undef)
4633    ret <4 x i4> %out
4634}
4635
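; A <3 x i1> mask is passed in GPRs (edi/esi/edx). AVX2 rebuilds a vector mask from them, while the AVX512 lowerings
; assemble a k-register bit by bit and limit it to the low three bits before compressing.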
4636define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) nounwind {
4637; AVX2-LABEL: test_compress_narrow:
4638; AVX2:       # %bb.0:
4639; AVX2-NEXT:    vmovd %edi, %xmm1
4640; AVX2-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
4641; AVX2-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
4642; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
4643; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
4644; AVX2-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
4645; AVX2-NEXT:    vmovd %xmm1, %eax
4646; AVX2-NEXT:    andl $1, %eax
4647; AVX2-NEXT:    vextractps $1, %xmm0, -24(%rsp,%rax,4)
4648; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
4649; AVX2-NEXT:    subl %ecx, %eax
4650; AVX2-NEXT:    leal (,%rax,4), %ecx
4651; AVX2-NEXT:    vextractps $2, %xmm0, -24(%rsp,%rcx)
4652; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
4653; AVX2-NEXT:    subl %ecx, %eax
4654; AVX2-NEXT:    andl $3, %eax
4655; AVX2-NEXT:    vextractps $3, %xmm0, -24(%rsp,%rax,4)
4656; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
4657; AVX2-NEXT:    retq
4658;
4659; AVX512F-LABEL: test_compress_narrow:
4660; AVX512F:       # %bb.0:
4661; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
4662; AVX512F-NEXT:    andl $1, %edi
4663; AVX512F-NEXT:    kmovw %edi, %k0
4664; AVX512F-NEXT:    kmovw %esi, %k1
4665; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
4666; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
4667; AVX512F-NEXT:    korw %k1, %k0, %k0
4668; AVX512F-NEXT:    movw $-5, %ax
4669; AVX512F-NEXT:    kmovw %eax, %k1
4670; AVX512F-NEXT:    kandw %k1, %k0, %k0
4671; AVX512F-NEXT:    kmovw %edx, %k1
4672; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
4673; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
4674; AVX512F-NEXT:    korw %k1, %k0, %k0
4675; AVX512F-NEXT:    movb $7, %al
4676; AVX512F-NEXT:    kmovw %eax, %k1
4677; AVX512F-NEXT:    kandw %k1, %k0, %k0
4678; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
4679; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
4680; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
4681; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
4682; AVX512F-NEXT:    vzeroupper
4683; AVX512F-NEXT:    retq
4684;
4685; AVX512VL-LABEL: test_compress_narrow:
4686; AVX512VL:       # %bb.0:
4687; AVX512VL-NEXT:    andl $1, %edi
4688; AVX512VL-NEXT:    kmovw %edi, %k0
4689; AVX512VL-NEXT:    kmovd %esi, %k1
4690; AVX512VL-NEXT:    kshiftlw $15, %k1, %k1
4691; AVX512VL-NEXT:    kshiftrw $14, %k1, %k1
4692; AVX512VL-NEXT:    korw %k1, %k0, %k0
4693; AVX512VL-NEXT:    movw $-5, %ax
4694; AVX512VL-NEXT:    kmovd %eax, %k1
4695; AVX512VL-NEXT:    kandw %k1, %k0, %k0
4696; AVX512VL-NEXT:    kmovd %edx, %k1
4697; AVX512VL-NEXT:    kshiftlw $15, %k1, %k1
4698; AVX512VL-NEXT:    kshiftrw $13, %k1, %k1
4699; AVX512VL-NEXT:    korw %k1, %k0, %k0
4700; AVX512VL-NEXT:    movb $7, %al
4701; AVX512VL-NEXT:    kmovd %eax, %k1
4702; AVX512VL-NEXT:    kandw %k1, %k0, %k1
4703; AVX512VL-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z}
4704; AVX512VL-NEXT:    retq
4705    %out = call <3 x i32> @llvm.experimental.vector.compress(<3 x i32> %vec, <3 x i1> %mask, <3 x i32> undef)
4706    ret <3 x i32> %out
4707}
4708
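; <3 x i3> combines both of the above: the i3 elements (passed in GPRs) are promoted to i32, compressed as dwords,
; and the three result elements are returned in al, dl, and cl.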
4709define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i1> %mask) nounwind {
4710; AVX2-LABEL: test_compress_narrow_illegal_element_type:
4711; AVX2:       # %bb.0:
4712; AVX2-NEXT:    vmovd %ecx, %xmm0
4713; AVX2-NEXT:    vpinsrd $1, %r8d, %xmm0, %xmm0
4714; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
4715; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
4716; AVX2-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
4717; AVX2-NEXT:    vmovd %xmm0, %eax
4718; AVX2-NEXT:    andl $1, %eax
4719; AVX2-NEXT:    movl %esi, -24(%rsp,%rax,4)
4720; AVX2-NEXT:    vpextrd $1, %xmm0, %ecx
4721; AVX2-NEXT:    subl %ecx, %eax
4722; AVX2-NEXT:    shll $2, %eax
4723; AVX2-NEXT:    movl %edx, -24(%rsp,%rax)
4724; AVX2-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0
4725; AVX2-NEXT:    vmovd %xmm0, %eax
4726; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
4727; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
4728; AVX2-NEXT:    # kill: def $al killed $al killed $eax
4729; AVX2-NEXT:    # kill: def $dl killed $dl killed $edx
4730; AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
4731; AVX2-NEXT:    retq
4732;
4733; AVX512F-LABEL: test_compress_narrow_illegal_element_type:
4734; AVX512F:       # %bb.0:
4735; AVX512F-NEXT:    andl $1, %ecx
4736; AVX512F-NEXT:    kmovw %ecx, %k0
4737; AVX512F-NEXT:    kmovw %r8d, %k1
4738; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
4739; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
4740; AVX512F-NEXT:    korw %k1, %k0, %k0
4741; AVX512F-NEXT:    movw $-5, %ax
4742; AVX512F-NEXT:    kmovw %eax, %k1
4743; AVX512F-NEXT:    kandw %k1, %k0, %k0
4744; AVX512F-NEXT:    kmovw %r9d, %k1
4745; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
4746; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
4747; AVX512F-NEXT:    korw %k1, %k0, %k0
4748; AVX512F-NEXT:    movb $7, %al
4749; AVX512F-NEXT:    kmovw %eax, %k1
4750; AVX512F-NEXT:    kandw %k1, %k0, %k0
4751; AVX512F-NEXT:    vmovd %edi, %xmm0
4752; AVX512F-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
4753; AVX512F-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
4754; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
4755; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
4756; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
4757; AVX512F-NEXT:    vmovd %xmm0, %eax
4758; AVX512F-NEXT:    vpextrb $4, %xmm0, %edx
4759; AVX512F-NEXT:    vpextrb $8, %xmm0, %ecx
4760; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
4761; AVX512F-NEXT:    # kill: def $dl killed $dl killed $edx
4762; AVX512F-NEXT:    # kill: def $cl killed $cl killed $ecx
4763; AVX512F-NEXT:    vzeroupper
4764; AVX512F-NEXT:    retq
4765;
4766; AVX512VL-LABEL: test_compress_narrow_illegal_element_type:
4767; AVX512VL:       # %bb.0:
4768; AVX512VL-NEXT:    andl $1, %ecx
4769; AVX512VL-NEXT:    kmovw %ecx, %k0
4770; AVX512VL-NEXT:    kmovd %r8d, %k1
4771; AVX512VL-NEXT:    kshiftlw $15, %k1, %k1
4772; AVX512VL-NEXT:    kshiftrw $14, %k1, %k1
4773; AVX512VL-NEXT:    korw %k1, %k0, %k0
4774; AVX512VL-NEXT:    movw $-5, %ax
4775; AVX512VL-NEXT:    kmovd %eax, %k1
4776; AVX512VL-NEXT:    kandw %k1, %k0, %k0
4777; AVX512VL-NEXT:    kmovd %r9d, %k1
4778; AVX512VL-NEXT:    kshiftlw $15, %k1, %k1
4779; AVX512VL-NEXT:    kshiftrw $13, %k1, %k1
4780; AVX512VL-NEXT:    korw %k1, %k0, %k0
4781; AVX512VL-NEXT:    movb $7, %al
4782; AVX512VL-NEXT:    kmovd %eax, %k1
4783; AVX512VL-NEXT:    kandw %k1, %k0, %k1
4784; AVX512VL-NEXT:    vmovd %edi, %xmm0
4785; AVX512VL-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
4786; AVX512VL-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
4787; AVX512VL-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z}
4788; AVX512VL-NEXT:    vmovd %xmm0, %eax
4789; AVX512VL-NEXT:    vpextrb $4, %xmm0, %edx
4790; AVX512VL-NEXT:    vpextrb $8, %xmm0, %ecx
4791; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
4792; AVX512VL-NEXT:    # kill: def $dl killed $dl killed $edx
4793; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $ecx
4794; AVX512VL-NEXT:    retq
4795    %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
4796    ret <3 x i3> %out
4797}
4798
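; A zeroinitializer passthru maps to the zeroing ({z}) form of vpcompressd on AVX512. The AVX2 lowering pre-stores a
; zero vector to the stack slot and conditionally stores zero instead of the last vector element when fewer than four
; mask bits are set.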
4799define <4 x i32> @test_compress_v4i32_zero_passthru(<4 x i32> %vec, <4 x i1> %mask) nounwind {
4800; AVX2-LABEL: test_compress_v4i32_zero_passthru:
4801; AVX2:       # %bb.0:
4802; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
4803; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
4804; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
4805; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
4806; AVX2-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
4807; AVX2-NEXT:    vmovd %xmm1, %eax
4808; AVX2-NEXT:    andl $1, %eax
4809; AVX2-NEXT:    vextractps $1, %xmm0, -24(%rsp,%rax,4)
4810; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
4811; AVX2-NEXT:    andl $1, %ecx
4812; AVX2-NEXT:    addq %rax, %rcx
4813; AVX2-NEXT:    vextractps $2, %xmm0, -24(%rsp,%rcx,4)
4814; AVX2-NEXT:    vpextrd $2, %xmm1, %eax
4815; AVX2-NEXT:    andl $1, %eax
4816; AVX2-NEXT:    addq %rcx, %rax
4817; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
4818; AVX2-NEXT:    andl $1, %ecx
4819; AVX2-NEXT:    addq %rax, %rcx
4820; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
4821; AVX2-NEXT:    andl $3, %eax
4822; AVX2-NEXT:    vextractps $3, %xmm0, -24(%rsp,%rax,4)
4823; AVX2-NEXT:    xorl %eax, %eax
4824; AVX2-NEXT:    cmpq $3, %rcx
4825; AVX2-NEXT:    movl $3, %edx
4826; AVX2-NEXT:    cmovbq %rcx, %rdx
4827; AVX2-NEXT:    vextractps $3, %xmm0, %ecx
4828; AVX2-NEXT:    cmovbel %eax, %ecx
4829; AVX2-NEXT:    movl %ecx, -24(%rsp,%rdx,4)
4830; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
4831; AVX2-NEXT:    retq
4832;
4833; AVX512F-LABEL: test_compress_v4i32_zero_passthru:
4834; AVX512F:       # %bb.0:
4835; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
4836; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
4837; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
4838; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
4839; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
4840; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
4841; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
4842; AVX512F-NEXT:    vzeroupper
4843; AVX512F-NEXT:    retq
4844;
4845; AVX512VL-LABEL: test_compress_v4i32_zero_passthru:
4846; AVX512VL:       # %bb.0:
4847; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
4848; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
4849; AVX512VL-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z}
4850; AVX512VL-NEXT:    retq
4851    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> zeroinitializer)
4852    ret <4 x i32> %out
4853}
4854