; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=x86_64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN: llc -mtriple=x86_64 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=CHECK,AVX512F
; RUN: llc -mtriple=x86_64 -mattr=+avx512f,+avx512vl,+avx512vbmi2 < %s | FileCheck %s --check-prefixes=CHECK,AVX512VL

; Code generation tests for the llvm.experimental.vector.compress intrinsic on
; x86-64. Every function below forwards its (%vec, %mask, %passthru) arguments
; to the intrinsic and returns the result - only the vector type varies.
;
; The three RUN lines give three sets of expectations
;   AVX2      the result is built in a stack slot. %passthru is stored first,
;             the elements of %vec are then stored one at a time at a running
;             index accumulated from the mask bits, the index of the final
;             store is clamped to the last lane with cmp/cmov, and the slot is
;             reloaded as the return value
;   AVX512F   a zmm-wide vpcompress/vcompress under a %k mask, with narrower
;             operands treated as zmm
;   AVX512VL  vpcompress/vcompress at the native xmm/ymm/zmm width
;
; The assertion lines are generated. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; <4 x i32> elements. The AVX512F sequence keeps only the low 4 mask bits
; (kshiftlw/kshiftrw by 12) before the zmm compress.
define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind {
; AVX2-LABEL: test_compress_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vpextrd $1, %xmm1, %eax
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    andl $1, %esi
; AVX2-NEXT:    movl %esi, %edi
; AVX2-NEXT:    subl %eax, %edi
; AVX2-NEXT:    vpextrd $2, %xmm1, %edx
; AVX2-NEXT:    subl %edx, %edi
; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX2-NEXT:    subl %ecx, %edi
; AVX2-NEXT:    andl $3, %edi
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rsi, %rax
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    vextractps $3, %xmm0, %r8d
; AVX2-NEXT:    cmpq $4, %rcx
; AVX2-NEXT:    cmovbl -24(%rsp,%rdi,4), %r8d
; AVX2-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vextractps $1, %xmm0, -24(%rsp,%rsi,4)
; AVX2-NEXT:    vextractps $2, %xmm0, -24(%rsp,%rax,4)
; AVX2-NEXT:    andl $3, %edx
; AVX2-NEXT:    vextractps $3, %xmm0, -24(%rsp,%rdx,4)
; AVX2-NEXT:    cmpq $3, %rcx
; AVX2-NEXT:    movl $3, %eax
; AVX2-NEXT:    cmovbq %rcx, %rax
; AVX2-NEXT:    movl %r8d, -24(%rsp,%rax,4)
; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vpcompressd %zmm0, %zmm2 {%k1}
; AVX512F-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vpcompressd %xmm0, %xmm2 {%k1}
; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru)
  ret <4 x i32> %out
}

; <4 x float> elements. Same shape as the <4 x i32> case, with vcompressps in
; the AVX512 sequences and a branch selecting the last stored value under AVX2.
define <4 x float> @test_compress_v4f32(<4 x float> %vec, <4 x i1> %mask, <4 x float> %passthru) nounwind {
; AVX2-LABEL: test_compress_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    andl $1, %esi
; AVX2-NEXT:    movl %esi, %edi
; AVX2-NEXT:    subl %edx, %edi
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    subl %ecx, %edi
; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
; AVX2-NEXT:    subl %eax, %edi
; AVX2-NEXT:    andl $3, %edi
; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vextractps $1, %xmm0, -24(%rsp,%rsi,4)
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    addq %rsi, %rdx
; AVX2-NEXT:    vextractps $2, %xmm0, -24(%rsp,%rdx,4)
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT:    andl $3, %ecx
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vmovss %xmm0, -24(%rsp,%rcx,4)
; AVX2-NEXT:    cmpq $3, %rax
; AVX2-NEXT:    movl $3, %ecx
; AVX2-NEXT:    cmovbq %rax, %rcx
; AVX2-NEXT:    ja .LBB1_2
; AVX2-NEXT:  # %bb.1:
; AVX2-NEXT:    vmovaps %xmm1, %xmm0
; AVX2-NEXT:  .LBB1_2:
; AVX2-NEXT:    vmovss %xmm0, -24(%rsp,%rcx,4)
; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v4f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vcompressps %zmm0, %zmm2 {%k1}
; AVX512F-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v4f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vcompressps %xmm0, %xmm2 {%k1}
; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %out = call <4 x float> @llvm.experimental.vector.compress(<4 x float> %vec, <4 x i1> %mask, <4 x float> %passthru)
  ret <4 x float> %out
}

; <2 x i64> elements. The AVX512F sequence keeps only the low 2 mask bits
; (kshiftlw/kshiftrw by 14).
define <2 x i64> @test_compress_v2i64(<2 x i64> %vec, <2 x i1> %mask, <2 x i64> %passthru) nounwind {
; AVX2-LABEL: test_compress_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movl %ecx, %edx
; AVX2-NEXT:    subl %eax, %edx
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
; AVX2-NEXT:    cmpq $2, %rax
; AVX2-NEXT:    cmovbq -24(%rsp,%rdx,8), %rsi
; AVX2-NEXT:    vmovq %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movl %ecx, %ecx
; AVX2-NEXT:    vpextrq $1, %xmm0, -24(%rsp,%rcx,8)
; AVX2-NEXT:    cmpq $1, %rax
; AVX2-NEXT:    movl $1, %ecx
; AVX2-NEXT:    cmovbq %rax, %rcx
; AVX2-NEXT:    movq %rsi, -24(%rsp,%rcx,8)
; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
; AVX512F-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
; AVX512F-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vpcompressq %xmm0, %xmm2 {%k1}
; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %out = call <2 x i64> @llvm.experimental.vector.compress(<2 x i64> %vec, <2 x i1> %mask, <2 x i64> %passthru)
  ret <2 x i64> %out
}

; <2 x double> elements. Same shape as the <2 x i64> case, with vcompresspd in
; the AVX512 sequences.
define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> %passthru) nounwind {
; AVX2-LABEL: test_compress_v2f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movl %ecx, %edx
; AVX2-NEXT:    subl %eax, %edx
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovlpd %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    movl %ecx, %edx
; AVX2-NEXT:    vmovhpd %xmm0, -24(%rsp,%rdx,8)
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    cmpq $2, %rax
; AVX2-NEXT:    jb .LBB3_2
; AVX2-NEXT:  # %bb.1:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:  .LBB3_2:
; AVX2-NEXT:    cmpq $1, %rax
; AVX2-NEXT:    movl $1, %ecx
; AVX2-NEXT:    cmovbq %rax, %rcx
; AVX2-NEXT:    vmovsd %xmm1, -24(%rsp,%rcx,8)
; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v2f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
; AVX512F-NEXT:    vcompresspd %zmm0, %zmm2 {%k1}
; AVX512F-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v2f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vcompresspd %xmm0, %xmm2 {%k1}
; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %out = call <2 x double> @llvm.experimental.vector.compress(<2 x double> %vec, <2 x i1> %mask, <2 x double> %passthru)
  ret <2 x double> %out
}

; <8 x i32> elements. The AVX2 sequence uses a 32-byte aligned frame and masks
; the running indices with 7. No kshift pair appears in the AVX512F sequence.
define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> %passthru) nounwind {
; AVX2-LABEL: test_compress_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm3
; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm3, %xmm2
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT:    vpslld $31, %ymm2, %ymm2
; AVX2-NEXT:    vpsrld $31, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpextrd $1, %xmm2, %eax
; AVX2-NEXT:    vmovd %xmm2, %ecx
; AVX2-NEXT:    addl %eax, %ecx
; AVX2-NEXT:    vpextrd $2, %xmm2, %edx
; AVX2-NEXT:    vpextrd $3, %xmm2, %eax
; AVX2-NEXT:    addl %edx, %eax
; AVX2-NEXT:    addl %ecx, %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    vmovd %xmm3, %edx
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    vpextrd $2, %xmm3, %esi
; AVX2-NEXT:    andl $1, %esi
; AVX2-NEXT:    addq %rcx, %rsi
; AVX2-NEXT:    vpextrd $3, %xmm3, %edi
; AVX2-NEXT:    andl $1, %edi
; AVX2-NEXT:    addq %rsi, %rdi
; AVX2-NEXT:    vmovd %xmm1, %r8d
; AVX2-NEXT:    andl $1, %r8d
; AVX2-NEXT:    addq %rdi, %r8
; AVX2-NEXT:    vpextrd $1, %xmm1, %r9d
; AVX2-NEXT:    andl $1, %r9d
; AVX2-NEXT:    addq %r8, %r9
; AVX2-NEXT:    vpextrd $2, %xmm1, %r10d
; AVX2-NEXT:    andl $1, %r10d
; AVX2-NEXT:    addq %r9, %r10
; AVX2-NEXT:    vpextrd $3, %xmm1, %r11d
; AVX2-NEXT:    andl $1, %r11d
; AVX2-NEXT:    addq %r10, %r11
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vextractps $3, %xmm1, %ebx
; AVX2-NEXT:    cmpq $8, %r11
; AVX2-NEXT:    cmovbl (%rsp,%rax,4), %ebx
; AVX2-NEXT:    vmovss %xmm0, (%rsp)
; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rsi,4)
; AVX2-NEXT:    andl $7, %edi
; AVX2-NEXT:    vmovss %xmm1, (%rsp,%rdi,4)
; AVX2-NEXT:    andl $7, %r8d
; AVX2-NEXT:    vextractps $1, %xmm1, (%rsp,%r8,4)
; AVX2-NEXT:    andl $7, %r9d
; AVX2-NEXT:    vextractps $2, %xmm1, (%rsp,%r9,4)
; AVX2-NEXT:    andl $7, %r10d
; AVX2-NEXT:    vextractps $3, %xmm1, (%rsp,%r10,4)
; AVX2-NEXT:    cmpq $7, %r11
; AVX2-NEXT:    movl $7, %eax
; AVX2-NEXT:    cmovbq %r11, %rax
; AVX2-NEXT:    movl %eax, %eax
; AVX2-NEXT:    movl %ebx, (%rsp,%rax,4)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    leaq -8(%rbp), %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v8i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vpcompressd %zmm0, %zmm2 {%k1}
; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
; AVX512VL-NEXT:    vpcompressd %ymm0, %ymm2 {%k1}
; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %out = call <8 x i32> @llvm.experimental.vector.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> %passthru)
  ret <8 x i32> %out
}

; <8 x float> elements. Same shape as the <8 x i32> case, with vcompressps in
; the AVX512 sequences and a branch selecting the last stored value under AVX2.
define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x float> %passthru) nounwind {
; AVX2-LABEL: test_compress_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm3
; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm3, %xmm2
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT:    vpslld $31, %ymm2, %ymm2
; AVX2-NEXT:    vpsrld $31, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpextrd $1, %xmm2, %eax
; AVX2-NEXT:    vmovd %xmm2, %ecx
; AVX2-NEXT:    addl %eax, %ecx
; AVX2-NEXT:    vpextrd $2, %xmm2, %eax
; AVX2-NEXT:    vpextrd $3, %xmm2, %edx
; AVX2-NEXT:    addl %eax, %edx
; AVX2-NEXT:    addl %ecx, %edx
; AVX2-NEXT:    andl $7, %edx
; AVX2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovss %xmm0, (%rsp)
; AVX2-NEXT:    vmovd %xmm3, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
; AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT:    vpextrd $2, %xmm3, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rax,4)
; AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    vmovd %xmm1, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT:    andl $7, %ecx
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
; AVX2-NEXT:    vpextrd $2, %xmm1, %edx
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    addq %rcx, %rdx
; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT:    andl $7, %ecx
; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT:    andl $7, %edx
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rdx,4)
; AVX2-NEXT:    cmpq $8, %rax
; AVX2-NEXT:    jae .LBB5_2
; AVX2-NEXT:  # %bb.1:
; AVX2-NEXT:    vmovaps %xmm2, %xmm0
; AVX2-NEXT:  .LBB5_2:
; AVX2-NEXT:    cmpq $7, %rax
; AVX2-NEXT:    movl $7, %ecx
; AVX2-NEXT:    cmovbq %rax, %rcx
; AVX2-NEXT:    movl %ecx, %eax
; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rax,4)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v8f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vcompressps %zmm0, %zmm2 {%k1}
; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v8f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $15, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmovw2m %xmm1, %k1
; AVX512VL-NEXT:    vcompressps %ymm0, %ymm2 {%k1}
; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %out = call <8 x float> @llvm.experimental.vector.compress(<8 x float> %vec, <8 x i1> %mask, <8 x float> %passthru)
  ret <8 x float> %out
}

; <4 x i64> elements. The AVX512F sequence keeps only the low 4 mask bits
; (kshiftlw/kshiftrw by 12) before the zmm compress.
define <4 x i64> @test_compress_v4i64(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> %passthru) nounwind {
; AVX2-LABEL: test_compress_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
; AVX2-NEXT:    vpsrlq $63, %ymm1, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
; AVX2-NEXT:    vmovq %xmm2, %rax
; AVX2-NEXT:    addl %ecx, %eax
; AVX2-NEXT:    andl $3, %eax
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    vmovq %xmm1, %rdx
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    movl %edx, %esi
; AVX2-NEXT:    subq %rcx, %rdx
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movl %edx, %edi
; AVX2-NEXT:    subq %rcx, %rdx
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    movq %rdx, %r8
; AVX2-NEXT:    subq %rcx, %r8
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    cmpq $4, %r8
; AVX2-NEXT:    cmovbq (%rsp,%rax,8), %rcx
; AVX2-NEXT:    vmovq %xmm0, (%rsp)
; AVX2-NEXT:    vpextrq $1, %xmm0, (%rsp,%rsi,8)
; AVX2-NEXT:    vmovq %xmm1, (%rsp,%rdi,8)
; AVX2-NEXT:    andl $3, %edx
; AVX2-NEXT:    vpextrq $1, %xmm1, (%rsp,%rdx,8)
; AVX2-NEXT:    cmpq $3, %r8
; AVX2-NEXT:    movl $3, %eax
; AVX2-NEXT:    cmovbq %r8, %rax
; AVX2-NEXT:    movl %eax, %eax
; AVX2-NEXT:    movq %rcx, (%rsp,%rax,8)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v4i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vpcompressq %ymm0, %ymm2 {%k1}
; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %out = call <4 x i64> @llvm.experimental.vector.compress(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> %passthru)
  ret <4 x i64> %out
}

; <4 x double> elements. Same shape as the <4 x i64> case, with vcompresspd in
; the AVX512 sequences and a branch selecting the last stored value under AVX2.
define <4 x double> @test_compress_v4f64(<4 x double> %vec, <4 x i1> %mask, <4 x double> %passthru) nounwind {
; AVX2-LABEL: test_compress_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm3
; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
; AVX2-NEXT:    vpsrlq $63, %ymm3, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    addl %eax, %ecx
; AVX2-NEXT:    andl $3, %ecx
; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovlpd %xmm0, (%rsp)
; AVX2-NEXT:    vmovq %xmm3, %rax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    vmovhpd %xmm0, (%rsp,%rcx,8)
; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
; AVX2-NEXT:    subq %rcx, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovlpd %xmm0, (%rsp,%rcx,8)
; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
; AVX2-NEXT:    vmovq %xmm2, %rcx
; AVX2-NEXT:    subq %rcx, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $3, %ecx
; AVX2-NEXT:    vmovhpd %xmm0, (%rsp,%rcx,8)
; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
; AVX2-NEXT:    subq %rcx, %rax
; AVX2-NEXT:    cmpq $4, %rax
; AVX2-NEXT:    jb .LBB7_2
; AVX2-NEXT:  # %bb.1:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:  .LBB7_2:
; AVX2-NEXT:    cmpq $3, %rax
; AVX2-NEXT:    movl $3, %ecx
; AVX2-NEXT:    cmovbq %rax, %rcx
; AVX2-NEXT:    movl %ecx, %eax
; AVX2-NEXT:    vmovsd %xmm1, (%rsp,%rax,8)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v4f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vcompresspd %zmm0, %zmm2 {%k1}
; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v4f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vcompresspd %ymm0, %ymm2 {%k1}
; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %out = call <4 x double> @llvm.experimental.vector.compress(<4 x double> %vec, <4 x i1> %mask, <4 x double> %passthru)
  ret <4 x double> %out
}

; <16 x i32> elements. Under AVX2 the value occupies two ymm registers, the
; running indices are masked with 15, and several of them are spilled and
; reloaded. Both AVX512 sequences compress a single zmm register.
define <16 x i32> @test_compress_v16i32(<16 x i32> %vec, <16 x i1> %mask, <16 x i32> %passthru) nounwind {
; AVX2-LABEL: test_compress_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    pushq %r15
; AVX2-NEXT:    pushq %r14
; AVX2-NEXT:    pushq %r13
; AVX2-NEXT:    pushq %r12
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $128, %rsp
; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps %ymm3, (%rsp)
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT:    vpaddd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpextrd $1, %xmm3, %eax
; AVX2-NEXT:    vmovd %xmm3, %ecx
; AVX2-NEXT:    addl %eax, %ecx
; AVX2-NEXT:    vpextrd $2, %xmm3, %eax
; AVX2-NEXT:    vpextrd $3, %xmm3, %edx
; AVX2-NEXT:    addl %eax, %edx
; AVX2-NEXT:    addl %ecx, %edx
; AVX2-NEXT:    andl $15, %edx
; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT:    vpextrb $1, %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    vmovd %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT:    vpextrb $2, %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT:    vpextrb $3, %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT:    vpextrb $4, %xmm2, %r8d
; AVX2-NEXT:    andl $1, %r8d
; AVX2-NEXT:    addq %rax, %r8
; AVX2-NEXT:    vpextrb $5, %xmm2, %r9d
; AVX2-NEXT:    andl $1, %r9d
; AVX2-NEXT:    addq %r8, %r9
; AVX2-NEXT:    vpextrb $6, %xmm2, %r10d
; AVX2-NEXT:    andl $1, %r10d
; AVX2-NEXT:    addq %r9, %r10
; AVX2-NEXT:    vpextrb $7, %xmm2, %r11d
; AVX2-NEXT:    andl $1, %r11d
; AVX2-NEXT:    addq %r10, %r11
; AVX2-NEXT:    vpextrb $8, %xmm2, %ebx
; AVX2-NEXT:    andl $1, %ebx
; AVX2-NEXT:    addq %r11, %rbx
; AVX2-NEXT:    vpextrb $9, %xmm2, %r14d
; AVX2-NEXT:    andl $1, %r14d
; AVX2-NEXT:    addq %rbx, %r14
; AVX2-NEXT:    vpextrb $10, %xmm2, %r15d
; AVX2-NEXT:    andl $1, %r15d
; AVX2-NEXT:    addq %r14, %r15
; AVX2-NEXT:    vpextrb $11, %xmm2, %r12d
; AVX2-NEXT:    andl $1, %r12d
; AVX2-NEXT:    addq %r15, %r12
; AVX2-NEXT:    vpextrb $12, %xmm2, %r13d
; AVX2-NEXT:    andl $1, %r13d
; AVX2-NEXT:    addq %r12, %r13
; AVX2-NEXT:    vpextrb $13, %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %r13, %rcx
; AVX2-NEXT:    vpextrb $14, %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vpextrb $15, %xmm2, %edx
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    addq %rax, %rdx
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT:    cmpq $16, %rdx
; AVX2-NEXT:    vextractps $3, %xmm2, %esi
; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; AVX2-NEXT:    cmovbl (%rsp,%rdi,4), %esi
; AVX2-NEXT:    movl %esi, %edi
; AVX2-NEXT:    vmovss %xmm0, (%rsp)
; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rsi,4)
; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rsi,4)
; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rsi,4)
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rsi,4)
; AVX2-NEXT:    andl $15, %r8d
; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%r8,4)
; AVX2-NEXT:    andl $15, %r9d
; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%r9,4)
; AVX2-NEXT:    andl $15, %r10d
; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%r10,4)
; AVX2-NEXT:    andl $15, %r11d
; AVX2-NEXT:    vmovss %xmm1, (%rsp,%r11,4)
; AVX2-NEXT:    andl $15, %ebx
; AVX2-NEXT:    vextractps $1, %xmm1, (%rsp,%rbx,4)
; AVX2-NEXT:    andl $15, %r14d
; AVX2-NEXT:    vextractps $2, %xmm1, (%rsp,%r14,4)
; AVX2-NEXT:    andl $15, %r15d
; AVX2-NEXT:    vextractps $3, %xmm1, (%rsp,%r15,4)
; AVX2-NEXT:    andl $15, %r12d
; AVX2-NEXT:    vmovss %xmm2, (%rsp,%r12,4)
; AVX2-NEXT:    andl $15, %r13d
; AVX2-NEXT:    vextractps $1, %xmm2, (%rsp,%r13,4)
; AVX2-NEXT:    andl $15, %ecx
; AVX2-NEXT:    vextractps $2, %xmm2, (%rsp,%rcx,4)
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vextractps $3, %xmm2, (%rsp,%rax,4)
; AVX2-NEXT:    cmpq $15, %rdx
; AVX2-NEXT:    movl $15, %eax
; AVX2-NEXT:    cmovbq %rdx, %rax
; AVX2-NEXT:    movl %eax, %eax
; AVX2-NEXT:    movl %edi, (%rsp,%rax,4)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    leaq -40(%rbp), %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    popq %r12
; AVX2-NEXT:    popq %r13
; AVX2-NEXT:    popq %r14
; AVX2-NEXT:    popq %r15
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v16i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vpcompressd %zmm0, %zmm2 {%k1}
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v16i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $7, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmovb2m %xmm1, %k1
; AVX512VL-NEXT:    vpcompressd %zmm0, %zmm2 {%k1}
; AVX512VL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512VL-NEXT:    retq
  %out = call <16 x i32> @llvm.experimental.vector.compress(<16 x i32> %vec, <16 x i1> %mask, <16 x i32> %passthru)
  ret <16 x i32> %out
}

; <16 x float> elements. Same shape as the <16 x i32> case, with vcompressps in
; the AVX512 sequences and a branch selecting the last stored value under AVX2.
define <16 x float> @test_compress_v16f32(<16 x float> %vec, <16 x i1> %mask, <16 x float> %passthru) nounwind {
; AVX2-LABEL: test_compress_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps %ymm3, (%rsp)
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT:    vpaddd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpextrd $1, %xmm3, %eax
; AVX2-NEXT:    vmovd %xmm3, %ecx
; AVX2-NEXT:    addl %eax, %ecx
; AVX2-NEXT:    vpextrd $2, %xmm3, %eax
; AVX2-NEXT:    vpextrd $3, %xmm3, %edx
; AVX2-NEXT:    addl %eax, %edx
; AVX2-NEXT:    addl %ecx, %edx
; AVX2-NEXT:    andl $15, %edx
; AVX2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovss %xmm0, (%rsp)
; AVX2-NEXT:    vmovd %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
; AVX2-NEXT:    vpextrb $1, %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT:    vpextrb $2, %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rax,4)
; AVX2-NEXT:    vpextrb $3, %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT:    vpextrb $4, %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vpextrb $5, %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
; AVX2-NEXT:    vpextrb $6, %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT:    andl $15, %ecx
; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT:    vpextrb $7, %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rax,4)
; AVX2-NEXT:    vpextrb $8, %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT:    andl $15, %ecx
; AVX2-NEXT:    vmovss %xmm1, (%rsp,%rcx,4)
; AVX2-NEXT:    vpextrb $9, %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vextractps $1, %xmm1, (%rsp,%rax,4)
; AVX2-NEXT:    vpextrb $10, %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT:    andl $15, %ecx
; AVX2-NEXT:    vextractps $2, %xmm1, (%rsp,%rcx,4)
; AVX2-NEXT:    vpextrb $11, %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vextractps $3, %xmm1, (%rsp,%rax,4)
; AVX2-NEXT:    vpextrb $12, %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT:    andl $15, %ecx
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT:    vpextrb $13, %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT:    andl $15, %eax
; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
; AVX2-NEXT:    vpextrb $14, %xmm2, %edx
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    addq %rcx, %rdx
; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT:    andl $15, %ecx
; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT:    vpextrb $15, %xmm2, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT:    andl $15, %edx
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rdx,4)
; AVX2-NEXT:    cmpq $16, %rax
; AVX2-NEXT:    jae .LBB9_2
; AVX2-NEXT:  # %bb.1:
; AVX2-NEXT:    vmovaps %xmm3, %xmm0
; AVX2-NEXT:  .LBB9_2:
; AVX2-NEXT:    cmpq $15, %rax
; AVX2-NEXT:    movl $15, %ecx
; AVX2-NEXT:    cmovbq %rax, %rcx
; AVX2-NEXT:    movl %ecx, %eax
; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rax,4)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v16f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vcompressps %zmm0, %zmm2 {%k1}
; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v16f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $7, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmovb2m %xmm1, %k1
; AVX512VL-NEXT:    vcompressps %zmm0, %zmm2 {%k1}
; AVX512VL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512VL-NEXT:    retq
  %out = call <16 x float> @llvm.experimental.vector.compress(<16 x float> %vec, <16 x i1> %mask, <16 x float> %passthru)
  ret <16 x float> %out
}

; <8 x i64> elements with an <8 x i1> mask.
define <8 x i64> @test_compress_v8i64(<8 x i64> %vec, <8 x i1> %mask, <8 x i64> %passthru) nounwind {
; AVX2-LABEL: test_compress_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps %ymm3, (%rsp)
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX2-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
; AVX2-NEXT:    vmovq %xmm3, %rax
; AVX2-NEXT:    addl %ecx, %eax
; AVX2-NEXT:    andl $7, %eax
; AVX2-NEXT:    vpextrw $1, %xmm2, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    vmovd %xmm2, %edx
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    vpextrw $2, %xmm2, %esi
; AVX2-NEXT:    andl $1, %esi
; AVX2-NEXT:    addq %rcx, %rsi
; AVX2-NEXT:    vpextrw $3, %xmm2, %edi
; AVX2-NEXT:    andl $1, %edi
; AVX2-NEXT:    addq %rsi, %rdi
; AVX2-NEXT:    vpextrw $4, %xmm2, %r8d
; AVX2-NEXT:    andl $1, %r8d
; AVX2-NEXT:    addq %rdi, %r8
; AVX2-NEXT:    vpextrw $5, %xmm2, %r9d
; AVX2-NEXT:    andl $1, %r9d
; AVX2-NEXT:    addq %r8, %r9
; AVX2-NEXT:    vpextrw $6, %xmm2, %r10d
; AVX2-NEXT:    andl $1, %r10d
; AVX2-NEXT:    addq %r9, %r10
; AVX2-NEXT:    vpextrw $7, %xmm2, %r11d
; AVX2-NEXT:    andl $1, %r11d
; AVX2-NEXT:    addq %r10, %r11
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpextrq $1, %xmm2, %rbx
; AVX2-NEXT:    cmpq $8, %r11
; AVX2-NEXT:    cmovbq (%rsp,%rax,8), %rbx
; AVX2-NEXT:    vmovq %xmm0, (%rsp)
; AVX2-NEXT:    vpextrq $1, %xmm0, (%rsp,%rdx,8)
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rsp,%rcx,8)
; AVX2-NEXT:    vpextrq $1, %xmm0, (%rsp,%rsi,8)
947; AVX2-NEXT: andl $7, %edi 948; AVX2-NEXT: vmovq %xmm1, (%rsp,%rdi,8) 949; AVX2-NEXT: andl $7, %r8d 950; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%r8,8) 951; AVX2-NEXT: andl $7, %r9d 952; AVX2-NEXT: vmovq %xmm2, (%rsp,%r9,8) 953; AVX2-NEXT: andl $7, %r10d 954; AVX2-NEXT: vpextrq $1, %xmm2, (%rsp,%r10,8) 955; AVX2-NEXT: cmpq $7, %r11 956; AVX2-NEXT: movl $7, %eax 957; AVX2-NEXT: cmovbq %r11, %rax 958; AVX2-NEXT: movl %eax, %eax 959; AVX2-NEXT: movq %rbx, (%rsp,%rax,8) 960; AVX2-NEXT: vmovaps (%rsp), %ymm0 961; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 962; AVX2-NEXT: leaq -8(%rbp), %rsp 963; AVX2-NEXT: popq %rbx 964; AVX2-NEXT: popq %rbp 965; AVX2-NEXT: retq 966; 967; AVX512F-LABEL: test_compress_v8i64: 968; AVX512F: # %bb.0: 969; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 970; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 971; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 972; AVX512F-NEXT: vpcompressq %zmm0, %zmm2 {%k1} 973; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 974; AVX512F-NEXT: retq 975; 976; AVX512VL-LABEL: test_compress_v8i64: 977; AVX512VL: # %bb.0: 978; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1 979; AVX512VL-NEXT: vpmovw2m %xmm1, %k1 980; AVX512VL-NEXT: vpcompressq %zmm0, %zmm2 {%k1} 981; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0 982; AVX512VL-NEXT: retq 983 %out = call <8 x i64> @llvm.experimental.vector.compress(<8 x i64> %vec, <8 x i1> %mask, <8 x i64> %passthru) 984 ret <8 x i64> %out 985} 986 987define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x double> %passthru) nounwind { 988; AVX2-LABEL: test_compress_v8f64: 989; AVX2: # %bb.0: 990; AVX2-NEXT: pushq %rbp 991; AVX2-NEXT: movq %rsp, %rbp 992; AVX2-NEXT: andq $-32, %rsp 993; AVX2-NEXT: subq $96, %rsp 994; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) 995; AVX2-NEXT: vmovaps %ymm3, (%rsp) 996; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 997; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 998; AVX2-NEXT: 
vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 999; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 1000; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 1001; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 1002; AVX2-NEXT: vpaddq %xmm4, %xmm3, %xmm3 1003; AVX2-NEXT: vpextrq $1, %xmm3, %rax 1004; AVX2-NEXT: vmovq %xmm3, %rcx 1005; AVX2-NEXT: addl %eax, %ecx 1006; AVX2-NEXT: andl $7, %ecx 1007; AVX2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero 1008; AVX2-NEXT: vmovlps %xmm0, (%rsp) 1009; AVX2-NEXT: vmovd %xmm2, %eax 1010; AVX2-NEXT: andl $1, %eax 1011; AVX2-NEXT: vmovhps %xmm0, (%rsp,%rax,8) 1012; AVX2-NEXT: vpextrw $1, %xmm2, %ecx 1013; AVX2-NEXT: andl $1, %ecx 1014; AVX2-NEXT: addq %rax, %rcx 1015; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 1016; AVX2-NEXT: vmovlps %xmm0, (%rsp,%rcx,8) 1017; AVX2-NEXT: vpextrw $2, %xmm2, %eax 1018; AVX2-NEXT: andl $1, %eax 1019; AVX2-NEXT: addq %rcx, %rax 1020; AVX2-NEXT: vmovhps %xmm0, (%rsp,%rax,8) 1021; AVX2-NEXT: vpextrw $3, %xmm2, %ecx 1022; AVX2-NEXT: andl $1, %ecx 1023; AVX2-NEXT: addq %rax, %rcx 1024; AVX2-NEXT: vpextrw $4, %xmm2, %eax 1025; AVX2-NEXT: andl $1, %eax 1026; AVX2-NEXT: addq %rcx, %rax 1027; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1028; AVX2-NEXT: andl $7, %ecx 1029; AVX2-NEXT: vmovlpd %xmm1, (%rsp,%rcx,8) 1030; AVX2-NEXT: vpextrw $5, %xmm2, %ecx 1031; AVX2-NEXT: andl $1, %ecx 1032; AVX2-NEXT: addq %rax, %rcx 1033; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 1034; AVX2-NEXT: andl $7, %eax 1035; AVX2-NEXT: vmovhpd %xmm1, (%rsp,%rax,8) 1036; AVX2-NEXT: vpextrw $6, %xmm2, %edx 1037; AVX2-NEXT: andl $1, %edx 1038; AVX2-NEXT: addq %rcx, %rdx 1039; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1040; AVX2-NEXT: andl $7, %ecx 1041; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 1042; AVX2-NEXT: vmovlpd %xmm0, (%rsp,%rcx,8) 1043; AVX2-NEXT: vpextrw $7, %xmm2, %eax 1044; AVX2-NEXT: andl $1, %eax 1045; AVX2-NEXT: addq %rdx, 
%rax 1046; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1047; AVX2-NEXT: andl $7, %edx 1048; AVX2-NEXT: vmovhpd %xmm0, (%rsp,%rdx,8) 1049; AVX2-NEXT: cmpq $8, %rax 1050; AVX2-NEXT: jb .LBB11_2 1051; AVX2-NEXT: # %bb.1: 1052; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] 1053; AVX2-NEXT: .LBB11_2: 1054; AVX2-NEXT: cmpq $7, %rax 1055; AVX2-NEXT: movl $7, %ecx 1056; AVX2-NEXT: cmovbq %rax, %rcx 1057; AVX2-NEXT: movl %ecx, %eax 1058; AVX2-NEXT: vmovsd %xmm3, (%rsp,%rax,8) 1059; AVX2-NEXT: vmovaps (%rsp), %ymm0 1060; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 1061; AVX2-NEXT: movq %rbp, %rsp 1062; AVX2-NEXT: popq %rbp 1063; AVX2-NEXT: retq 1064; 1065; AVX512F-LABEL: test_compress_v8f64: 1066; AVX512F: # %bb.0: 1067; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 1068; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 1069; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 1070; AVX512F-NEXT: vcompresspd %zmm0, %zmm2 {%k1} 1071; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 1072; AVX512F-NEXT: retq 1073; 1074; AVX512VL-LABEL: test_compress_v8f64: 1075; AVX512VL: # %bb.0: 1076; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1 1077; AVX512VL-NEXT: vpmovw2m %xmm1, %k1 1078; AVX512VL-NEXT: vcompresspd %zmm0, %zmm2 {%k1} 1079; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0 1080; AVX512VL-NEXT: retq 1081 %out = call <8 x double> @llvm.experimental.vector.compress(<8 x double> %vec, <8 x i1> %mask, <8 x double> %passthru) 1082 ret <8 x double> %out 1083} 1084 1085define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> %passthru) nounwind { 1086; AVX2-LABEL: test_compress_v16i8: 1087; AVX2: # %bb.0: 1088; AVX2-NEXT: pushq %rbp 1089; AVX2-NEXT: pushq %r15 1090; AVX2-NEXT: pushq %r14 1091; AVX2-NEXT: pushq %r13 1092; AVX2-NEXT: pushq %r12 1093; AVX2-NEXT: pushq %rbx 1094; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 1095; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1096; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 1097; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 1098; AVX2-NEXT: vmovaps %xmm2, 
-{{[0-9]+}}(%rsp) 1099; AVX2-NEXT: vpextrb $1, %xmm1, %r11d 1100; AVX2-NEXT: vmovd %xmm1, %eax 1101; AVX2-NEXT: movzbl %al, %edx 1102; AVX2-NEXT: # kill: def $al killed $al killed $eax 1103; AVX2-NEXT: andb $1, %al 1104; AVX2-NEXT: subb %r11b, %al 1105; AVX2-NEXT: vpextrb $2, %xmm1, %esi 1106; AVX2-NEXT: subb %sil, %al 1107; AVX2-NEXT: vpextrb $3, %xmm1, %r13d 1108; AVX2-NEXT: subb %r13b, %al 1109; AVX2-NEXT: vpextrb $4, %xmm1, %r12d 1110; AVX2-NEXT: subb %r12b, %al 1111; AVX2-NEXT: vpextrb $5, %xmm1, %r15d 1112; AVX2-NEXT: subb %r15b, %al 1113; AVX2-NEXT: vpextrb $6, %xmm1, %r14d 1114; AVX2-NEXT: subb %r14b, %al 1115; AVX2-NEXT: vpextrb $7, %xmm1, %ebp 1116; AVX2-NEXT: subb %bpl, %al 1117; AVX2-NEXT: vpextrb $8, %xmm1, %ebx 1118; AVX2-NEXT: subb %bl, %al 1119; AVX2-NEXT: vpextrb $9, %xmm1, %r10d 1120; AVX2-NEXT: subb %r10b, %al 1121; AVX2-NEXT: vpextrb $10, %xmm1, %r9d 1122; AVX2-NEXT: subb %r9b, %al 1123; AVX2-NEXT: vpextrb $11, %xmm1, %r8d 1124; AVX2-NEXT: subb %r8b, %al 1125; AVX2-NEXT: vpextrb $12, %xmm1, %edi 1126; AVX2-NEXT: subb %dil, %al 1127; AVX2-NEXT: vpextrb $13, %xmm1, %ecx 1128; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1129; AVX2-NEXT: subb %cl, %al 1130; AVX2-NEXT: vpextrb $14, %xmm1, %ecx 1131; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1132; AVX2-NEXT: subb %cl, %al 1133; AVX2-NEXT: vpextrb $15, %xmm1, %ecx 1134; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1135; AVX2-NEXT: subb %cl, %al 1136; AVX2-NEXT: movzbl %al, %eax 1137; AVX2-NEXT: andl $15, %eax 1138; AVX2-NEXT: movzbl -40(%rsp,%rax), %eax 1139; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1140; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp) 1141; AVX2-NEXT: andl $1, %edx 1142; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rdx) 1143; AVX2-NEXT: movzbl %r11b, %eax 1144; AVX2-NEXT: andl $1, %eax 1145; AVX2-NEXT: addq %rdx, %rax 1146; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%rax) 1147; AVX2-NEXT: movzbl %sil, %ecx 1148; 
AVX2-NEXT: andl $1, %ecx 1149; AVX2-NEXT: addq %rax, %rcx 1150; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rcx) 1151; AVX2-NEXT: movzbl %r13b, %eax 1152; AVX2-NEXT: andl $1, %eax 1153; AVX2-NEXT: addq %rcx, %rax 1154; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rax) 1155; AVX2-NEXT: movzbl %r12b, %ecx 1156; AVX2-NEXT: andl $1, %ecx 1157; AVX2-NEXT: addq %rax, %rcx 1158; AVX2-NEXT: movzbl %r15b, %eax 1159; AVX2-NEXT: andl $1, %eax 1160; AVX2-NEXT: addq %rcx, %rax 1161; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1162; AVX2-NEXT: andl $15, %ecx 1163; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%rcx) 1164; AVX2-NEXT: movzbl %r14b, %ecx 1165; AVX2-NEXT: andl $1, %ecx 1166; AVX2-NEXT: addq %rax, %rcx 1167; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 1168; AVX2-NEXT: andl $15, %eax 1169; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%rax) 1170; AVX2-NEXT: movzbl %bpl, %eax 1171; AVX2-NEXT: andl $1, %eax 1172; AVX2-NEXT: addq %rcx, %rax 1173; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1174; AVX2-NEXT: andl $15, %ecx 1175; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%rcx) 1176; AVX2-NEXT: movzbl %bl, %ecx 1177; AVX2-NEXT: andl $1, %ecx 1178; AVX2-NEXT: addq %rax, %rcx 1179; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 1180; AVX2-NEXT: andl $15, %eax 1181; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rax) 1182; AVX2-NEXT: movzbl %r10b, %eax 1183; AVX2-NEXT: andl $1, %eax 1184; AVX2-NEXT: addq %rcx, %rax 1185; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1186; AVX2-NEXT: andl $15, %ecx 1187; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%rcx) 1188; AVX2-NEXT: movzbl %r9b, %ecx 1189; AVX2-NEXT: andl $1, %ecx 1190; AVX2-NEXT: addq %rax, %rcx 1191; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 1192; AVX2-NEXT: andl $15, %eax 1193; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%rax) 1194; AVX2-NEXT: movzbl %r8b, %eax 1195; AVX2-NEXT: andl $1, %eax 1196; AVX2-NEXT: addq %rcx, %rax 1197; AVX2-NEXT: # kill: def $ecx killed 
$ecx killed $rcx def $rcx 1198; AVX2-NEXT: andl $15, %ecx 1199; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%rcx) 1200; AVX2-NEXT: movzbl %dil, %ecx 1201; AVX2-NEXT: andl $1, %ecx 1202; AVX2-NEXT: addq %rax, %rcx 1203; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 1204; AVX2-NEXT: andl $15, %eax 1205; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%rax) 1206; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 1207; AVX2-NEXT: andl $1, %eax 1208; AVX2-NEXT: addq %rcx, %rax 1209; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1210; AVX2-NEXT: andl $15, %ecx 1211; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rcx) 1212; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload 1213; AVX2-NEXT: andl $1, %ecx 1214; AVX2-NEXT: addq %rax, %rcx 1215; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 1216; AVX2-NEXT: andl $15, %eax 1217; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rax) 1218; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload 1219; AVX2-NEXT: andl $1, %eax 1220; AVX2-NEXT: addq %rcx, %rax 1221; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1222; AVX2-NEXT: andl $15, %ecx 1223; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rcx) 1224; AVX2-NEXT: cmpq $15, %rax 1225; AVX2-NEXT: movl $15, %ecx 1226; AVX2-NEXT: cmovbq %rax, %rcx 1227; AVX2-NEXT: vpextrb $15, %xmm0, %eax 1228; AVX2-NEXT: cmovbel {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload 1229; AVX2-NEXT: movb %al, -40(%rsp,%rcx) 1230; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 1231; AVX2-NEXT: popq %rbx 1232; AVX2-NEXT: popq %r12 1233; AVX2-NEXT: popq %r13 1234; AVX2-NEXT: popq %r14 1235; AVX2-NEXT: popq %r15 1236; AVX2-NEXT: popq %rbp 1237; AVX2-NEXT: retq 1238; 1239; AVX512F-LABEL: test_compress_v16i8: 1240; AVX512F: # %bb.0: 1241; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 1242; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 1243; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 1244; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1245; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero 1246; AVX512F-NEXT: vpcompressd %zmm0, %zmm1 {%k1} 1247; AVX512F-NEXT: vpmovdb %zmm1, %xmm0 1248; AVX512F-NEXT: vzeroupper 1249; AVX512F-NEXT: retq 1250; 1251; AVX512VL-LABEL: test_compress_v16i8: 1252; AVX512VL: # %bb.0: 1253; AVX512VL-NEXT: vpsllw $7, %xmm1, %xmm1 1254; AVX512VL-NEXT: vpmovb2m %xmm1, %k1 1255; AVX512VL-NEXT: vpcompressb %xmm0, %xmm2 {%k1} 1256; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 1257; AVX512VL-NEXT: retq 1258 %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> %passthru) 1259 ret <16 x i8> %out 1260} 1261 1262define <8 x i16> @test_compress_v8i16(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> %passthru) nounwind { 1263; AVX2-LABEL: test_compress_v8i16: 1264; AVX2: # %bb.0: 1265; AVX2-NEXT: pushq %rbx 1266; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1 1267; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1 1268; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp) 1269; AVX2-NEXT: vpextrw $1, %xmm1, %eax 1270; AVX2-NEXT: andl $1, %eax 1271; AVX2-NEXT: vmovd %xmm1, %ecx 1272; AVX2-NEXT: andl $1, %ecx 1273; AVX2-NEXT: leal (%rcx,%rax), %esi 1274; AVX2-NEXT: vpextrw $2, %xmm1, %edi 1275; AVX2-NEXT: andl $1, %edi 1276; AVX2-NEXT: vpextrw $3, %xmm1, %edx 1277; 
AVX2-NEXT: andl $1, %edx 1278; AVX2-NEXT: leal (%rdi,%rdx), %r10d 1279; AVX2-NEXT: addl %esi, %r10d 1280; AVX2-NEXT: vpextrw $4, %xmm1, %r9d 1281; AVX2-NEXT: andl $1, %r9d 1282; AVX2-NEXT: vpextrw $5, %xmm1, %esi 1283; AVX2-NEXT: andl $1, %esi 1284; AVX2-NEXT: leal (%r9,%rsi), %r11d 1285; AVX2-NEXT: vpextrw $6, %xmm1, %r8d 1286; AVX2-NEXT: andl $1, %r8d 1287; AVX2-NEXT: addl %r8d, %r11d 1288; AVX2-NEXT: addl %r10d, %r11d 1289; AVX2-NEXT: vpextrw $7, %xmm1, %r10d 1290; AVX2-NEXT: andl $1, %r10d 1291; AVX2-NEXT: addl %r10d, %r11d 1292; AVX2-NEXT: andl $7, %r11d 1293; AVX2-NEXT: addq %rcx, %rax 1294; AVX2-NEXT: addq %rax, %rdi 1295; AVX2-NEXT: addq %rdi, %rdx 1296; AVX2-NEXT: addq %rdx, %r9 1297; AVX2-NEXT: addq %r9, %rsi 1298; AVX2-NEXT: addq %rsi, %r8 1299; AVX2-NEXT: addq %r8, %r10 1300; AVX2-NEXT: vpextrw $7, %xmm0, %ebx 1301; AVX2-NEXT: cmpq $8, %r10 1302; AVX2-NEXT: cmovbw -16(%rsp,%r11,2), %bx 1303; AVX2-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp) 1304; AVX2-NEXT: vpextrw $1, %xmm0, -16(%rsp,%rcx,2) 1305; AVX2-NEXT: vpextrw $2, %xmm0, -16(%rsp,%rax,2) 1306; AVX2-NEXT: vpextrw $3, %xmm0, -16(%rsp,%rdi,2) 1307; AVX2-NEXT: andl $7, %edx 1308; AVX2-NEXT: vpextrw $4, %xmm0, -16(%rsp,%rdx,2) 1309; AVX2-NEXT: andl $7, %r9d 1310; AVX2-NEXT: vpextrw $5, %xmm0, -16(%rsp,%r9,2) 1311; AVX2-NEXT: andl $7, %esi 1312; AVX2-NEXT: vpextrw $6, %xmm0, -16(%rsp,%rsi,2) 1313; AVX2-NEXT: andl $7, %r8d 1314; AVX2-NEXT: vpextrw $7, %xmm0, -16(%rsp,%r8,2) 1315; AVX2-NEXT: cmpq $7, %r10 1316; AVX2-NEXT: movl $7, %eax 1317; AVX2-NEXT: cmovbq %r10, %rax 1318; AVX2-NEXT: movl %eax, %eax 1319; AVX2-NEXT: movw %bx, -16(%rsp,%rax,2) 1320; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 1321; AVX2-NEXT: popq %rbx 1322; AVX2-NEXT: retq 1323; 1324; AVX512F-LABEL: test_compress_v8i16: 1325; AVX512F: # %bb.0: 1326; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 1327; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 1328; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 1329; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1330; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero 1331; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} 1332; AVX512F-NEXT: vpmovqw %zmm1, %xmm0 1333; AVX512F-NEXT: vzeroupper 1334; AVX512F-NEXT: retq 1335; 1336; AVX512VL-LABEL: test_compress_v8i16: 1337; AVX512VL: # %bb.0: 1338; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1 1339; AVX512VL-NEXT: vpmovw2m %xmm1, %k1 1340; AVX512VL-NEXT: vpcompressw %xmm0, %xmm2 {%k1} 1341; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 1342; AVX512VL-NEXT: retq 1343 %out = call <8 x i16> @llvm.experimental.vector.compress(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> %passthru) 1344 ret <8 x i16> %out 1345} 1346 1347define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> %passthru) nounwind { 1348; AVX2-LABEL: test_compress_v32i8: 1349; AVX2: # %bb.0: 1350; AVX2-NEXT: pushq %rbp 1351; AVX2-NEXT: movq %rsp, %rbp 1352; AVX2-NEXT: andq $-32, %rsp 1353; AVX2-NEXT: subq $64, %rsp 1354; AVX2-NEXT: vpsllw $7, %ymm1, %ymm1 1355; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1356; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 1357; AVX2-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm3 1358; AVX2-NEXT: vmovaps %ymm2, (%rsp) 1359; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 1360; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1361; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm4 1362; AVX2-NEXT: vpand %xmm2, %xmm3, %xmm2 1363; AVX2-NEXT: vpaddb %xmm4, %xmm2, %xmm2 1364; AVX2-NEXT: vpextrb $1, %xmm2, %eax 1365; AVX2-NEXT: vmovd %xmm2, %ecx 1366; AVX2-NEXT: addb %al, %cl 1367; AVX2-NEXT: vpextrb $2, %xmm2, %eax 1368; AVX2-NEXT: vpextrb $3, %xmm2, %edx 1369; AVX2-NEXT: addb %al, %dl 
1370; AVX2-NEXT: addb %cl, %dl 1371; AVX2-NEXT: vpextrb $4, %xmm2, %eax 1372; AVX2-NEXT: vpextrb $5, %xmm2, %ecx 1373; AVX2-NEXT: addb %al, %cl 1374; AVX2-NEXT: vpextrb $6, %xmm2, %eax 1375; AVX2-NEXT: addb %cl, %al 1376; AVX2-NEXT: addb %dl, %al 1377; AVX2-NEXT: vpextrb $7, %xmm2, %ecx 1378; AVX2-NEXT: vpextrb $8, %xmm2, %edx 1379; AVX2-NEXT: addb %cl, %dl 1380; AVX2-NEXT: vpextrb $9, %xmm2, %ecx 1381; AVX2-NEXT: addb %dl, %cl 1382; AVX2-NEXT: vpextrb $10, %xmm2, %edx 1383; AVX2-NEXT: addb %cl, %dl 1384; AVX2-NEXT: addb %al, %dl 1385; AVX2-NEXT: vpextrb $11, %xmm2, %eax 1386; AVX2-NEXT: vpextrb $12, %xmm2, %ecx 1387; AVX2-NEXT: addb %al, %cl 1388; AVX2-NEXT: vpextrb $13, %xmm2, %eax 1389; AVX2-NEXT: addb %cl, %al 1390; AVX2-NEXT: vpextrb $14, %xmm2, %ecx 1391; AVX2-NEXT: addb %al, %cl 1392; AVX2-NEXT: vpextrb $15, %xmm2, %eax 1393; AVX2-NEXT: addb %cl, %al 1394; AVX2-NEXT: addb %dl, %al 1395; AVX2-NEXT: movzbl %al, %eax 1396; AVX2-NEXT: andl $31, %eax 1397; AVX2-NEXT: movzbl (%rsp,%rax), %eax 1398; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp) 1399; AVX2-NEXT: vmovd %xmm3, %ecx 1400; AVX2-NEXT: andl $1, %ecx 1401; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rcx) 1402; AVX2-NEXT: vpextrb $1, %xmm3, %edx 1403; AVX2-NEXT: andl $1, %edx 1404; AVX2-NEXT: addq %rcx, %rdx 1405; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rdx) 1406; AVX2-NEXT: vpextrb $2, %xmm3, %ecx 1407; AVX2-NEXT: andl $1, %ecx 1408; AVX2-NEXT: addq %rdx, %rcx 1409; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rcx) 1410; AVX2-NEXT: vpextrb $3, %xmm3, %edx 1411; AVX2-NEXT: andl $1, %edx 1412; AVX2-NEXT: addq %rcx, %rdx 1413; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rdx) 1414; AVX2-NEXT: vpextrb $4, %xmm3, %ecx 1415; AVX2-NEXT: andl $1, %ecx 1416; AVX2-NEXT: addq %rdx, %rcx 1417; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rcx) 1418; AVX2-NEXT: vpextrb $5, %xmm3, %edx 1419; AVX2-NEXT: andl $1, %edx 1420; AVX2-NEXT: addq %rcx, %rdx 1421; AVX2-NEXT: vpextrb $6, %xmm3, %ecx 1422; AVX2-NEXT: andl $1, %ecx 1423; AVX2-NEXT: addq %rdx, %rcx 1424; 
AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1425; AVX2-NEXT: andl $31, %edx 1426; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rdx) 1427; AVX2-NEXT: vpextrb $7, %xmm3, %edx 1428; AVX2-NEXT: andl $1, %edx 1429; AVX2-NEXT: addq %rcx, %rdx 1430; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1431; AVX2-NEXT: andl $31, %ecx 1432; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rcx) 1433; AVX2-NEXT: vpextrb $8, %xmm3, %ecx 1434; AVX2-NEXT: andl $1, %ecx 1435; AVX2-NEXT: addq %rdx, %rcx 1436; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1437; AVX2-NEXT: andl $31, %edx 1438; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdx) 1439; AVX2-NEXT: vpextrb $9, %xmm3, %edx 1440; AVX2-NEXT: andl $1, %edx 1441; AVX2-NEXT: addq %rcx, %rdx 1442; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1443; AVX2-NEXT: andl $31, %ecx 1444; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rcx) 1445; AVX2-NEXT: vpextrb $10, %xmm3, %ecx 1446; AVX2-NEXT: andl $1, %ecx 1447; AVX2-NEXT: addq %rdx, %rcx 1448; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1449; AVX2-NEXT: andl $31, %edx 1450; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rdx) 1451; AVX2-NEXT: vpextrb $11, %xmm3, %edx 1452; AVX2-NEXT: andl $1, %edx 1453; AVX2-NEXT: addq %rcx, %rdx 1454; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1455; AVX2-NEXT: andl $31, %ecx 1456; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rcx) 1457; AVX2-NEXT: vpextrb $12, %xmm3, %ecx 1458; AVX2-NEXT: andl $1, %ecx 1459; AVX2-NEXT: addq %rdx, %rcx 1460; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1461; AVX2-NEXT: andl $31, %edx 1462; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rdx) 1463; AVX2-NEXT: vpextrb $13, %xmm3, %edx 1464; AVX2-NEXT: andl $1, %edx 1465; AVX2-NEXT: addq %rcx, %rdx 1466; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1467; AVX2-NEXT: andl $31, %ecx 1468; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rcx) 1469; AVX2-NEXT: vpextrb $14, %xmm3, %ecx 1470; AVX2-NEXT: andl $1, %ecx 1471; 
AVX2-NEXT: addq %rdx, %rcx 1472; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1473; AVX2-NEXT: andl $31, %edx 1474; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rdx) 1475; AVX2-NEXT: vpextrb $15, %xmm3, %edx 1476; AVX2-NEXT: andl $1, %edx 1477; AVX2-NEXT: addq %rcx, %rdx 1478; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1479; AVX2-NEXT: andl $31, %ecx 1480; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rcx) 1481; AVX2-NEXT: vmovd %xmm1, %ecx 1482; AVX2-NEXT: andl $1, %ecx 1483; AVX2-NEXT: addq %rdx, %rcx 1484; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1485; AVX2-NEXT: andl $31, %edx 1486; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1487; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rdx) 1488; AVX2-NEXT: vpextrb $1, %xmm1, %edx 1489; AVX2-NEXT: andl $1, %edx 1490; AVX2-NEXT: addq %rcx, %rdx 1491; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1492; AVX2-NEXT: andl $31, %ecx 1493; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rcx) 1494; AVX2-NEXT: vpextrb $2, %xmm1, %ecx 1495; AVX2-NEXT: andl $1, %ecx 1496; AVX2-NEXT: addq %rdx, %rcx 1497; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1498; AVX2-NEXT: andl $31, %edx 1499; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rdx) 1500; AVX2-NEXT: vpextrb $3, %xmm1, %edx 1501; AVX2-NEXT: andl $1, %edx 1502; AVX2-NEXT: addq %rcx, %rdx 1503; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1504; AVX2-NEXT: andl $31, %ecx 1505; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rcx) 1506; AVX2-NEXT: vpextrb $4, %xmm1, %ecx 1507; AVX2-NEXT: andl $1, %ecx 1508; AVX2-NEXT: addq %rdx, %rcx 1509; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1510; AVX2-NEXT: andl $31, %edx 1511; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rdx) 1512; AVX2-NEXT: vpextrb $5, %xmm1, %edx 1513; AVX2-NEXT: andl $1, %edx 1514; AVX2-NEXT: addq %rcx, %rdx 1515; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1516; AVX2-NEXT: andl $31, %ecx 1517; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rcx) 1518; AVX2-NEXT: 
vpextrb $6, %xmm1, %ecx 1519; AVX2-NEXT: andl $1, %ecx 1520; AVX2-NEXT: addq %rdx, %rcx 1521; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1522; AVX2-NEXT: andl $31, %edx 1523; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rdx) 1524; AVX2-NEXT: vpextrb $7, %xmm1, %edx 1525; AVX2-NEXT: andl $1, %edx 1526; AVX2-NEXT: addq %rcx, %rdx 1527; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1528; AVX2-NEXT: andl $31, %ecx 1529; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rcx) 1530; AVX2-NEXT: vpextrb $8, %xmm1, %ecx 1531; AVX2-NEXT: andl $1, %ecx 1532; AVX2-NEXT: addq %rdx, %rcx 1533; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1534; AVX2-NEXT: andl $31, %edx 1535; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdx) 1536; AVX2-NEXT: vpextrb $9, %xmm1, %edx 1537; AVX2-NEXT: andl $1, %edx 1538; AVX2-NEXT: addq %rcx, %rdx 1539; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1540; AVX2-NEXT: andl $31, %ecx 1541; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rcx) 1542; AVX2-NEXT: vpextrb $10, %xmm1, %ecx 1543; AVX2-NEXT: andl $1, %ecx 1544; AVX2-NEXT: addq %rdx, %rcx 1545; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1546; AVX2-NEXT: andl $31, %edx 1547; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rdx) 1548; AVX2-NEXT: vpextrb $11, %xmm1, %edx 1549; AVX2-NEXT: andl $1, %edx 1550; AVX2-NEXT: addq %rcx, %rdx 1551; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1552; AVX2-NEXT: andl $31, %ecx 1553; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rcx) 1554; AVX2-NEXT: vpextrb $12, %xmm1, %ecx 1555; AVX2-NEXT: andl $1, %ecx 1556; AVX2-NEXT: addq %rdx, %rcx 1557; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1558; AVX2-NEXT: andl $31, %edx 1559; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rdx) 1560; AVX2-NEXT: vpextrb $13, %xmm1, %edx 1561; AVX2-NEXT: andl $1, %edx 1562; AVX2-NEXT: addq %rcx, %rdx 1563; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1564; AVX2-NEXT: andl $31, %ecx 1565; AVX2-NEXT: vpextrb $13, %xmm0, 
(%rsp,%rcx) 1566; AVX2-NEXT: vpextrb $14, %xmm1, %ecx 1567; AVX2-NEXT: andl $1, %ecx 1568; AVX2-NEXT: addq %rdx, %rcx 1569; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx 1570; AVX2-NEXT: andl $31, %edx 1571; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rdx) 1572; AVX2-NEXT: vpextrb $15, %xmm1, %edx 1573; AVX2-NEXT: andl $1, %edx 1574; AVX2-NEXT: addq %rcx, %rdx 1575; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 1576; AVX2-NEXT: andl $31, %ecx 1577; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rcx) 1578; AVX2-NEXT: cmpq $31, %rdx 1579; AVX2-NEXT: movl $31, %ecx 1580; AVX2-NEXT: cmovbq %rdx, %rcx 1581; AVX2-NEXT: vpextrb $15, %xmm0, %edx 1582; AVX2-NEXT: cmovbel %eax, %edx 1583; AVX2-NEXT: movb %dl, (%rsp,%rcx) 1584; AVX2-NEXT: vmovaps (%rsp), %ymm0 1585; AVX2-NEXT: movq %rbp, %rsp 1586; AVX2-NEXT: popq %rbp 1587; AVX2-NEXT: retq 1588; 1589; AVX512F-LABEL: test_compress_v32i8: 1590; AVX512F: # %bb.0: 1591; AVX512F-NEXT: pushq %rbp 1592; AVX512F-NEXT: movq %rsp, %rbp 1593; AVX512F-NEXT: andq $-32, %rsp 1594; AVX512F-NEXT: subq $64, %rsp 1595; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 1596; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 1597; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 1598; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 1599; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm3 1600; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 1601; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k2 1602; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1603; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k2} {z} 1604; AVX512F-NEXT: vpmovdb %zmm3, (%rsp) 1605; AVX512F-NEXT: kshiftrw $8, %k2, %k0 1606; AVX512F-NEXT: kxorw %k0, %k2, %k0 1607; 
AVX512F-NEXT: kshiftrw $4, %k0, %k2 1608; AVX512F-NEXT: kxorw %k2, %k0, %k0 1609; AVX512F-NEXT: kshiftrw $2, %k0, %k2 1610; AVX512F-NEXT: kxorw %k2, %k0, %k0 1611; AVX512F-NEXT: kshiftrw $1, %k0, %k2 1612; AVX512F-NEXT: kxorw %k2, %k0, %k0 1613; AVX512F-NEXT: kmovw %k0, %eax 1614; AVX512F-NEXT: andl $31, %eax 1615; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 1616; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1617; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z} 1618; AVX512F-NEXT: vpmovdb %zmm0, (%rsp,%rax) 1619; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm0 1620; AVX512F-NEXT: vpblendvb %ymm0, (%rsp), %ymm2, %ymm0 1621; AVX512F-NEXT: movq %rbp, %rsp 1622; AVX512F-NEXT: popq %rbp 1623; AVX512F-NEXT: retq 1624; 1625; AVX512VL-LABEL: test_compress_v32i8: 1626; AVX512VL: # %bb.0: 1627; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1 1628; AVX512VL-NEXT: vpmovb2m %ymm1, %k1 1629; AVX512VL-NEXT: vpcompressb %ymm0, %ymm2 {%k1} 1630; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 1631; AVX512VL-NEXT: retq 1632 %out = call <32 x i8> @llvm.experimental.vector.compress(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> %passthru) 1633 ret <32 x i8> %out 1634} 1635 1636define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x i16> %passthru) nounwind { 1637; AVX2-LABEL: test_compress_v16i16: 1638; AVX2: # %bb.0: 1639; AVX2-NEXT: pushq %rbp 1640; AVX2-NEXT: movq %rsp, %rbp 1641; AVX2-NEXT: pushq %r15 1642; AVX2-NEXT: pushq %r14 1643; AVX2-NEXT: pushq %r13 1644; AVX2-NEXT: pushq %r12 1645; AVX2-NEXT: pushq %rbx 1646; AVX2-NEXT: andq $-32, %rsp 1647; AVX2-NEXT: subq $96, %rsp 1648; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1649; AVX2-NEXT: vpsllw $15, %ymm1, %ymm3 1650; AVX2-NEXT: vpsraw $15, %ymm3, %ymm1 1651; AVX2-NEXT: vmovaps %ymm2, (%rsp) 1652; AVX2-NEXT: vpsrlw $15, %ymm3, %ymm2 1653; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1654; AVX2-NEXT: vpaddw %xmm3, %xmm2, %xmm2 1655; AVX2-NEXT: vpextrw $1, %xmm2, %eax 1656; AVX2-NEXT: vmovd %xmm2, %ecx 1657; AVX2-NEXT: addl %eax, %ecx 1658; AVX2-NEXT: vpextrw $2, %xmm2, %eax 1659; AVX2-NEXT: vpextrw $3, %xmm2, %edx 1660; AVX2-NEXT: addl %eax, %edx 1661; AVX2-NEXT: addl %ecx, %edx 1662; AVX2-NEXT: vpextrw $4, %xmm2, %eax 1663; AVX2-NEXT: vpextrw $5, %xmm2, %ecx 1664; AVX2-NEXT: addl %eax, %ecx 1665; AVX2-NEXT: vpextrw $6, %xmm2, %eax 1666; AVX2-NEXT: addl %ecx, %eax 1667; AVX2-NEXT: addl %edx, %eax 1668; AVX2-NEXT: vpextrw $7, %xmm2, %ecx 1669; AVX2-NEXT: addl %eax, %ecx 1670; AVX2-NEXT: andl $15, %ecx 1671; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1672; AVX2-NEXT: vpextrw $1, %xmm1, %eax 1673; AVX2-NEXT: andl $1, %eax 1674; AVX2-NEXT: vmovd %xmm1, %ecx 1675; AVX2-NEXT: andl $1, %ecx 1676; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1677; AVX2-NEXT: addq %rcx, %rax 1678; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1679; AVX2-NEXT: vpextrw $2, %xmm1, %ecx 1680; AVX2-NEXT: andl $1, %ecx 1681; AVX2-NEXT: addq %rax, %rcx 1682; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1683; AVX2-NEXT: vpextrw $3, %xmm1, %eax 1684; AVX2-NEXT: andl $1, %eax 1685; AVX2-NEXT: addq %rcx, %rax 1686; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1687; AVX2-NEXT: vpextrw $4, %xmm1, %r8d 1688; AVX2-NEXT: andl $1, %r8d 1689; AVX2-NEXT: addq %rax, %r8 1690; AVX2-NEXT: vpextrw $5, %xmm1, %r9d 1691; AVX2-NEXT: andl $1, %r9d 1692; AVX2-NEXT: addq %r8, %r9 1693; AVX2-NEXT: 
vpextrw $6, %xmm1, %r10d 1694; AVX2-NEXT: andl $1, %r10d 1695; AVX2-NEXT: addq %r9, %r10 1696; AVX2-NEXT: vpextrw $7, %xmm1, %r11d 1697; AVX2-NEXT: andl $1, %r11d 1698; AVX2-NEXT: addq %r10, %r11 1699; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 1700; AVX2-NEXT: vmovd %xmm1, %ebx 1701; AVX2-NEXT: andl $1, %ebx 1702; AVX2-NEXT: addq %r11, %rbx 1703; AVX2-NEXT: vpextrw $1, %xmm1, %r14d 1704; AVX2-NEXT: andl $1, %r14d 1705; AVX2-NEXT: addq %rbx, %r14 1706; AVX2-NEXT: vpextrw $2, %xmm1, %r15d 1707; AVX2-NEXT: andl $1, %r15d 1708; AVX2-NEXT: addq %r14, %r15 1709; AVX2-NEXT: vpextrw $3, %xmm1, %r12d 1710; AVX2-NEXT: andl $1, %r12d 1711; AVX2-NEXT: addq %r15, %r12 1712; AVX2-NEXT: vpextrw $4, %xmm1, %r13d 1713; AVX2-NEXT: andl $1, %r13d 1714; AVX2-NEXT: addq %r12, %r13 1715; AVX2-NEXT: vpextrw $5, %xmm1, %edx 1716; AVX2-NEXT: andl $1, %edx 1717; AVX2-NEXT: addq %r13, %rdx 1718; AVX2-NEXT: vpextrw $6, %xmm1, %ecx 1719; AVX2-NEXT: andl $1, %ecx 1720; AVX2-NEXT: addq %rdx, %rcx 1721; AVX2-NEXT: vpextrw $7, %xmm1, %edi 1722; AVX2-NEXT: andl $1, %edi 1723; AVX2-NEXT: addq %rcx, %rdi 1724; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1725; AVX2-NEXT: cmpq $16, %rdi 1726; AVX2-NEXT: vpextrw $7, %xmm1, %eax 1727; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload 1728; AVX2-NEXT: cmovbw (%rsp,%rsi,2), %ax 1729; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1730; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp) 1731; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload 1732; AVX2-NEXT: vpextrw $1, %xmm0, (%rsp,%rsi,2) 1733; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload 1734; AVX2-NEXT: vpextrw $2, %xmm0, (%rsp,%rsi,2) 1735; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload 1736; AVX2-NEXT: vpextrw $3, %xmm0, (%rsp,%rsi,2) 1737; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 1738; AVX2-NEXT: vpextrw $4, %xmm0, (%rsp,%rax,2) 1739; AVX2-NEXT: andl $15, %r8d 1740; AVX2-NEXT: vpextrw $5, %xmm0, (%rsp,%r8,2) 1741; 
AVX2-NEXT: andl $15, %r9d 1742; AVX2-NEXT: vpextrw $6, %xmm0, (%rsp,%r9,2) 1743; AVX2-NEXT: andl $15, %r10d 1744; AVX2-NEXT: vpextrw $7, %xmm0, (%rsp,%r10,2) 1745; AVX2-NEXT: andl $15, %r11d 1746; AVX2-NEXT: vpextrw $0, %xmm1, (%rsp,%r11,2) 1747; AVX2-NEXT: andl $15, %ebx 1748; AVX2-NEXT: vpextrw $1, %xmm1, (%rsp,%rbx,2) 1749; AVX2-NEXT: andl $15, %r14d 1750; AVX2-NEXT: vpextrw $2, %xmm1, (%rsp,%r14,2) 1751; AVX2-NEXT: andl $15, %r15d 1752; AVX2-NEXT: vpextrw $3, %xmm1, (%rsp,%r15,2) 1753; AVX2-NEXT: andl $15, %r12d 1754; AVX2-NEXT: vpextrw $4, %xmm1, (%rsp,%r12,2) 1755; AVX2-NEXT: andl $15, %r13d 1756; AVX2-NEXT: vpextrw $5, %xmm1, (%rsp,%r13,2) 1757; AVX2-NEXT: andl $15, %edx 1758; AVX2-NEXT: vpextrw $6, %xmm1, (%rsp,%rdx,2) 1759; AVX2-NEXT: andl $15, %ecx 1760; AVX2-NEXT: vpextrw $7, %xmm1, (%rsp,%rcx,2) 1761; AVX2-NEXT: cmpq $15, %rdi 1762; AVX2-NEXT: movl $15, %eax 1763; AVX2-NEXT: cmovbq %rdi, %rax 1764; AVX2-NEXT: movl %eax, %eax 1765; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload 1766; AVX2-NEXT: movw %cx, (%rsp,%rax,2) 1767; AVX2-NEXT: vmovaps (%rsp), %ymm0 1768; AVX2-NEXT: leaq -40(%rbp), %rsp 1769; AVX2-NEXT: popq %rbx 1770; AVX2-NEXT: popq %r12 1771; AVX2-NEXT: popq %r13 1772; AVX2-NEXT: popq %r14 1773; AVX2-NEXT: popq %r15 1774; AVX2-NEXT: popq %rbp 1775; AVX2-NEXT: retq 1776; 1777; AVX512F-LABEL: test_compress_v16i16: 1778; AVX512F: # %bb.0: 1779; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 1780; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 1781; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 1782; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1783; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 1784; AVX512F-NEXT: vpcompressd %zmm0, %zmm1 {%k1} 1785; AVX512F-NEXT: vpmovdw %zmm1, %ymm0 1786; AVX512F-NEXT: retq 1787; 1788; AVX512VL-LABEL: test_compress_v16i16: 1789; AVX512VL: # %bb.0: 1790; AVX512VL-NEXT: vpsllw $7, %xmm1, %xmm1 1791; AVX512VL-NEXT: vpmovb2m %xmm1, %k1 1792; AVX512VL-NEXT: vpcompressw %ymm0, %ymm2 {%k1} 1793; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 1794; AVX512VL-NEXT: retq 1795 %out = call <16 x i16> @llvm.experimental.vector.compress(<16 x i16> %vec, <16 x i1> %mask, <16 x i16> %passthru) 1796 ret <16 x i16> %out 1797} 1798 1799define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> %passthru) nounwind { 1800; AVX2-LABEL: test_compress_v64i8: 1801; AVX2: # %bb.0: 1802; AVX2-NEXT: pushq %rbp 1803; AVX2-NEXT: movq %rsp, %rbp 1804; AVX2-NEXT: pushq %r15 1805; AVX2-NEXT: pushq %r14 1806; AVX2-NEXT: pushq %r13 1807; AVX2-NEXT: pushq %r12 1808; AVX2-NEXT: pushq %rbx 1809; AVX2-NEXT: andq $-32, %rsp 1810; AVX2-NEXT: subq $128, %rsp 1811; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 1812; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1813; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 1814; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1815; AVX2-NEXT: movl %ecx, %r13d 1816; AVX2-NEXT: movl %edx, %r15d 1817; AVX2-NEXT: movl %esi, %ebx 1818; AVX2-NEXT: # kill: def $edi killed $edi def $rdi 1819; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1820; AVX2-NEXT: movl 360(%rbp), %eax 1821; AVX2-NEXT: movl 352(%rbp), %ecx 1822; AVX2-NEXT: vmovd %ecx, %xmm4 1823; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 1824; AVX2-NEXT: movl 368(%rbp), %eax 1825; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 1826; AVX2-NEXT: movl 376(%rbp), %eax 1827; AVX2-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 
1828; AVX2-NEXT: movl 384(%rbp), %eax 1829; AVX2-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 1830; AVX2-NEXT: movl 392(%rbp), %eax 1831; AVX2-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 1832; AVX2-NEXT: movl 400(%rbp), %eax 1833; AVX2-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 1834; AVX2-NEXT: movl 408(%rbp), %eax 1835; AVX2-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 1836; AVX2-NEXT: movl 416(%rbp), %eax 1837; AVX2-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 1838; AVX2-NEXT: movl 424(%rbp), %eax 1839; AVX2-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 1840; AVX2-NEXT: movl 432(%rbp), %eax 1841; AVX2-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 1842; AVX2-NEXT: movl 440(%rbp), %eax 1843; AVX2-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 1844; AVX2-NEXT: movl 448(%rbp), %eax 1845; AVX2-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 1846; AVX2-NEXT: movl 456(%rbp), %eax 1847; AVX2-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 1848; AVX2-NEXT: movl 464(%rbp), %eax 1849; AVX2-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 1850; AVX2-NEXT: movl 472(%rbp), %eax 1851; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 1852; AVX2-NEXT: movl 224(%rbp), %eax 1853; AVX2-NEXT: vmovd %eax, %xmm5 1854; AVX2-NEXT: movl 232(%rbp), %eax 1855; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 1856; AVX2-NEXT: movl 240(%rbp), %eax 1857; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 1858; AVX2-NEXT: movl 248(%rbp), %eax 1859; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 1860; AVX2-NEXT: movl 256(%rbp), %eax 1861; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 1862; AVX2-NEXT: movl 264(%rbp), %eax 1863; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 1864; AVX2-NEXT: movl 272(%rbp), %eax 1865; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 1866; AVX2-NEXT: movl 280(%rbp), %eax 1867; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 1868; AVX2-NEXT: movl 288(%rbp), %eax 1869; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 1870; AVX2-NEXT: movl 296(%rbp), %eax 1871; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 1872; AVX2-NEXT: movl 304(%rbp), %eax 1873; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 1874; AVX2-NEXT: movl 
312(%rbp), %eax 1875; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 1876; AVX2-NEXT: movl 320(%rbp), %eax 1877; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 1878; AVX2-NEXT: movl 328(%rbp), %eax 1879; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 1880; AVX2-NEXT: movl 336(%rbp), %eax 1881; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 1882; AVX2-NEXT: movl 344(%rbp), %eax 1883; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 1884; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 1885; AVX2-NEXT: movl 96(%rbp), %eax 1886; AVX2-NEXT: vmovd %eax, %xmm5 1887; AVX2-NEXT: movl 104(%rbp), %eax 1888; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 1889; AVX2-NEXT: movl 112(%rbp), %eax 1890; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 1891; AVX2-NEXT: movl 120(%rbp), %eax 1892; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 1893; AVX2-NEXT: movl 128(%rbp), %eax 1894; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 1895; AVX2-NEXT: movl 136(%rbp), %eax 1896; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 1897; AVX2-NEXT: movl 144(%rbp), %eax 1898; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 1899; AVX2-NEXT: movl 152(%rbp), %eax 1900; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 1901; AVX2-NEXT: movl 160(%rbp), %eax 1902; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 1903; AVX2-NEXT: movl 168(%rbp), %eax 1904; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 1905; AVX2-NEXT: movl 176(%rbp), %eax 1906; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 1907; AVX2-NEXT: movl 184(%rbp), %eax 1908; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 1909; AVX2-NEXT: movl 192(%rbp), %eax 1910; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 1911; AVX2-NEXT: movl 200(%rbp), %eax 1912; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 1913; AVX2-NEXT: movl 208(%rbp), %eax 1914; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 1915; AVX2-NEXT: movl 216(%rbp), %eax 1916; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 1917; AVX2-NEXT: vmovd %edi, %xmm6 1918; AVX2-NEXT: vpinsrb $1, %esi, %xmm6, %xmm6 1919; AVX2-NEXT: vpinsrb $2, %edx, %xmm6, %xmm6 1920; AVX2-NEXT: vpinsrb $3, 
%r13d, %xmm6, %xmm6 1921; AVX2-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6 1922; AVX2-NEXT: vpinsrb $5, %r9d, %xmm6, %xmm6 1923; AVX2-NEXT: movl 16(%rbp), %esi 1924; AVX2-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6 1925; AVX2-NEXT: movl 24(%rbp), %edi 1926; AVX2-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6 1927; AVX2-NEXT: movl 32(%rbp), %r8d 1928; AVX2-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6 1929; AVX2-NEXT: movl 40(%rbp), %r9d 1930; AVX2-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6 1931; AVX2-NEXT: movl 48(%rbp), %r10d 1932; AVX2-NEXT: vpinsrb $10, %r10d, %xmm6, %xmm6 1933; AVX2-NEXT: movl 56(%rbp), %r11d 1934; AVX2-NEXT: vpinsrb $11, %r11d, %xmm6, %xmm6 1935; AVX2-NEXT: movl 64(%rbp), %r14d 1936; AVX2-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6 1937; AVX2-NEXT: movl 72(%rbp), %r12d 1938; AVX2-NEXT: vpinsrb $13, %r12d, %xmm6, %xmm6 1939; AVX2-NEXT: movl 80(%rbp), %eax 1940; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 1941; AVX2-NEXT: movl 88(%rbp), %eax 1942; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6 1943; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 1944; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1945; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5 1946; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm4 1947; AVX2-NEXT: vpaddb %ymm4, %ymm5, %ymm4 1948; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 1949; AVX2-NEXT: vpaddb %xmm5, %xmm4, %xmm4 1950; AVX2-NEXT: vpextrb $1, %xmm4, %eax 1951; AVX2-NEXT: vmovd %xmm4, %ecx 1952; AVX2-NEXT: addb %al, %cl 1953; AVX2-NEXT: vpextrb $2, %xmm4, %edx 1954; AVX2-NEXT: vpextrb $3, %xmm4, %eax 1955; AVX2-NEXT: addb %dl, %al 1956; AVX2-NEXT: addb %cl, %al 1957; AVX2-NEXT: vpextrb $4, %xmm4, %ecx 1958; AVX2-NEXT: vpextrb $5, %xmm4, %edx 1959; AVX2-NEXT: addb %cl, %dl 1960; AVX2-NEXT: vpextrb $6, %xmm4, %ecx 1961; AVX2-NEXT: addb %dl, %cl 1962; AVX2-NEXT: addb %al, %cl 1963; AVX2-NEXT: vpextrb $7, %xmm4, %eax 1964; AVX2-NEXT: vpextrb $8, %xmm4, %edx 1965; AVX2-NEXT: addb %al, %dl 1966; AVX2-NEXT: vpextrb $9, %xmm4, %eax 1967; 
AVX2-NEXT: addb %dl, %al 1968; AVX2-NEXT: vpextrb $10, %xmm4, %edx 1969; AVX2-NEXT: addb %al, %dl 1970; AVX2-NEXT: addb %cl, %dl 1971; AVX2-NEXT: vpextrb $11, %xmm4, %eax 1972; AVX2-NEXT: vpextrb $12, %xmm4, %ecx 1973; AVX2-NEXT: addb %al, %cl 1974; AVX2-NEXT: vpextrb $13, %xmm4, %eax 1975; AVX2-NEXT: addb %cl, %al 1976; AVX2-NEXT: vpextrb $14, %xmm4, %ecx 1977; AVX2-NEXT: addb %al, %cl 1978; AVX2-NEXT: vpextrb $15, %xmm4, %eax 1979; AVX2-NEXT: addb %cl, %al 1980; AVX2-NEXT: addb %dl, %al 1981; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) 1982; AVX2-NEXT: vmovaps %ymm2, (%rsp) 1983; AVX2-NEXT: movzbl %al, %eax 1984; AVX2-NEXT: andl $63, %eax 1985; AVX2-NEXT: movzbl (%rsp,%rax), %eax 1986; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1987; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp) 1988; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 1989; AVX2-NEXT: andl $1, %eax 1990; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) 1991; AVX2-NEXT: andl $1, %ebx 1992; AVX2-NEXT: addq %rax, %rbx 1993; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rbx) 1994; AVX2-NEXT: andl $1, %r15d 1995; AVX2-NEXT: addq %rbx, %r15 1996; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r15) 1997; AVX2-NEXT: andl $1, %r13d 1998; AVX2-NEXT: addq %r15, %r13 1999; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r13) 2000; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload 2001; AVX2-NEXT: andl $1, %ecx 2002; AVX2-NEXT: addq %r13, %rcx 2003; AVX2-NEXT: movl %ecx, %eax 2004; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) 2005; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 2006; AVX2-NEXT: andl $1, %eax 2007; AVX2-NEXT: addq %rcx, %rax 2008; AVX2-NEXT: andl $1, %esi 2009; AVX2-NEXT: addq %rax, %rsi 2010; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 2011; AVX2-NEXT: andl $63, %eax 2012; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) 2013; AVX2-NEXT: andl $1, %edi 2014; AVX2-NEXT: addq %rsi, %rdi 2015; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi 2016; AVX2-NEXT: andl 
$63, %esi 2017; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rsi) 2018; AVX2-NEXT: andl $1, %r8d 2019; AVX2-NEXT: addq %rdi, %r8 2020; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi 2021; AVX2-NEXT: andl $63, %edi 2022; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdi) 2023; AVX2-NEXT: andl $1, %r9d 2024; AVX2-NEXT: addq %r8, %r9 2025; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8 2026; AVX2-NEXT: andl $63, %r8d 2027; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%r8) 2028; AVX2-NEXT: andl $1, %r10d 2029; AVX2-NEXT: addq %r9, %r10 2030; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9 2031; AVX2-NEXT: andl $63, %r9d 2032; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%r9) 2033; AVX2-NEXT: andl $1, %r11d 2034; AVX2-NEXT: addq %r10, %r11 2035; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10 2036; AVX2-NEXT: andl $63, %r10d 2037; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%r10) 2038; AVX2-NEXT: andl $1, %r14d 2039; AVX2-NEXT: addq %r11, %r14 2040; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11 2041; AVX2-NEXT: andl $63, %r11d 2042; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%r11) 2043; AVX2-NEXT: andl $1, %r12d 2044; AVX2-NEXT: addq %r14, %r12 2045; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14 2046; AVX2-NEXT: andl $63, %r14d 2047; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%r14) 2048; AVX2-NEXT: movl 80(%rbp), %eax 2049; AVX2-NEXT: andl $1, %eax 2050; AVX2-NEXT: addq %r12, %rax 2051; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12 2052; AVX2-NEXT: andl $63, %r12d 2053; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%r12) 2054; AVX2-NEXT: movl 88(%rbp), %ecx 2055; AVX2-NEXT: andl $1, %ecx 2056; AVX2-NEXT: addq %rax, %rcx 2057; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 2058; AVX2-NEXT: andl $63, %eax 2059; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) 2060; AVX2-NEXT: movl 96(%rbp), %edx 2061; AVX2-NEXT: andl $1, %edx 2062; AVX2-NEXT: addq %rcx, %rdx 2063; AVX2-NEXT: movl %ecx, %eax 2064; AVX2-NEXT: andl $63, %eax 
2065; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 2066; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax) 2067; AVX2-NEXT: movl 104(%rbp), %ecx 2068; AVX2-NEXT: andl $1, %ecx 2069; AVX2-NEXT: addq %rdx, %rcx 2070; AVX2-NEXT: movl %edx, %eax 2071; AVX2-NEXT: andl $63, %eax 2072; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) 2073; AVX2-NEXT: movl 112(%rbp), %edx 2074; AVX2-NEXT: andl $1, %edx 2075; AVX2-NEXT: addq %rcx, %rdx 2076; AVX2-NEXT: movl %ecx, %eax 2077; AVX2-NEXT: andl $63, %eax 2078; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) 2079; AVX2-NEXT: movl 120(%rbp), %ecx 2080; AVX2-NEXT: andl $1, %ecx 2081; AVX2-NEXT: addq %rdx, %rcx 2082; AVX2-NEXT: movl %edx, %eax 2083; AVX2-NEXT: andl $63, %eax 2084; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) 2085; AVX2-NEXT: movl 128(%rbp), %edx 2086; AVX2-NEXT: andl $1, %edx 2087; AVX2-NEXT: addq %rcx, %rdx 2088; AVX2-NEXT: movl %ecx, %eax 2089; AVX2-NEXT: andl $63, %eax 2090; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) 2091; AVX2-NEXT: movl 136(%rbp), %ecx 2092; AVX2-NEXT: andl $1, %ecx 2093; AVX2-NEXT: addq %rdx, %rcx 2094; AVX2-NEXT: movl %edx, %eax 2095; AVX2-NEXT: andl $63, %eax 2096; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) 2097; AVX2-NEXT: movl 144(%rbp), %edx 2098; AVX2-NEXT: andl $1, %edx 2099; AVX2-NEXT: addq %rcx, %rdx 2100; AVX2-NEXT: movl %ecx, %eax 2101; AVX2-NEXT: andl $63, %eax 2102; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) 2103; AVX2-NEXT: movl 152(%rbp), %ecx 2104; AVX2-NEXT: andl $1, %ecx 2105; AVX2-NEXT: addq %rdx, %rcx 2106; AVX2-NEXT: movl %edx, %eax 2107; AVX2-NEXT: andl $63, %eax 2108; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) 2109; AVX2-NEXT: movl 160(%rbp), %edx 2110; AVX2-NEXT: andl $1, %edx 2111; AVX2-NEXT: addq %rcx, %rdx 2112; AVX2-NEXT: movl %ecx, %eax 2113; AVX2-NEXT: andl $63, %eax 2114; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) 2115; AVX2-NEXT: movl 168(%rbp), %ecx 2116; AVX2-NEXT: andl $1, %ecx 2117; AVX2-NEXT: addq %rdx, %rcx 2118; AVX2-NEXT: movl %edx, %eax 2119; AVX2-NEXT: andl $63, %eax 2120; AVX2-NEXT: 
vpextrb $9, %xmm0, (%rsp,%rax) 2121; AVX2-NEXT: movl 176(%rbp), %edx 2122; AVX2-NEXT: andl $1, %edx 2123; AVX2-NEXT: addq %rcx, %rdx 2124; AVX2-NEXT: movl %ecx, %eax 2125; AVX2-NEXT: andl $63, %eax 2126; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) 2127; AVX2-NEXT: movl 184(%rbp), %ecx 2128; AVX2-NEXT: andl $1, %ecx 2129; AVX2-NEXT: addq %rdx, %rcx 2130; AVX2-NEXT: movl %edx, %eax 2131; AVX2-NEXT: andl $63, %eax 2132; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) 2133; AVX2-NEXT: movl 192(%rbp), %edx 2134; AVX2-NEXT: andl $1, %edx 2135; AVX2-NEXT: addq %rcx, %rdx 2136; AVX2-NEXT: movl %ecx, %eax 2137; AVX2-NEXT: andl $63, %eax 2138; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) 2139; AVX2-NEXT: movl 200(%rbp), %ecx 2140; AVX2-NEXT: andl $1, %ecx 2141; AVX2-NEXT: addq %rdx, %rcx 2142; AVX2-NEXT: movl %edx, %eax 2143; AVX2-NEXT: andl $63, %eax 2144; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) 2145; AVX2-NEXT: movl 208(%rbp), %edx 2146; AVX2-NEXT: andl $1, %edx 2147; AVX2-NEXT: addq %rcx, %rdx 2148; AVX2-NEXT: movl %ecx, %eax 2149; AVX2-NEXT: andl $63, %eax 2150; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) 2151; AVX2-NEXT: movl 216(%rbp), %ecx 2152; AVX2-NEXT: andl $1, %ecx 2153; AVX2-NEXT: addq %rdx, %rcx 2154; AVX2-NEXT: movl %edx, %eax 2155; AVX2-NEXT: andl $63, %eax 2156; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) 2157; AVX2-NEXT: movl 224(%rbp), %edx 2158; AVX2-NEXT: andl $1, %edx 2159; AVX2-NEXT: addq %rcx, %rdx 2160; AVX2-NEXT: movl %ecx, %eax 2161; AVX2-NEXT: andl $63, %eax 2162; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rax) 2163; AVX2-NEXT: movl 232(%rbp), %ecx 2164; AVX2-NEXT: andl $1, %ecx 2165; AVX2-NEXT: addq %rdx, %rcx 2166; AVX2-NEXT: movl %edx, %eax 2167; AVX2-NEXT: andl $63, %eax 2168; AVX2-NEXT: vpextrb $1, %xmm1, (%rsp,%rax) 2169; AVX2-NEXT: movl 240(%rbp), %edx 2170; AVX2-NEXT: andl $1, %edx 2171; AVX2-NEXT: addq %rcx, %rdx 2172; AVX2-NEXT: movl %ecx, %eax 2173; AVX2-NEXT: andl $63, %eax 2174; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rax) 2175; AVX2-NEXT: movl 
248(%rbp), %ecx 2176; AVX2-NEXT: andl $1, %ecx 2177; AVX2-NEXT: addq %rdx, %rcx 2178; AVX2-NEXT: movl %edx, %eax 2179; AVX2-NEXT: andl $63, %eax 2180; AVX2-NEXT: vpextrb $3, %xmm1, (%rsp,%rax) 2181; AVX2-NEXT: movl 256(%rbp), %edx 2182; AVX2-NEXT: andl $1, %edx 2183; AVX2-NEXT: addq %rcx, %rdx 2184; AVX2-NEXT: movl %ecx, %eax 2185; AVX2-NEXT: andl $63, %eax 2186; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rax) 2187; AVX2-NEXT: movl 264(%rbp), %ecx 2188; AVX2-NEXT: andl $1, %ecx 2189; AVX2-NEXT: addq %rdx, %rcx 2190; AVX2-NEXT: movl %edx, %eax 2191; AVX2-NEXT: andl $63, %eax 2192; AVX2-NEXT: vpextrb $5, %xmm1, (%rsp,%rax) 2193; AVX2-NEXT: movl 272(%rbp), %edx 2194; AVX2-NEXT: andl $1, %edx 2195; AVX2-NEXT: addq %rcx, %rdx 2196; AVX2-NEXT: movl %ecx, %eax 2197; AVX2-NEXT: andl $63, %eax 2198; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rax) 2199; AVX2-NEXT: movl 280(%rbp), %ecx 2200; AVX2-NEXT: andl $1, %ecx 2201; AVX2-NEXT: addq %rdx, %rcx 2202; AVX2-NEXT: movl %edx, %eax 2203; AVX2-NEXT: andl $63, %eax 2204; AVX2-NEXT: vpextrb $7, %xmm1, (%rsp,%rax) 2205; AVX2-NEXT: movl 288(%rbp), %edx 2206; AVX2-NEXT: andl $1, %edx 2207; AVX2-NEXT: addq %rcx, %rdx 2208; AVX2-NEXT: movl %ecx, %eax 2209; AVX2-NEXT: andl $63, %eax 2210; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rax) 2211; AVX2-NEXT: movl 296(%rbp), %ecx 2212; AVX2-NEXT: andl $1, %ecx 2213; AVX2-NEXT: addq %rdx, %rcx 2214; AVX2-NEXT: movl %edx, %eax 2215; AVX2-NEXT: andl $63, %eax 2216; AVX2-NEXT: vpextrb $9, %xmm1, (%rsp,%rax) 2217; AVX2-NEXT: movl 304(%rbp), %edx 2218; AVX2-NEXT: andl $1, %edx 2219; AVX2-NEXT: addq %rcx, %rdx 2220; AVX2-NEXT: movl %ecx, %eax 2221; AVX2-NEXT: andl $63, %eax 2222; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rax) 2223; AVX2-NEXT: movl 312(%rbp), %ecx 2224; AVX2-NEXT: andl $1, %ecx 2225; AVX2-NEXT: addq %rdx, %rcx 2226; AVX2-NEXT: movl %edx, %eax 2227; AVX2-NEXT: andl $63, %eax 2228; AVX2-NEXT: vpextrb $11, %xmm1, (%rsp,%rax) 2229; AVX2-NEXT: movl 320(%rbp), %edx 2230; AVX2-NEXT: andl $1, %edx 2231; 
AVX2-NEXT: addq %rcx, %rdx 2232; AVX2-NEXT: movl %ecx, %eax 2233; AVX2-NEXT: andl $63, %eax 2234; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rax) 2235; AVX2-NEXT: movl 328(%rbp), %ecx 2236; AVX2-NEXT: andl $1, %ecx 2237; AVX2-NEXT: addq %rdx, %rcx 2238; AVX2-NEXT: movl %edx, %eax 2239; AVX2-NEXT: andl $63, %eax 2240; AVX2-NEXT: vpextrb $13, %xmm1, (%rsp,%rax) 2241; AVX2-NEXT: movl 336(%rbp), %edx 2242; AVX2-NEXT: andl $1, %edx 2243; AVX2-NEXT: addq %rcx, %rdx 2244; AVX2-NEXT: movl %ecx, %eax 2245; AVX2-NEXT: andl $63, %eax 2246; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rax) 2247; AVX2-NEXT: movl 344(%rbp), %ecx 2248; AVX2-NEXT: andl $1, %ecx 2249; AVX2-NEXT: addq %rdx, %rcx 2250; AVX2-NEXT: movl %edx, %eax 2251; AVX2-NEXT: andl $63, %eax 2252; AVX2-NEXT: vpextrb $15, %xmm1, (%rsp,%rax) 2253; AVX2-NEXT: movl 352(%rbp), %edx 2254; AVX2-NEXT: andl $1, %edx 2255; AVX2-NEXT: addq %rcx, %rdx 2256; AVX2-NEXT: movl %ecx, %eax 2257; AVX2-NEXT: andl $63, %eax 2258; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 2259; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax) 2260; AVX2-NEXT: movl 360(%rbp), %ecx 2261; AVX2-NEXT: andl $1, %ecx 2262; AVX2-NEXT: addq %rdx, %rcx 2263; AVX2-NEXT: movl %edx, %eax 2264; AVX2-NEXT: andl $63, %eax 2265; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) 2266; AVX2-NEXT: movl 368(%rbp), %edx 2267; AVX2-NEXT: andl $1, %edx 2268; AVX2-NEXT: addq %rcx, %rdx 2269; AVX2-NEXT: movl %ecx, %eax 2270; AVX2-NEXT: andl $63, %eax 2271; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) 2272; AVX2-NEXT: movl 376(%rbp), %ecx 2273; AVX2-NEXT: andl $1, %ecx 2274; AVX2-NEXT: addq %rdx, %rcx 2275; AVX2-NEXT: movl %edx, %eax 2276; AVX2-NEXT: andl $63, %eax 2277; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) 2278; AVX2-NEXT: movl 384(%rbp), %edx 2279; AVX2-NEXT: andl $1, %edx 2280; AVX2-NEXT: addq %rcx, %rdx 2281; AVX2-NEXT: movl %ecx, %eax 2282; AVX2-NEXT: andl $63, %eax 2283; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) 2284; AVX2-NEXT: movl 392(%rbp), %ecx 2285; AVX2-NEXT: andl $1, %ecx 2286; AVX2-NEXT: 
addq %rdx, %rcx 2287; AVX2-NEXT: movl %edx, %eax 2288; AVX2-NEXT: andl $63, %eax 2289; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) 2290; AVX2-NEXT: movl 400(%rbp), %edx 2291; AVX2-NEXT: andl $1, %edx 2292; AVX2-NEXT: addq %rcx, %rdx 2293; AVX2-NEXT: movl %ecx, %eax 2294; AVX2-NEXT: andl $63, %eax 2295; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) 2296; AVX2-NEXT: movl 408(%rbp), %ecx 2297; AVX2-NEXT: andl $1, %ecx 2298; AVX2-NEXT: addq %rdx, %rcx 2299; AVX2-NEXT: movl %edx, %eax 2300; AVX2-NEXT: andl $63, %eax 2301; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) 2302; AVX2-NEXT: movl 416(%rbp), %edx 2303; AVX2-NEXT: andl $1, %edx 2304; AVX2-NEXT: addq %rcx, %rdx 2305; AVX2-NEXT: movl %ecx, %eax 2306; AVX2-NEXT: andl $63, %eax 2307; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) 2308; AVX2-NEXT: movl 424(%rbp), %ecx 2309; AVX2-NEXT: andl $1, %ecx 2310; AVX2-NEXT: addq %rdx, %rcx 2311; AVX2-NEXT: movl %edx, %eax 2312; AVX2-NEXT: andl $63, %eax 2313; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) 2314; AVX2-NEXT: movl 432(%rbp), %edx 2315; AVX2-NEXT: andl $1, %edx 2316; AVX2-NEXT: addq %rcx, %rdx 2317; AVX2-NEXT: movl %ecx, %eax 2318; AVX2-NEXT: andl $63, %eax 2319; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) 2320; AVX2-NEXT: movl 440(%rbp), %ecx 2321; AVX2-NEXT: andl $1, %ecx 2322; AVX2-NEXT: addq %rdx, %rcx 2323; AVX2-NEXT: movl %edx, %eax 2324; AVX2-NEXT: andl $63, %eax 2325; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) 2326; AVX2-NEXT: movl 448(%rbp), %edx 2327; AVX2-NEXT: andl $1, %edx 2328; AVX2-NEXT: addq %rcx, %rdx 2329; AVX2-NEXT: movl %ecx, %eax 2330; AVX2-NEXT: andl $63, %eax 2331; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) 2332; AVX2-NEXT: movl 456(%rbp), %ecx 2333; AVX2-NEXT: andl $1, %ecx 2334; AVX2-NEXT: addq %rdx, %rcx 2335; AVX2-NEXT: movl %edx, %eax 2336; AVX2-NEXT: andl $63, %eax 2337; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) 2338; AVX2-NEXT: movl 464(%rbp), %edx 2339; AVX2-NEXT: andl $1, %edx 2340; AVX2-NEXT: addq %rcx, %rdx 2341; AVX2-NEXT: movl %ecx, %eax 2342; 
AVX2-NEXT: andl $63, %eax 2343; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) 2344; AVX2-NEXT: movl 472(%rbp), %ecx 2345; AVX2-NEXT: andl $1, %ecx 2346; AVX2-NEXT: addq %rdx, %rcx 2347; AVX2-NEXT: movl %edx, %eax 2348; AVX2-NEXT: andl $63, %eax 2349; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) 2350; AVX2-NEXT: vpextrb $15, %xmm0, %eax 2351; AVX2-NEXT: cmpq $64, %rcx 2352; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload 2353; AVX2-NEXT: cmpq $63, %rcx 2354; AVX2-NEXT: movq %rcx, %rdx 2355; AVX2-NEXT: movl $63, %ecx 2356; AVX2-NEXT: cmovbq %rdx, %rcx 2357; AVX2-NEXT: movb %al, (%rsp,%rcx) 2358; AVX2-NEXT: vmovaps (%rsp), %ymm0 2359; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 2360; AVX2-NEXT: leaq -40(%rbp), %rsp 2361; AVX2-NEXT: popq %rbx 2362; AVX2-NEXT: popq %r12 2363; AVX2-NEXT: popq %r13 2364; AVX2-NEXT: popq %r14 2365; AVX2-NEXT: popq %r15 2366; AVX2-NEXT: popq %rbp 2367; AVX2-NEXT: retq 2368; 2369; AVX512F-LABEL: test_compress_v64i8: 2370; AVX512F: # %bb.0: 2371; AVX512F-NEXT: pushq %rbp 2372; AVX512F-NEXT: movq %rsp, %rbp 2373; AVX512F-NEXT: andq $-64, %rsp 2374; AVX512F-NEXT: subq $256, %rsp # imm = 0x100 2375; AVX512F-NEXT: movzbl 352(%rbp), %eax 2376; AVX512F-NEXT: andl $1, %eax 2377; AVX512F-NEXT: kmovw %eax, %k0 2378; AVX512F-NEXT: movzbl 360(%rbp), %eax 2379; AVX512F-NEXT: kmovw %eax, %k1 2380; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2381; AVX512F-NEXT: kshiftrw $14, %k1, %k1 2382; AVX512F-NEXT: korw %k1, %k0, %k0 2383; AVX512F-NEXT: movw $-5, %ax 2384; AVX512F-NEXT: kmovw %eax, %k1 2385; AVX512F-NEXT: kandw %k1, %k0, %k0 2386; AVX512F-NEXT: kmovw %k1, %k3 2387; AVX512F-NEXT: movzbl 368(%rbp), %eax 2388; AVX512F-NEXT: kmovw %eax, %k1 2389; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2390; AVX512F-NEXT: kshiftrw $13, %k1, %k1 2391; AVX512F-NEXT: korw %k1, %k0, %k0 2392; AVX512F-NEXT: movw $-9, %ax 2393; AVX512F-NEXT: kmovw %eax, %k7 2394; AVX512F-NEXT: kandw %k7, %k0, %k0 2395; AVX512F-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 
2396; AVX512F-NEXT: movzbl 376(%rbp), %eax 2397; AVX512F-NEXT: kmovw %eax, %k1 2398; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2399; AVX512F-NEXT: kshiftrw $12, %k1, %k1 2400; AVX512F-NEXT: korw %k1, %k0, %k0 2401; AVX512F-NEXT: movw $-17, %ax 2402; AVX512F-NEXT: kmovw %eax, %k5 2403; AVX512F-NEXT: kandw %k5, %k0, %k0 2404; AVX512F-NEXT: movzbl 384(%rbp), %eax 2405; AVX512F-NEXT: kmovw %eax, %k1 2406; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2407; AVX512F-NEXT: kshiftrw $11, %k1, %k1 2408; AVX512F-NEXT: korw %k1, %k0, %k0 2409; AVX512F-NEXT: movw $-33, %ax 2410; AVX512F-NEXT: kmovw %eax, %k6 2411; AVX512F-NEXT: kandw %k6, %k0, %k0 2412; AVX512F-NEXT: movzbl 392(%rbp), %eax 2413; AVX512F-NEXT: kmovw %eax, %k1 2414; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2415; AVX512F-NEXT: kshiftrw $10, %k1, %k1 2416; AVX512F-NEXT: korw %k1, %k0, %k0 2417; AVX512F-NEXT: movw $-65, %ax 2418; AVX512F-NEXT: kmovw %eax, %k1 2419; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2420; AVX512F-NEXT: kandw %k1, %k0, %k0 2421; AVX512F-NEXT: movzbl 400(%rbp), %eax 2422; AVX512F-NEXT: kmovw %eax, %k1 2423; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2424; AVX512F-NEXT: kshiftrw $9, %k1, %k1 2425; AVX512F-NEXT: korw %k1, %k0, %k0 2426; AVX512F-NEXT: movw $-129, %ax 2427; AVX512F-NEXT: kmovw %eax, %k1 2428; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2429; AVX512F-NEXT: kandw %k1, %k0, %k0 2430; AVX512F-NEXT: movzbl 408(%rbp), %eax 2431; AVX512F-NEXT: kmovw %eax, %k1 2432; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2433; AVX512F-NEXT: kshiftrw $8, %k1, %k1 2434; AVX512F-NEXT: korw %k1, %k0, %k0 2435; AVX512F-NEXT: movw $-257, %ax # imm = 0xFEFF 2436; AVX512F-NEXT: kmovw %eax, %k1 2437; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2438; AVX512F-NEXT: kandw %k1, %k0, %k0 2439; AVX512F-NEXT: movzbl 416(%rbp), %eax 2440; AVX512F-NEXT: kmovw %eax, %k1 2441; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2442; AVX512F-NEXT: kshiftrw $7, %k1, %k1 2443; AVX512F-NEXT: korw %k1, 
%k0, %k0 2444; AVX512F-NEXT: movw $-513, %ax # imm = 0xFDFF 2445; AVX512F-NEXT: kmovw %eax, %k1 2446; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2447; AVX512F-NEXT: kandw %k1, %k0, %k0 2448; AVX512F-NEXT: movzbl 424(%rbp), %eax 2449; AVX512F-NEXT: kmovw %eax, %k1 2450; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2451; AVX512F-NEXT: kshiftrw $6, %k1, %k1 2452; AVX512F-NEXT: korw %k1, %k0, %k0 2453; AVX512F-NEXT: movw $-1025, %ax # imm = 0xFBFF 2454; AVX512F-NEXT: kmovw %eax, %k1 2455; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2456; AVX512F-NEXT: kandw %k1, %k0, %k0 2457; AVX512F-NEXT: movzbl 432(%rbp), %eax 2458; AVX512F-NEXT: kmovw %eax, %k1 2459; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2460; AVX512F-NEXT: kshiftrw $5, %k1, %k1 2461; AVX512F-NEXT: korw %k1, %k0, %k0 2462; AVX512F-NEXT: movw $-2049, %ax # imm = 0xF7FF 2463; AVX512F-NEXT: kmovw %eax, %k1 2464; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2465; AVX512F-NEXT: kandw %k1, %k0, %k0 2466; AVX512F-NEXT: movzbl 440(%rbp), %eax 2467; AVX512F-NEXT: kmovw %eax, %k1 2468; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2469; AVX512F-NEXT: kshiftrw $4, %k1, %k1 2470; AVX512F-NEXT: korw %k1, %k0, %k0 2471; AVX512F-NEXT: movw $-4097, %ax # imm = 0xEFFF 2472; AVX512F-NEXT: kmovw %eax, %k1 2473; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2474; AVX512F-NEXT: kandw %k1, %k0, %k0 2475; AVX512F-NEXT: movzbl 448(%rbp), %eax 2476; AVX512F-NEXT: kmovw %eax, %k1 2477; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2478; AVX512F-NEXT: kshiftrw $3, %k1, %k1 2479; AVX512F-NEXT: korw %k1, %k0, %k0 2480; AVX512F-NEXT: movw $-8193, %ax # imm = 0xDFFF 2481; AVX512F-NEXT: kmovw %eax, %k1 2482; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2483; AVX512F-NEXT: kandw %k1, %k0, %k0 2484; AVX512F-NEXT: movzbl 456(%rbp), %eax 2485; AVX512F-NEXT: kmovw %eax, %k1 2486; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2487; AVX512F-NEXT: kshiftrw $2, %k1, %k1 2488; AVX512F-NEXT: 
korw %k1, %k0, %k1 2489; AVX512F-NEXT: movw $-16385, %ax # imm = 0xBFFF 2490; AVX512F-NEXT: kmovw %eax, %k4 2491; AVX512F-NEXT: kandw %k4, %k1, %k1 2492; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2493; AVX512F-NEXT: movzbl 464(%rbp), %eax 2494; AVX512F-NEXT: kmovw %eax, %k2 2495; AVX512F-NEXT: kshiftlw $14, %k2, %k2 2496; AVX512F-NEXT: korw %k2, %k1, %k1 2497; AVX512F-NEXT: kshiftlw $1, %k1, %k1 2498; AVX512F-NEXT: kshiftrw $1, %k1, %k1 2499; AVX512F-NEXT: movzbl 472(%rbp), %eax 2500; AVX512F-NEXT: kmovw %eax, %k2 2501; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2502; AVX512F-NEXT: korw %k2, %k1, %k1 2503; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2504; AVX512F-NEXT: movzbl 224(%rbp), %eax 2505; AVX512F-NEXT: andl $1, %eax 2506; AVX512F-NEXT: movzbl 232(%rbp), %r10d 2507; AVX512F-NEXT: kmovw %r10d, %k1 2508; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2509; AVX512F-NEXT: kshiftrw $14, %k1, %k1 2510; AVX512F-NEXT: kmovw %eax, %k2 2511; AVX512F-NEXT: korw %k1, %k2, %k1 2512; AVX512F-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2513; AVX512F-NEXT: kandw %k3, %k1, %k1 2514; AVX512F-NEXT: movzbl 240(%rbp), %eax 2515; AVX512F-NEXT: kmovw %eax, %k2 2516; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2517; AVX512F-NEXT: kshiftrw $13, %k2, %k2 2518; AVX512F-NEXT: korw %k2, %k1, %k1 2519; AVX512F-NEXT: kandw %k7, %k1, %k1 2520; AVX512F-NEXT: movzbl 248(%rbp), %eax 2521; AVX512F-NEXT: kmovw %eax, %k2 2522; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2523; AVX512F-NEXT: kshiftrw $12, %k2, %k2 2524; AVX512F-NEXT: korw %k2, %k1, %k1 2525; AVX512F-NEXT: kandw %k5, %k1, %k1 2526; AVX512F-NEXT: movzbl 256(%rbp), %eax 2527; AVX512F-NEXT: kmovw %eax, %k2 2528; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2529; AVX512F-NEXT: kshiftrw $11, %k2, %k2 2530; AVX512F-NEXT: korw %k2, %k1, %k1 2531; AVX512F-NEXT: kandw %k6, %k1, %k1 2532; AVX512F-NEXT: movzbl 264(%rbp), %eax 2533; AVX512F-NEXT: kmovw %eax, %k2 2534; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2535; AVX512F-NEXT: 
kshiftrw $10, %k2, %k2 2536; AVX512F-NEXT: korw %k2, %k1, %k1 2537; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 2538; AVX512F-NEXT: kandw %k7, %k1, %k1 2539; AVX512F-NEXT: movzbl 272(%rbp), %eax 2540; AVX512F-NEXT: kmovw %eax, %k2 2541; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2542; AVX512F-NEXT: kshiftrw $9, %k2, %k2 2543; AVX512F-NEXT: korw %k2, %k1, %k0 2544; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2545; AVX512F-NEXT: movzbl 280(%rbp), %eax 2546; AVX512F-NEXT: kmovw %eax, %k1 2547; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2548; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2549; AVX512F-NEXT: kshiftrw $8, %k1, %k1 2550; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2551; AVX512F-NEXT: kandw %k2, %k0, %k2 2552; AVX512F-NEXT: korw %k1, %k2, %k1 2553; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 2554; AVX512F-NEXT: kandw %k0, %k1, %k1 2555; AVX512F-NEXT: movzbl 288(%rbp), %eax 2556; AVX512F-NEXT: kmovw %eax, %k0 2557; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2558; AVX512F-NEXT: kshiftlw $15, %k0, %k2 2559; AVX512F-NEXT: kshiftrw $7, %k2, %k2 2560; AVX512F-NEXT: korw %k2, %k1, %k1 2561; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 2562; AVX512F-NEXT: kandw %k0, %k1, %k1 2563; AVX512F-NEXT: movzbl 296(%rbp), %eax 2564; AVX512F-NEXT: kmovw %eax, %k2 2565; AVX512F-NEXT: kshiftlw $15, %k2, %k0 2566; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2567; AVX512F-NEXT: kshiftrw $6, %k0, %k2 2568; AVX512F-NEXT: korw %k2, %k1, %k1 2569; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 2570; AVX512F-NEXT: kandw %k0, %k1, %k1 2571; AVX512F-NEXT: movzbl 304(%rbp), %eax 2572; AVX512F-NEXT: kmovw %eax, %k2 2573; AVX512F-NEXT: kshiftlw $15, %k2, %k0 2574; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2575; AVX512F-NEXT: kshiftrw $5, %k0, %k2 2576; AVX512F-NEXT: korw %k2, %k1, %k1 2577; 
AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 2578; AVX512F-NEXT: kandw %k0, %k1, %k1 2579; AVX512F-NEXT: movzbl 312(%rbp), %eax 2580; AVX512F-NEXT: kmovw %eax, %k2 2581; AVX512F-NEXT: kshiftlw $15, %k2, %k0 2582; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2583; AVX512F-NEXT: kshiftrw $4, %k0, %k2 2584; AVX512F-NEXT: korw %k2, %k1, %k1 2585; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 2586; AVX512F-NEXT: kandw %k0, %k1, %k1 2587; AVX512F-NEXT: movzbl 320(%rbp), %eax 2588; AVX512F-NEXT: kmovw %eax, %k2 2589; AVX512F-NEXT: kshiftlw $15, %k2, %k0 2590; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2591; AVX512F-NEXT: kshiftrw $3, %k0, %k2 2592; AVX512F-NEXT: korw %k2, %k1, %k1 2593; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 2594; AVX512F-NEXT: kandw %k0, %k1, %k1 2595; AVX512F-NEXT: movzbl 328(%rbp), %eax 2596; AVX512F-NEXT: kmovw %eax, %k2 2597; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2598; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2599; AVX512F-NEXT: kshiftrw $2, %k2, %k2 2600; AVX512F-NEXT: korw %k2, %k1, %k1 2601; AVX512F-NEXT: kandw %k4, %k1, %k1 2602; AVX512F-NEXT: movzbl 336(%rbp), %eax 2603; AVX512F-NEXT: kmovw %eax, %k2 2604; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2605; AVX512F-NEXT: kshiftlw $14, %k2, %k2 2606; AVX512F-NEXT: korw %k2, %k1, %k1 2607; AVX512F-NEXT: kshiftlw $1, %k1, %k1 2608; AVX512F-NEXT: kshiftrw $1, %k1, %k1 2609; AVX512F-NEXT: movzbl 344(%rbp), %eax 2610; AVX512F-NEXT: kmovw %eax, %k2 2611; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2612; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2613; AVX512F-NEXT: korw %k2, %k1, %k1 2614; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2615; AVX512F-NEXT: movzbl 96(%rbp), %eax 2616; AVX512F-NEXT: andl $1, %eax 2617; AVX512F-NEXT: movzbl 104(%rbp), %r10d 2618; AVX512F-NEXT: kmovw %r10d, %k1 2619; AVX512F-NEXT: kshiftlw 
$15, %k1, %k1 2620; AVX512F-NEXT: kshiftrw $14, %k1, %k1 2621; AVX512F-NEXT: kmovw %eax, %k2 2622; AVX512F-NEXT: korw %k1, %k2, %k1 2623; AVX512F-NEXT: kandw %k3, %k1, %k1 2624; AVX512F-NEXT: movzbl 112(%rbp), %eax 2625; AVX512F-NEXT: kmovw %eax, %k2 2626; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2627; AVX512F-NEXT: kshiftrw $13, %k2, %k2 2628; AVX512F-NEXT: korw %k2, %k1, %k1 2629; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 2630; AVX512F-NEXT: kandw %k4, %k1, %k1 2631; AVX512F-NEXT: movzbl 120(%rbp), %eax 2632; AVX512F-NEXT: kmovw %eax, %k2 2633; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2634; AVX512F-NEXT: kshiftrw $12, %k2, %k2 2635; AVX512F-NEXT: korw %k2, %k1, %k1 2636; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2637; AVX512F-NEXT: kandw %k5, %k1, %k1 2638; AVX512F-NEXT: movzbl 128(%rbp), %eax 2639; AVX512F-NEXT: kmovw %eax, %k2 2640; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2641; AVX512F-NEXT: kshiftrw $11, %k2, %k2 2642; AVX512F-NEXT: korw %k2, %k1, %k1 2643; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2644; AVX512F-NEXT: kandw %k6, %k1, %k1 2645; AVX512F-NEXT: movzbl 136(%rbp), %eax 2646; AVX512F-NEXT: kmovw %eax, %k2 2647; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2648; AVX512F-NEXT: kshiftrw $10, %k2, %k2 2649; AVX512F-NEXT: korw %k2, %k1, %k1 2650; AVX512F-NEXT: kandw %k7, %k1, %k1 2651; AVX512F-NEXT: movzbl 144(%rbp), %eax 2652; AVX512F-NEXT: kmovw %eax, %k2 2653; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2654; AVX512F-NEXT: kshiftrw $9, %k2, %k2 2655; AVX512F-NEXT: korw %k2, %k1, %k1 2656; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2657; AVX512F-NEXT: kandw %k2, %k1, %k1 2658; AVX512F-NEXT: movzbl 152(%rbp), %eax 2659; AVX512F-NEXT: kmovw %eax, %k2 2660; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2661; AVX512F-NEXT: kshiftrw $8, %k2, %k2 2662; AVX512F-NEXT: korw %k2, %k1, %k1 2663; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 2664; AVX512F-NEXT: kandw %k3, %k1, %k1 2665; 
AVX512F-NEXT: movzbl 160(%rbp), %eax 2666; AVX512F-NEXT: kmovw %eax, %k2 2667; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2668; AVX512F-NEXT: kshiftrw $7, %k2, %k2 2669; AVX512F-NEXT: korw %k2, %k1, %k1 2670; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 2671; AVX512F-NEXT: kandw %k7, %k1, %k1 2672; AVX512F-NEXT: movzbl 168(%rbp), %eax 2673; AVX512F-NEXT: kmovw %eax, %k2 2674; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2675; AVX512F-NEXT: kshiftrw $6, %k2, %k2 2676; AVX512F-NEXT: korw %k2, %k1, %k1 2677; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2678; AVX512F-NEXT: kandw %k2, %k1, %k1 2679; AVX512F-NEXT: movzbl 176(%rbp), %eax 2680; AVX512F-NEXT: kmovw %eax, %k2 2681; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2682; AVX512F-NEXT: kshiftrw $5, %k2, %k2 2683; AVX512F-NEXT: korw %k2, %k1, %k1 2684; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2685; AVX512F-NEXT: kandw %k2, %k1, %k1 2686; AVX512F-NEXT: movzbl 184(%rbp), %eax 2687; AVX512F-NEXT: kmovw %eax, %k2 2688; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2689; AVX512F-NEXT: kshiftrw $4, %k2, %k2 2690; AVX512F-NEXT: korw %k2, %k1, %k1 2691; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2692; AVX512F-NEXT: kandw %k2, %k1, %k1 2693; AVX512F-NEXT: movzbl 192(%rbp), %eax 2694; AVX512F-NEXT: kmovw %eax, %k2 2695; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2696; AVX512F-NEXT: kshiftrw $3, %k2, %k2 2697; AVX512F-NEXT: korw %k2, %k1, %k1 2698; AVX512F-NEXT: kandw %k0, %k1, %k1 2699; AVX512F-NEXT: movzbl 200(%rbp), %eax 2700; AVX512F-NEXT: kmovw %eax, %k2 2701; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2702; AVX512F-NEXT: kshiftrw $2, %k2, %k2 2703; AVX512F-NEXT: korw %k2, %k1, %k1 2704; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2705; AVX512F-NEXT: kandw %k2, %k1, %k1 2706; AVX512F-NEXT: movzbl 208(%rbp), %eax 2707; AVX512F-NEXT: kmovw %eax, %k2 2708; AVX512F-NEXT: kshiftlw $14, %k2, %k2 2709; AVX512F-NEXT: korw %k2, %k1, %k1 2710; AVX512F-NEXT: 
kshiftlw $1, %k1, %k1 2711; AVX512F-NEXT: kshiftrw $1, %k1, %k1 2712; AVX512F-NEXT: movzbl 216(%rbp), %eax 2713; AVX512F-NEXT: kmovw %eax, %k2 2714; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2715; AVX512F-NEXT: korw %k2, %k1, %k1 2716; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2717; AVX512F-NEXT: andl $1, %edi 2718; AVX512F-NEXT: kmovw %esi, %k1 2719; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2720; AVX512F-NEXT: kshiftrw $14, %k1, %k1 2721; AVX512F-NEXT: kmovw %edi, %k2 2722; AVX512F-NEXT: korw %k1, %k2, %k1 2723; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2724; AVX512F-NEXT: kandw %k2, %k1, %k1 2725; AVX512F-NEXT: kmovw %edx, %k2 2726; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2727; AVX512F-NEXT: kshiftrw $13, %k2, %k2 2728; AVX512F-NEXT: korw %k2, %k1, %k1 2729; AVX512F-NEXT: kandw %k4, %k1, %k1 2730; AVX512F-NEXT: kmovw %ecx, %k2 2731; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2732; AVX512F-NEXT: kshiftrw $12, %k2, %k2 2733; AVX512F-NEXT: korw %k2, %k1, %k1 2734; AVX512F-NEXT: kandw %k5, %k1, %k1 2735; AVX512F-NEXT: kmovw %r8d, %k2 2736; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2737; AVX512F-NEXT: kshiftrw $11, %k2, %k2 2738; AVX512F-NEXT: korw %k2, %k1, %k1 2739; AVX512F-NEXT: kandw %k6, %k1, %k1 2740; AVX512F-NEXT: kmovw %r9d, %k2 2741; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2742; AVX512F-NEXT: kshiftrw $10, %k2, %k2 2743; AVX512F-NEXT: korw %k2, %k1, %k1 2744; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2745; AVX512F-NEXT: kandw %k2, %k1, %k1 2746; AVX512F-NEXT: movzbl 16(%rbp), %eax 2747; AVX512F-NEXT: kmovw %eax, %k2 2748; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2749; AVX512F-NEXT: kshiftrw $9, %k2, %k2 2750; AVX512F-NEXT: korw %k2, %k1, %k2 2751; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2752; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2753; AVX512F-NEXT: kandw %k1, %k2, %k1 2754; AVX512F-NEXT: movzbl 24(%rbp), %eax 2755; AVX512F-NEXT: kmovw %eax, %k2 2756; AVX512F-NEXT: 
kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2757; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2758; AVX512F-NEXT: kshiftrw $8, %k2, %k2 2759; AVX512F-NEXT: korw %k2, %k1, %k1 2760; AVX512F-NEXT: kandw %k3, %k1, %k1 2761; AVX512F-NEXT: movzbl 32(%rbp), %eax 2762; AVX512F-NEXT: kmovw %eax, %k2 2763; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2764; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2765; AVX512F-NEXT: kshiftrw $7, %k2, %k2 2766; AVX512F-NEXT: korw %k2, %k1, %k1 2767; AVX512F-NEXT: kandw %k7, %k1, %k1 2768; AVX512F-NEXT: movzbl 40(%rbp), %eax 2769; AVX512F-NEXT: kmovw %eax, %k2 2770; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2771; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2772; AVX512F-NEXT: kshiftrw $6, %k2, %k2 2773; AVX512F-NEXT: korw %k2, %k1, %k1 2774; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2775; AVX512F-NEXT: kandw %k2, %k1, %k1 2776; AVX512F-NEXT: movzbl 48(%rbp), %eax 2777; AVX512F-NEXT: kmovw %eax, %k2 2778; AVX512F-NEXT: kshiftlw $15, %k2, %k5 2779; AVX512F-NEXT: kshiftrw $5, %k5, %k2 2780; AVX512F-NEXT: korw %k2, %k1, %k1 2781; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2782; AVX512F-NEXT: kandw %k2, %k1, %k1 2783; AVX512F-NEXT: movzbl 56(%rbp), %eax 2784; AVX512F-NEXT: kmovw %eax, %k2 2785; AVX512F-NEXT: kshiftlw $15, %k2, %k4 2786; AVX512F-NEXT: kshiftrw $4, %k4, %k2 2787; AVX512F-NEXT: korw %k2, %k1, %k1 2788; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2789; AVX512F-NEXT: kandw %k2, %k1, %k1 2790; AVX512F-NEXT: movzbl 64(%rbp), %eax 2791; AVX512F-NEXT: kmovw %eax, %k2 2792; AVX512F-NEXT: kshiftlw $15, %k2, %k3 2793; AVX512F-NEXT: kshiftrw $3, %k3, %k2 2794; AVX512F-NEXT: korw %k2, %k1, %k1 2795; AVX512F-NEXT: kandw %k0, %k1, %k1 2796; AVX512F-NEXT: movzbl 72(%rbp), %eax 2797; AVX512F-NEXT: kmovw %eax, %k2 2798; AVX512F-NEXT: kshiftlw $15, %k2, %k2 2799; AVX512F-NEXT: kshiftrw $2, %k2, %k0 2800; AVX512F-NEXT: korw %k0, %k1, %k0 2801; 
AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2802; AVX512F-NEXT: kandw %k1, %k0, %k0 2803; AVX512F-NEXT: movzbl 80(%rbp), %eax 2804; AVX512F-NEXT: kmovw %eax, %k1 2805; AVX512F-NEXT: kshiftlw $14, %k1, %k7 2806; AVX512F-NEXT: korw %k7, %k0, %k0 2807; AVX512F-NEXT: kshiftlw $1, %k0, %k0 2808; AVX512F-NEXT: kshiftrw $1, %k0, %k7 2809; AVX512F-NEXT: movzbl 88(%rbp), %eax 2810; AVX512F-NEXT: kmovw %eax, %k0 2811; AVX512F-NEXT: kshiftlw $15, %k0, %k6 2812; AVX512F-NEXT: korw %k6, %k7, %k6 2813; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2814; AVX512F-NEXT: movw $-3, %ax 2815; AVX512F-NEXT: kmovw %eax, %k6 2816; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2817; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 2818; AVX512F-NEXT: kandw %k6, %k7, %k6 2819; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 2820; AVX512F-NEXT: kshiftrw $14, %k7, %k7 2821; AVX512F-NEXT: korw %k7, %k6, %k6 2822; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 2823; AVX512F-NEXT: kandw %k7, %k6, %k6 2824; AVX512F-NEXT: kshiftrw $13, %k5, %k5 2825; AVX512F-NEXT: korw %k5, %k6, %k5 2826; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 2827; AVX512F-NEXT: kandw %k6, %k5, %k5 2828; AVX512F-NEXT: kshiftrw $12, %k4, %k4 2829; AVX512F-NEXT: korw %k4, %k5, %k4 2830; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 2831; AVX512F-NEXT: kandw %k5, %k4, %k4 2832; AVX512F-NEXT: kshiftrw $11, %k3, %k3 2833; AVX512F-NEXT: korw %k3, %k4, %k3 2834; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 2835; AVX512F-NEXT: kandw %k4, %k3, %k3 2836; AVX512F-NEXT: kshiftrw $10, %k2, %k2 2837; AVX512F-NEXT: korw %k2, %k3, %k2 2838; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 2839; AVX512F-NEXT: kandw %k3, %k2, %k2 2840; AVX512F-NEXT: kshiftlw $6, %k1, %k1 2841; AVX512F-NEXT: korw %k1, %k2, %k1 2842; AVX512F-NEXT: kshiftlw $9, %k1, %k1 
2843; AVX512F-NEXT: kshiftrw $9, %k1, %k1 2844; AVX512F-NEXT: kshiftlw $7, %k0, %k0 2845; AVX512F-NEXT: korw %k0, %k1, %k0 2846; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2847; AVX512F-NEXT: kshiftlw $9, %k1, %k1 2848; AVX512F-NEXT: kshiftrw $9, %k1, %k1 2849; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2850; AVX512F-NEXT: kshiftlw $7, %k2, %k2 2851; AVX512F-NEXT: korw %k2, %k1, %k1 2852; AVX512F-NEXT: kxorw %k0, %k1, %k0 2853; AVX512F-NEXT: kshiftrw $4, %k0, %k1 2854; AVX512F-NEXT: kxorw %k1, %k0, %k0 2855; AVX512F-NEXT: kshiftrw $2, %k0, %k1 2856; AVX512F-NEXT: kxorw %k1, %k0, %k0 2857; AVX512F-NEXT: kshiftrw $1, %k0, %k1 2858; AVX512F-NEXT: kxorw %k1, %k0, %k0 2859; AVX512F-NEXT: kmovw %k0, %eax 2860; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 2861; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2862; AVX512F-NEXT: kandw %k1, %k0, %k0 2863; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2864; AVX512F-NEXT: kshiftrw $14, %k1, %k1 2865; AVX512F-NEXT: korw %k1, %k0, %k0 2866; AVX512F-NEXT: kandw %k7, %k0, %k0 2867; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2868; AVX512F-NEXT: kshiftrw $13, %k1, %k1 2869; AVX512F-NEXT: korw %k1, %k0, %k0 2870; AVX512F-NEXT: kandw %k6, %k0, %k0 2871; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2872; AVX512F-NEXT: kshiftrw $12, %k1, %k1 2873; AVX512F-NEXT: korw %k1, %k0, %k0 2874; AVX512F-NEXT: kandw %k5, %k0, %k0 2875; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2876; AVX512F-NEXT: kshiftrw $11, %k1, %k1 2877; AVX512F-NEXT: korw %k1, %k0, %k0 2878; AVX512F-NEXT: kandw %k4, %k0, %k0 2879; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2880; AVX512F-NEXT: kshiftrw $10, %k1, %k1 2881; AVX512F-NEXT: korw %k1, %k0, %k0 2882; AVX512F-NEXT: kandw %k3, %k0, %k0 2883; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2884; AVX512F-NEXT: 
kshiftlw $6, %k1, %k1 2885; AVX512F-NEXT: korw %k1, %k0, %k0 2886; AVX512F-NEXT: kshiftlw $9, %k0, %k0 2887; AVX512F-NEXT: kshiftrw $9, %k0, %k0 2888; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2889; AVX512F-NEXT: kshiftlw $7, %k1, %k1 2890; AVX512F-NEXT: korw %k1, %k0, %k0 2891; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2892; AVX512F-NEXT: kshiftlw $9, %k1, %k1 2893; AVX512F-NEXT: kshiftrw $9, %k1, %k1 2894; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2895; AVX512F-NEXT: kshiftlw $7, %k2, %k2 2896; AVX512F-NEXT: korw %k2, %k1, %k1 2897; AVX512F-NEXT: kxorw %k0, %k1, %k0 2898; AVX512F-NEXT: kshiftrw $4, %k0, %k1 2899; AVX512F-NEXT: kxorw %k1, %k0, %k0 2900; AVX512F-NEXT: kshiftrw $2, %k0, %k1 2901; AVX512F-NEXT: kxorw %k1, %k0, %k0 2902; AVX512F-NEXT: kshiftrw $1, %k0, %k1 2903; AVX512F-NEXT: kxorw %k1, %k0, %k0 2904; AVX512F-NEXT: kmovw %k0, %ecx 2905; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2906; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 2907; AVX512F-NEXT: kxorw %k2, %k3, %k0 2908; AVX512F-NEXT: kshiftrw $8, %k0, %k1 2909; AVX512F-NEXT: kxorw %k1, %k0, %k0 2910; AVX512F-NEXT: kshiftrw $4, %k0, %k1 2911; AVX512F-NEXT: kxorw %k1, %k0, %k0 2912; AVX512F-NEXT: kshiftrw $2, %k0, %k1 2913; AVX512F-NEXT: kxorw %k1, %k0, %k0 2914; AVX512F-NEXT: kshiftrw $1, %k0, %k1 2915; AVX512F-NEXT: kxorw %k1, %k0, %k0 2916; AVX512F-NEXT: kmovw %k0, %edx 2917; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 2918; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero 2919; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), 
%k1 # 2-byte Reload 2920; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k1} {z} 2921; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1 2922; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 2923; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero 2924; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2925; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k1} {z} 2926; AVX512F-NEXT: vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1 2927; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 2928; AVX512F-NEXT: vpcompressd %zmm6, %zmm6 {%k3} {z} 2929; AVX512F-NEXT: vpternlogd {{.*#+}} zmm7 {%k3} {z} = -1 2930; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 2931; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 2932; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k2} {z} 2933; AVX512F-NEXT: vpternlogd {{.*#+}} zmm8 {%k2} {z} = -1 2934; AVX512F-NEXT: vpmovdb %zmm6, {{[0-9]+}}(%rsp) 2935; AVX512F-NEXT: andl $31, %eax 
2936; AVX512F-NEXT: vpmovdb %zmm0, 64(%rsp,%rax) 2937; AVX512F-NEXT: vpmovdb %zmm3, {{[0-9]+}}(%rsp) 2938; AVX512F-NEXT: andl $31, %ecx 2939; AVX512F-NEXT: vpmovdb %zmm2, 96(%rsp,%rcx) 2940; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 2941; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) 2942; AVX512F-NEXT: andl $63, %edx 2943; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 2944; AVX512F-NEXT: vmovaps %ymm0, 128(%rsp,%rdx) 2945; AVX512F-NEXT: vpmovdb %zmm4, %xmm0 2946; AVX512F-NEXT: vpmovdb %zmm5, %xmm2 2947; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2948; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 2949; AVX512F-NEXT: vpblendvb %ymm0, {{[0-9]+}}(%rsp), %ymm2, %ymm0 2950; AVX512F-NEXT: vpmovdb %zmm7, %xmm2 2951; AVX512F-NEXT: vpmovdb %zmm8, %xmm3 2952; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2953; AVX512F-NEXT: vpblendvb %ymm2, {{[0-9]+}}(%rsp), %ymm1, %ymm1 2954; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 2955; AVX512F-NEXT: movq %rbp, %rsp 2956; AVX512F-NEXT: popq %rbp 2957; AVX512F-NEXT: retq 2958; 2959; AVX512VL-LABEL: test_compress_v64i8: 2960; AVX512VL: # %bb.0: 2961; AVX512VL-NEXT: vpsllw $7, %zmm1, %zmm1 2962; AVX512VL-NEXT: vpmovb2m %zmm1, %k1 2963; AVX512VL-NEXT: vpcompressb %zmm0, %zmm2 {%k1} 2964; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0 2965; AVX512VL-NEXT: retq 2966 %out = call <64 x i8> @llvm.experimental.vector.compress(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> %passthru) 2967 ret <64 x i8> %out 2968} 2969 2970define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x i16> %passthru) nounwind { 2971; AVX2-LABEL: test_compress_v32i16: 2972; AVX2: # %bb.0: 2973; AVX2-NEXT: pushq %rbp 2974; AVX2-NEXT: movq %rsp, %rbp 2975; AVX2-NEXT: pushq %r15 2976; AVX2-NEXT: pushq %r14 2977; AVX2-NEXT: pushq %r13 2978; AVX2-NEXT: pushq %r12 2979; AVX2-NEXT: pushq %rbx 2980; AVX2-NEXT: andq $-32, %rsp 2981; AVX2-NEXT: subq $256, %rsp # imm = 0x100 2982; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) 2983; AVX2-NEXT: vmovaps 
%ymm3, (%rsp) 2984; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 2985; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 2986; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2987; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 2988; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 2989; AVX2-NEXT: vpand %ymm5, %ymm6, %ymm5 2990; AVX2-NEXT: vpaddw %ymm4, %ymm5, %ymm4 2991; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 2992; AVX2-NEXT: vpaddw %xmm5, %xmm4, %xmm4 2993; AVX2-NEXT: vpextrw $1, %xmm4, %eax 2994; AVX2-NEXT: vmovd %xmm4, %ecx 2995; AVX2-NEXT: addl %eax, %ecx 2996; AVX2-NEXT: vpextrw $2, %xmm4, %eax 2997; AVX2-NEXT: vpextrw $3, %xmm4, %edx 2998; AVX2-NEXT: addl %eax, %edx 2999; AVX2-NEXT: addl %ecx, %edx 3000; AVX2-NEXT: vpextrw $4, %xmm4, %eax 3001; AVX2-NEXT: vpextrw $5, %xmm4, %ecx 3002; AVX2-NEXT: addl %eax, %ecx 3003; AVX2-NEXT: vpextrw $6, %xmm4, %eax 3004; AVX2-NEXT: addl %ecx, %eax 3005; AVX2-NEXT: addl %edx, %eax 3006; AVX2-NEXT: vpextrw $7, %xmm4, %ecx 3007; AVX2-NEXT: addl %eax, %ecx 3008; AVX2-NEXT: andl $31, %ecx 3009; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3010; AVX2-NEXT: vpextrb $1, %xmm2, %eax 3011; AVX2-NEXT: andl $1, %eax 3012; AVX2-NEXT: vmovd %xmm2, %ecx 3013; AVX2-NEXT: andl $1, %ecx 3014; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3015; AVX2-NEXT: addq %rcx, %rax 3016; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3017; AVX2-NEXT: vpextrb $2, %xmm2, %ecx 3018; AVX2-NEXT: andl $1, %ecx 3019; AVX2-NEXT: addq %rax, %rcx 3020; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill 3021; AVX2-NEXT: vpextrb $3, %xmm2, %eax 3022; AVX2-NEXT: andl $1, %eax 3023; AVX2-NEXT: addq %rcx, %rax 3024; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3025; AVX2-NEXT: vpextrb $4, %xmm2, %ecx 3026; AVX2-NEXT: andl $1, %ecx 3027; AVX2-NEXT: addq %rax, %rcx 3028; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3029; AVX2-NEXT: vpextrb $5, %xmm2, %eax 3030; AVX2-NEXT: andl $1, %eax 3031; AVX2-NEXT: addq %rcx, %rax 3032; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3033; AVX2-NEXT: vpextrb $6, %xmm2, %ecx 3034; AVX2-NEXT: andl $1, %ecx 3035; AVX2-NEXT: addq %rax, %rcx 3036; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3037; AVX2-NEXT: vpextrb $7, %xmm2, %eax 3038; AVX2-NEXT: andl $1, %eax 3039; AVX2-NEXT: addq %rcx, %rax 3040; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3041; AVX2-NEXT: vpextrb $8, %xmm2, %ecx 3042; AVX2-NEXT: andl $1, %ecx 3043; AVX2-NEXT: addq %rax, %rcx 3044; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3045; AVX2-NEXT: vpextrb $9, %xmm2, %eax 3046; AVX2-NEXT: andl $1, %eax 3047; AVX2-NEXT: addq %rcx, %rax 3048; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3049; AVX2-NEXT: vpextrb $10, %xmm2, %ecx 3050; AVX2-NEXT: andl $1, %ecx 3051; AVX2-NEXT: addq %rax, %rcx 3052; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3053; AVX2-NEXT: vpextrb $11, %xmm2, %eax 3054; AVX2-NEXT: andl $1, %eax 3055; AVX2-NEXT: addq %rcx, %rax 3056; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3057; AVX2-NEXT: vpextrb $12, %xmm2, %ecx 3058; AVX2-NEXT: andl $1, %ecx 3059; AVX2-NEXT: addq %rax, %rcx 3060; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3061; AVX2-NEXT: vpextrb $13, %xmm2, %eax 3062; AVX2-NEXT: andl $1, %eax 3063; AVX2-NEXT: addq %rcx, %rax 3064; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3065; AVX2-NEXT: vpextrb $14, %xmm2, %ecx 3066; AVX2-NEXT: andl $1, %ecx 3067; 
AVX2-NEXT: addq %rax, %rcx 3068; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3069; AVX2-NEXT: vpextrb $15, %xmm2, %eax 3070; AVX2-NEXT: andl $1, %eax 3071; AVX2-NEXT: addq %rcx, %rax 3072; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3073; AVX2-NEXT: vmovd %xmm3, %ecx 3074; AVX2-NEXT: andl $1, %ecx 3075; AVX2-NEXT: addq %rax, %rcx 3076; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3077; AVX2-NEXT: vpextrb $1, %xmm3, %eax 3078; AVX2-NEXT: andl $1, %eax 3079; AVX2-NEXT: addq %rcx, %rax 3080; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3081; AVX2-NEXT: vpextrb $2, %xmm3, %ecx 3082; AVX2-NEXT: andl $1, %ecx 3083; AVX2-NEXT: addq %rax, %rcx 3084; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3085; AVX2-NEXT: vpextrb $3, %xmm3, %r12d 3086; AVX2-NEXT: andl $1, %r12d 3087; AVX2-NEXT: addq %rcx, %r12 3088; AVX2-NEXT: vpextrb $4, %xmm3, %r15d 3089; AVX2-NEXT: andl $1, %r15d 3090; AVX2-NEXT: addq %r12, %r15 3091; AVX2-NEXT: vpextrb $5, %xmm3, %r14d 3092; AVX2-NEXT: andl $1, %r14d 3093; AVX2-NEXT: addq %r15, %r14 3094; AVX2-NEXT: vpextrb $6, %xmm3, %ebx 3095; AVX2-NEXT: andl $1, %ebx 3096; AVX2-NEXT: addq %r14, %rbx 3097; AVX2-NEXT: vpextrb $7, %xmm3, %r11d 3098; AVX2-NEXT: andl $1, %r11d 3099; AVX2-NEXT: addq %rbx, %r11 3100; AVX2-NEXT: vpextrb $8, %xmm3, %r10d 3101; AVX2-NEXT: andl $1, %r10d 3102; AVX2-NEXT: addq %r11, %r10 3103; AVX2-NEXT: vpextrb $9, %xmm3, %r9d 3104; AVX2-NEXT: andl $1, %r9d 3105; AVX2-NEXT: addq %r10, %r9 3106; AVX2-NEXT: vpextrb $10, %xmm3, %r8d 3107; AVX2-NEXT: andl $1, %r8d 3108; AVX2-NEXT: addq %r9, %r8 3109; AVX2-NEXT: vpextrb $11, %xmm3, %edi 3110; AVX2-NEXT: andl $1, %edi 3111; AVX2-NEXT: addq %r8, %rdi 3112; AVX2-NEXT: vpextrb $12, %xmm3, %esi 3113; AVX2-NEXT: andl $1, %esi 3114; AVX2-NEXT: addq %rdi, %rsi 3115; AVX2-NEXT: vpextrb $13, %xmm3, %edx 3116; AVX2-NEXT: andl $1, %edx 3117; AVX2-NEXT: addq %rsi, %rdx 3118; AVX2-NEXT: vpextrb $14, %xmm3, %ecx 3119; 
AVX2-NEXT: andl $1, %ecx 3120; AVX2-NEXT: addq %rdx, %rcx 3121; AVX2-NEXT: vpextrb $15, %xmm3, %eax 3122; AVX2-NEXT: andl $1, %eax 3123; AVX2-NEXT: addq %rcx, %rax 3124; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 3125; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3126; AVX2-NEXT: cmpq $32, %rax 3127; AVX2-NEXT: vpextrw $7, %xmm2, %eax 3128; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3129; AVX2-NEXT: cmovbw (%rsp,%r13,2), %ax 3130; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 3131; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp) 3132; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3133; AVX2-NEXT: vpextrw $1, %xmm0, (%rsp,%r13,2) 3134; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3135; AVX2-NEXT: vpextrw $2, %xmm0, (%rsp,%r13,2) 3136; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3137; AVX2-NEXT: vpextrw $3, %xmm0, (%rsp,%r13,2) 3138; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3139; AVX2-NEXT: vpextrw $4, %xmm0, (%rsp,%r13,2) 3140; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3141; AVX2-NEXT: vpextrw $5, %xmm0, (%rsp,%r13,2) 3142; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3143; AVX2-NEXT: andl $31, %r13d 3144; AVX2-NEXT: vpextrw $6, %xmm0, (%rsp,%r13,2) 3145; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3146; AVX2-NEXT: andl $31, %r13d 3147; AVX2-NEXT: vpextrw $7, %xmm0, (%rsp,%r13,2) 3148; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3149; AVX2-NEXT: andl $31, %r13d 3150; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 3151; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp,%r13,2) 3152; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3153; AVX2-NEXT: andl $31, %r13d 3154; AVX2-NEXT: vpextrw $1, %xmm0, (%rsp,%r13,2) 3155; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3156; AVX2-NEXT: andl $31, %r13d 3157; AVX2-NEXT: vpextrw $2, %xmm0, (%rsp,%r13,2) 3158; AVX2-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3159; AVX2-NEXT: andl $31, %r13d 3160; AVX2-NEXT: vpextrw $3, %xmm0, (%rsp,%r13,2) 3161; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3162; AVX2-NEXT: andl $31, %r13d 3163; AVX2-NEXT: vpextrw $4, %xmm0, (%rsp,%r13,2) 3164; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3165; AVX2-NEXT: andl $31, %r13d 3166; AVX2-NEXT: vpextrw $5, %xmm0, (%rsp,%r13,2) 3167; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3168; AVX2-NEXT: andl $31, %r13d 3169; AVX2-NEXT: vpextrw $6, %xmm0, (%rsp,%r13,2) 3170; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3171; AVX2-NEXT: andl $31, %r13d 3172; AVX2-NEXT: vpextrw $7, %xmm0, (%rsp,%r13,2) 3173; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3174; AVX2-NEXT: andl $31, %r13d 3175; AVX2-NEXT: vpextrw $0, %xmm1, (%rsp,%r13,2) 3176; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3177; AVX2-NEXT: andl $31, %r13d 3178; AVX2-NEXT: vpextrw $1, %xmm1, (%rsp,%r13,2) 3179; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload 3180; AVX2-NEXT: andl $31, %r13d 3181; AVX2-NEXT: vpextrw $2, %xmm1, (%rsp,%r13,2) 3182; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 3183; AVX2-NEXT: andl $31, %eax 3184; AVX2-NEXT: vpextrw $3, %xmm1, (%rsp,%rax,2) 3185; AVX2-NEXT: andl $31, %r12d 3186; AVX2-NEXT: vpextrw $4, %xmm1, (%rsp,%r12,2) 3187; AVX2-NEXT: andl $31, %r15d 3188; AVX2-NEXT: vpextrw $5, %xmm1, (%rsp,%r15,2) 3189; AVX2-NEXT: andl $31, %r14d 3190; AVX2-NEXT: vpextrw $6, %xmm1, (%rsp,%r14,2) 3191; AVX2-NEXT: andl $31, %ebx 3192; AVX2-NEXT: vpextrw $7, %xmm1, (%rsp,%rbx,2) 3193; AVX2-NEXT: andl $31, %r11d 3194; AVX2-NEXT: vpextrw $0, %xmm2, (%rsp,%r11,2) 3195; AVX2-NEXT: andl $31, %r10d 3196; AVX2-NEXT: vpextrw $1, %xmm2, (%rsp,%r10,2) 3197; AVX2-NEXT: andl $31, %r9d 3198; AVX2-NEXT: vpextrw $2, %xmm2, (%rsp,%r9,2) 3199; AVX2-NEXT: andl $31, %r8d 3200; AVX2-NEXT: vpextrw $3, %xmm2, 
(%rsp,%r8,2) 3201; AVX2-NEXT: andl $31, %edi 3202; AVX2-NEXT: vpextrw $4, %xmm2, (%rsp,%rdi,2) 3203; AVX2-NEXT: andl $31, %esi 3204; AVX2-NEXT: vpextrw $5, %xmm2, (%rsp,%rsi,2) 3205; AVX2-NEXT: andl $31, %edx 3206; AVX2-NEXT: vpextrw $6, %xmm2, (%rsp,%rdx,2) 3207; AVX2-NEXT: andl $31, %ecx 3208; AVX2-NEXT: vpextrw $7, %xmm2, (%rsp,%rcx,2) 3209; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload 3210; AVX2-NEXT: cmpq $31, %rcx 3211; AVX2-NEXT: movl $31, %eax 3212; AVX2-NEXT: cmovbq %rcx, %rax 3213; AVX2-NEXT: movl %eax, %eax 3214; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload 3215; AVX2-NEXT: movw %cx, (%rsp,%rax,2) 3216; AVX2-NEXT: vmovaps (%rsp), %ymm0 3217; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 3218; AVX2-NEXT: leaq -40(%rbp), %rsp 3219; AVX2-NEXT: popq %rbx 3220; AVX2-NEXT: popq %r12 3221; AVX2-NEXT: popq %r13 3222; AVX2-NEXT: popq %r14 3223; AVX2-NEXT: popq %r15 3224; AVX2-NEXT: popq %rbp 3225; AVX2-NEXT: retq 3226; 3227; AVX512F-LABEL: test_compress_v32i16: 3228; AVX512F: # %bb.0: 3229; AVX512F-NEXT: pushq %rbp 3230; AVX512F-NEXT: movq %rsp, %rbp 3231; AVX512F-NEXT: andq $-64, %rsp 3232; AVX512F-NEXT: subq $128, %rsp 3233; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 3234; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 3235; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero 3236; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5 3237; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5 3238; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1 3239; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 3240; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 3241; AVX512F-NEXT: vptestmd 
%zmm1, %zmm1, %k2 3242; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3243; AVX512F-NEXT: vpcompressd %zmm1, %zmm1 {%k2} {z} 3244; AVX512F-NEXT: vpmovdw %zmm1, (%rsp) 3245; AVX512F-NEXT: kshiftrw $8, %k2, %k0 3246; AVX512F-NEXT: kxorw %k0, %k2, %k0 3247; AVX512F-NEXT: kshiftrw $4, %k0, %k2 3248; AVX512F-NEXT: kxorw %k2, %k0, %k0 3249; AVX512F-NEXT: kshiftrw $2, %k0, %k2 3250; AVX512F-NEXT: kxorw %k2, %k0, %k0 3251; AVX512F-NEXT: kshiftrw $1, %k0, %k2 3252; AVX512F-NEXT: kxorw %k2, %k0, %k0 3253; AVX512F-NEXT: kmovw %k0, %eax 3254; AVX512F-NEXT: andl $31, %eax 3255; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 3256; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3257; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z} 3258; AVX512F-NEXT: vpmovdw %zmm0, (%rsp,%rax,2) 3259; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 3260; AVX512F-NEXT: vpsllw $15, %ymm4, %ymm1 3261; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 3262; AVX512F-NEXT: vpblendvb %ymm1, {{[0-9]+}}(%rsp), %ymm0, %ymm0 3263; AVX512F-NEXT: vpsllw $15, %ymm3, %ymm1 3264; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 3265; AVX512F-NEXT: vpblendvb %ymm1, (%rsp), %ymm2, %ymm1 3266; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 3267; AVX512F-NEXT: movq %rbp, %rsp 3268; AVX512F-NEXT: popq %rbp 3269; AVX512F-NEXT: retq 3270; 3271; AVX512VL-LABEL: test_compress_v32i16: 3272; AVX512VL: # %bb.0: 3273; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1 3274; AVX512VL-NEXT: vpmovb2m %ymm1, %k1 3275; AVX512VL-NEXT: vpcompressw %zmm0, %zmm2 {%k1} 3276; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0 3277; AVX512VL-NEXT: retq 3278 %out = call <32 x 
i16> @llvm.experimental.vector.compress(<32 x i16> %vec, <32 x i1> %mask, <32 x i16> %passthru) 3279 ret <32 x i16> %out 3280} 3281 3282define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i32> %passthru) nounwind { 3283; AVX2-LABEL: test_compress_large: 3284; AVX2: # %bb.0: 3285; AVX2-NEXT: pushq %rbp 3286; AVX2-NEXT: movq %rsp, %rbp 3287; AVX2-NEXT: andq $-32, %rsp 3288; AVX2-NEXT: subq $288, %rsp # imm = 0x120 3289; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3290; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3291; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 3292; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 3293; AVX2-NEXT: # kill: def $esi killed $esi def $rsi 3294; AVX2-NEXT: movq %rdi, %rax 3295; AVX2-NEXT: vmovss %xmm0, (%rsp) 3296; AVX2-NEXT: andl $1, %esi 3297; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rsi,4) 3298; AVX2-NEXT: andl $1, %edx 3299; AVX2-NEXT: addl %esi, %edx 3300; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rdx,4) 3301; AVX2-NEXT: andl $1, %ecx 3302; AVX2-NEXT: addl %edx, %ecx 3303; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rcx,4) 3304; AVX2-NEXT: andl $1, %r8d 3305; AVX2-NEXT: addl %ecx, %r8d 3306; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3307; AVX2-NEXT: vmovss %xmm0, (%rsp,%r8,4) 3308; AVX2-NEXT: andl $1, %r9d 3309; AVX2-NEXT: addl %r8d, %r9d 3310; AVX2-NEXT: movzbl 16(%rbp), %ecx 3311; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%r9,4) 3312; AVX2-NEXT: movzbl %cl, %ecx 3313; AVX2-NEXT: andl $1, %ecx 3314; AVX2-NEXT: addl %r9d, %ecx 3315; AVX2-NEXT: movzbl 24(%rbp), %edx 3316; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) 3317; AVX2-NEXT: movzbl %dl, %edx 3318; AVX2-NEXT: andl $1, %edx 3319; AVX2-NEXT: addl %ecx, %edx 3320; AVX2-NEXT: movzbl 32(%rbp), %ecx 3321; AVX2-NEXT: movzbl %cl, %ecx 3322; AVX2-NEXT: andl $1, %ecx 3323; AVX2-NEXT: addl %edx, %ecx 3324; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3325; AVX2-NEXT: andl $63, %edx 3326; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) 3327; AVX2-NEXT: 
movzbl 40(%rbp), %edx 3328; AVX2-NEXT: movzbl %dl, %edx 3329; AVX2-NEXT: andl $1, %edx 3330; AVX2-NEXT: addl %ecx, %edx 3331; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3332; AVX2-NEXT: andl $63, %ecx 3333; AVX2-NEXT: vmovss %xmm1, (%rsp,%rcx,4) 3334; AVX2-NEXT: movzbl 48(%rbp), %ecx 3335; AVX2-NEXT: movzbl %cl, %ecx 3336; AVX2-NEXT: andl $1, %ecx 3337; AVX2-NEXT: addl %edx, %ecx 3338; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3339; AVX2-NEXT: andl $63, %edx 3340; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rdx,4) 3341; AVX2-NEXT: movzbl 56(%rbp), %edx 3342; AVX2-NEXT: movzbl %dl, %edx 3343; AVX2-NEXT: andl $1, %edx 3344; AVX2-NEXT: addl %ecx, %edx 3345; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3346; AVX2-NEXT: andl $63, %ecx 3347; AVX2-NEXT: vextractps $2, %xmm1, (%rsp,%rcx,4) 3348; AVX2-NEXT: movzbl 64(%rbp), %ecx 3349; AVX2-NEXT: movzbl %cl, %ecx 3350; AVX2-NEXT: andl $1, %ecx 3351; AVX2-NEXT: addl %edx, %ecx 3352; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3353; AVX2-NEXT: andl $63, %edx 3354; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%rdx,4) 3355; AVX2-NEXT: movzbl 72(%rbp), %edx 3356; AVX2-NEXT: movzbl %dl, %edx 3357; AVX2-NEXT: andl $1, %edx 3358; AVX2-NEXT: addl %ecx, %edx 3359; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3360; AVX2-NEXT: andl $63, %ecx 3361; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 3362; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) 3363; AVX2-NEXT: movzbl 80(%rbp), %ecx 3364; AVX2-NEXT: movzbl %cl, %ecx 3365; AVX2-NEXT: andl $1, %ecx 3366; AVX2-NEXT: addl %edx, %ecx 3367; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3368; AVX2-NEXT: andl $63, %edx 3369; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) 3370; AVX2-NEXT: movzbl 88(%rbp), %edx 3371; AVX2-NEXT: movzbl %dl, %edx 3372; AVX2-NEXT: andl $1, %edx 3373; AVX2-NEXT: addl %ecx, %edx 3374; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3375; AVX2-NEXT: andl $63, %ecx 3376; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) 3377; AVX2-NEXT: movzbl 96(%rbp), %ecx 3378; 
AVX2-NEXT: movzbl %cl, %ecx 3379; AVX2-NEXT: andl $1, %ecx 3380; AVX2-NEXT: addl %edx, %ecx 3381; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3382; AVX2-NEXT: andl $63, %edx 3383; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) 3384; AVX2-NEXT: movzbl 104(%rbp), %edx 3385; AVX2-NEXT: movzbl %dl, %edx 3386; AVX2-NEXT: andl $1, %edx 3387; AVX2-NEXT: addl %ecx, %edx 3388; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3389; AVX2-NEXT: andl $63, %ecx 3390; AVX2-NEXT: vmovss %xmm2, (%rsp,%rcx,4) 3391; AVX2-NEXT: movzbl 112(%rbp), %ecx 3392; AVX2-NEXT: movzbl %cl, %ecx 3393; AVX2-NEXT: andl $1, %ecx 3394; AVX2-NEXT: addl %edx, %ecx 3395; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3396; AVX2-NEXT: andl $63, %edx 3397; AVX2-NEXT: vextractps $1, %xmm2, (%rsp,%rdx,4) 3398; AVX2-NEXT: movzbl 120(%rbp), %edx 3399; AVX2-NEXT: movzbl %dl, %edx 3400; AVX2-NEXT: andl $1, %edx 3401; AVX2-NEXT: addl %ecx, %edx 3402; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3403; AVX2-NEXT: andl $63, %ecx 3404; AVX2-NEXT: vextractps $2, %xmm2, (%rsp,%rcx,4) 3405; AVX2-NEXT: movzbl 128(%rbp), %ecx 3406; AVX2-NEXT: movzbl %cl, %ecx 3407; AVX2-NEXT: andl $1, %ecx 3408; AVX2-NEXT: addl %edx, %ecx 3409; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3410; AVX2-NEXT: andl $63, %edx 3411; AVX2-NEXT: vextractps $3, %xmm2, (%rsp,%rdx,4) 3412; AVX2-NEXT: movzbl 136(%rbp), %edx 3413; AVX2-NEXT: movzbl %dl, %edx 3414; AVX2-NEXT: andl $1, %edx 3415; AVX2-NEXT: addl %ecx, %edx 3416; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3417; AVX2-NEXT: andl $63, %ecx 3418; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm0 3419; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) 3420; AVX2-NEXT: movzbl 144(%rbp), %ecx 3421; AVX2-NEXT: movzbl %cl, %ecx 3422; AVX2-NEXT: andl $1, %ecx 3423; AVX2-NEXT: addl %edx, %ecx 3424; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3425; AVX2-NEXT: andl $63, %edx 3426; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) 3427; AVX2-NEXT: movzbl 152(%rbp), %edx 3428; AVX2-NEXT: movzbl %dl, 
%edx 3429; AVX2-NEXT: andl $1, %edx 3430; AVX2-NEXT: addl %ecx, %edx 3431; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3432; AVX2-NEXT: andl $63, %ecx 3433; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) 3434; AVX2-NEXT: movzbl 160(%rbp), %ecx 3435; AVX2-NEXT: movzbl %cl, %ecx 3436; AVX2-NEXT: andl $1, %ecx 3437; AVX2-NEXT: addl %edx, %ecx 3438; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3439; AVX2-NEXT: andl $63, %edx 3440; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) 3441; AVX2-NEXT: movzbl 168(%rbp), %edx 3442; AVX2-NEXT: movzbl %dl, %edx 3443; AVX2-NEXT: andl $1, %edx 3444; AVX2-NEXT: addl %ecx, %edx 3445; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3446; AVX2-NEXT: andl $63, %ecx 3447; AVX2-NEXT: vmovss %xmm3, (%rsp,%rcx,4) 3448; AVX2-NEXT: movzbl 176(%rbp), %ecx 3449; AVX2-NEXT: movzbl %cl, %ecx 3450; AVX2-NEXT: andl $1, %ecx 3451; AVX2-NEXT: addl %edx, %ecx 3452; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3453; AVX2-NEXT: andl $63, %edx 3454; AVX2-NEXT: vextractps $1, %xmm3, (%rsp,%rdx,4) 3455; AVX2-NEXT: movzbl 184(%rbp), %edx 3456; AVX2-NEXT: movzbl %dl, %edx 3457; AVX2-NEXT: andl $1, %edx 3458; AVX2-NEXT: addl %ecx, %edx 3459; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3460; AVX2-NEXT: andl $63, %ecx 3461; AVX2-NEXT: vextractps $2, %xmm3, (%rsp,%rcx,4) 3462; AVX2-NEXT: movzbl 192(%rbp), %ecx 3463; AVX2-NEXT: movzbl %cl, %ecx 3464; AVX2-NEXT: andl $1, %ecx 3465; AVX2-NEXT: addl %edx, %ecx 3466; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3467; AVX2-NEXT: andl $63, %edx 3468; AVX2-NEXT: vextractps $3, %xmm3, (%rsp,%rdx,4) 3469; AVX2-NEXT: movzbl 200(%rbp), %edx 3470; AVX2-NEXT: movzbl %dl, %edx 3471; AVX2-NEXT: andl $1, %edx 3472; AVX2-NEXT: addl %ecx, %edx 3473; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3474; AVX2-NEXT: andl $63, %ecx 3475; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm0 3476; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) 3477; AVX2-NEXT: movzbl 208(%rbp), %ecx 3478; AVX2-NEXT: movzbl %cl, %ecx 3479; AVX2-NEXT: 
andl $1, %ecx 3480; AVX2-NEXT: addl %edx, %ecx 3481; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3482; AVX2-NEXT: andl $63, %edx 3483; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) 3484; AVX2-NEXT: movzbl 216(%rbp), %edx 3485; AVX2-NEXT: movzbl %dl, %edx 3486; AVX2-NEXT: andl $1, %edx 3487; AVX2-NEXT: addl %ecx, %edx 3488; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3489; AVX2-NEXT: andl $63, %ecx 3490; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) 3491; AVX2-NEXT: movzbl 224(%rbp), %ecx 3492; AVX2-NEXT: movzbl %cl, %ecx 3493; AVX2-NEXT: andl $1, %ecx 3494; AVX2-NEXT: addl %edx, %ecx 3495; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3496; AVX2-NEXT: andl $63, %edx 3497; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) 3498; AVX2-NEXT: movzbl 232(%rbp), %edx 3499; AVX2-NEXT: movzbl %dl, %edx 3500; AVX2-NEXT: andl $1, %edx 3501; AVX2-NEXT: addl %ecx, %edx 3502; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3503; AVX2-NEXT: andl $63, %ecx 3504; AVX2-NEXT: vmovss %xmm4, (%rsp,%rcx,4) 3505; AVX2-NEXT: movzbl 240(%rbp), %ecx 3506; AVX2-NEXT: movzbl %cl, %ecx 3507; AVX2-NEXT: andl $1, %ecx 3508; AVX2-NEXT: addl %edx, %ecx 3509; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3510; AVX2-NEXT: andl $63, %edx 3511; AVX2-NEXT: vextractps $1, %xmm4, (%rsp,%rdx,4) 3512; AVX2-NEXT: movzbl 248(%rbp), %edx 3513; AVX2-NEXT: movzbl %dl, %edx 3514; AVX2-NEXT: andl $1, %edx 3515; AVX2-NEXT: addl %ecx, %edx 3516; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3517; AVX2-NEXT: andl $63, %ecx 3518; AVX2-NEXT: vextractps $2, %xmm4, (%rsp,%rcx,4) 3519; AVX2-NEXT: movzbl 256(%rbp), %ecx 3520; AVX2-NEXT: movzbl %cl, %ecx 3521; AVX2-NEXT: andl $1, %ecx 3522; AVX2-NEXT: addl %edx, %ecx 3523; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3524; AVX2-NEXT: andl $63, %edx 3525; AVX2-NEXT: vextractps $3, %xmm4, (%rsp,%rdx,4) 3526; AVX2-NEXT: movzbl 264(%rbp), %edx 3527; AVX2-NEXT: movzbl %dl, %edx 3528; AVX2-NEXT: andl $1, %edx 3529; AVX2-NEXT: addl %ecx, %edx 3530; 
AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3531; AVX2-NEXT: andl $63, %ecx 3532; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm0 3533; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) 3534; AVX2-NEXT: movzbl 272(%rbp), %ecx 3535; AVX2-NEXT: movzbl %cl, %ecx 3536; AVX2-NEXT: andl $1, %ecx 3537; AVX2-NEXT: addl %edx, %ecx 3538; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3539; AVX2-NEXT: andl $63, %edx 3540; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) 3541; AVX2-NEXT: movzbl 280(%rbp), %edx 3542; AVX2-NEXT: movzbl %dl, %edx 3543; AVX2-NEXT: andl $1, %edx 3544; AVX2-NEXT: addl %ecx, %edx 3545; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3546; AVX2-NEXT: andl $63, %ecx 3547; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) 3548; AVX2-NEXT: movzbl 288(%rbp), %ecx 3549; AVX2-NEXT: movzbl %cl, %ecx 3550; AVX2-NEXT: andl $1, %ecx 3551; AVX2-NEXT: addl %edx, %ecx 3552; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3553; AVX2-NEXT: andl $63, %edx 3554; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) 3555; AVX2-NEXT: movzbl 296(%rbp), %edx 3556; AVX2-NEXT: movzbl %dl, %edx 3557; AVX2-NEXT: andl $1, %edx 3558; AVX2-NEXT: addl %ecx, %edx 3559; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3560; AVX2-NEXT: andl $63, %ecx 3561; AVX2-NEXT: vmovss %xmm5, (%rsp,%rcx,4) 3562; AVX2-NEXT: movzbl 304(%rbp), %ecx 3563; AVX2-NEXT: movzbl %cl, %ecx 3564; AVX2-NEXT: andl $1, %ecx 3565; AVX2-NEXT: addl %edx, %ecx 3566; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3567; AVX2-NEXT: andl $63, %edx 3568; AVX2-NEXT: vextractps $1, %xmm5, (%rsp,%rdx,4) 3569; AVX2-NEXT: movzbl 312(%rbp), %edx 3570; AVX2-NEXT: movzbl %dl, %edx 3571; AVX2-NEXT: andl $1, %edx 3572; AVX2-NEXT: addl %ecx, %edx 3573; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3574; AVX2-NEXT: andl $63, %ecx 3575; AVX2-NEXT: vextractps $2, %xmm5, (%rsp,%rcx,4) 3576; AVX2-NEXT: movzbl 320(%rbp), %ecx 3577; AVX2-NEXT: movzbl %cl, %ecx 3578; AVX2-NEXT: andl $1, %ecx 3579; AVX2-NEXT: addl %edx, %ecx 3580; AVX2-NEXT: # kill: def 
$edx killed $edx def $rdx 3581; AVX2-NEXT: andl $63, %edx 3582; AVX2-NEXT: vextractps $3, %xmm5, (%rsp,%rdx,4) 3583; AVX2-NEXT: movzbl 328(%rbp), %edx 3584; AVX2-NEXT: movzbl %dl, %edx 3585; AVX2-NEXT: andl $1, %edx 3586; AVX2-NEXT: addl %ecx, %edx 3587; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3588; AVX2-NEXT: andl $63, %ecx 3589; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm0 3590; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) 3591; AVX2-NEXT: movzbl 336(%rbp), %ecx 3592; AVX2-NEXT: movzbl %cl, %ecx 3593; AVX2-NEXT: andl $1, %ecx 3594; AVX2-NEXT: addl %edx, %ecx 3595; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3596; AVX2-NEXT: andl $63, %edx 3597; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) 3598; AVX2-NEXT: movzbl 344(%rbp), %edx 3599; AVX2-NEXT: movzbl %dl, %edx 3600; AVX2-NEXT: andl $1, %edx 3601; AVX2-NEXT: addl %ecx, %edx 3602; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3603; AVX2-NEXT: andl $63, %ecx 3604; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) 3605; AVX2-NEXT: movzbl 352(%rbp), %ecx 3606; AVX2-NEXT: movzbl %cl, %ecx 3607; AVX2-NEXT: andl $1, %ecx 3608; AVX2-NEXT: addl %edx, %ecx 3609; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3610; AVX2-NEXT: andl $63, %edx 3611; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) 3612; AVX2-NEXT: movzbl 360(%rbp), %edx 3613; AVX2-NEXT: movzbl %dl, %edx 3614; AVX2-NEXT: andl $1, %edx 3615; AVX2-NEXT: addl %ecx, %edx 3616; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3617; AVX2-NEXT: andl $63, %ecx 3618; AVX2-NEXT: vmovss %xmm6, (%rsp,%rcx,4) 3619; AVX2-NEXT: movzbl 368(%rbp), %ecx 3620; AVX2-NEXT: movzbl %cl, %ecx 3621; AVX2-NEXT: andl $1, %ecx 3622; AVX2-NEXT: addl %edx, %ecx 3623; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3624; AVX2-NEXT: andl $63, %edx 3625; AVX2-NEXT: vextractps $1, %xmm6, (%rsp,%rdx,4) 3626; AVX2-NEXT: movzbl 376(%rbp), %edx 3627; AVX2-NEXT: movzbl %dl, %edx 3628; AVX2-NEXT: andl $1, %edx 3629; AVX2-NEXT: addl %ecx, %edx 3630; AVX2-NEXT: # kill: def $ecx killed $ecx def 
$rcx 3631; AVX2-NEXT: andl $63, %ecx 3632; AVX2-NEXT: vextractps $2, %xmm6, (%rsp,%rcx,4) 3633; AVX2-NEXT: movzbl 384(%rbp), %ecx 3634; AVX2-NEXT: movzbl %cl, %ecx 3635; AVX2-NEXT: andl $1, %ecx 3636; AVX2-NEXT: addl %edx, %ecx 3637; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3638; AVX2-NEXT: andl $63, %edx 3639; AVX2-NEXT: vextractps $3, %xmm6, (%rsp,%rdx,4) 3640; AVX2-NEXT: movzbl 392(%rbp), %edx 3641; AVX2-NEXT: movzbl %dl, %edx 3642; AVX2-NEXT: andl $1, %edx 3643; AVX2-NEXT: addl %ecx, %edx 3644; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3645; AVX2-NEXT: andl $63, %ecx 3646; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm0 3647; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) 3648; AVX2-NEXT: movzbl 400(%rbp), %ecx 3649; AVX2-NEXT: movzbl %cl, %ecx 3650; AVX2-NEXT: andl $1, %ecx 3651; AVX2-NEXT: addl %edx, %ecx 3652; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3653; AVX2-NEXT: andl $63, %edx 3654; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) 3655; AVX2-NEXT: movzbl 408(%rbp), %edx 3656; AVX2-NEXT: movzbl %dl, %edx 3657; AVX2-NEXT: andl $1, %edx 3658; AVX2-NEXT: addl %ecx, %edx 3659; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3660; AVX2-NEXT: andl $63, %ecx 3661; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) 3662; AVX2-NEXT: movzbl 416(%rbp), %ecx 3663; AVX2-NEXT: movzbl %cl, %ecx 3664; AVX2-NEXT: andl $1, %ecx 3665; AVX2-NEXT: addl %edx, %ecx 3666; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3667; AVX2-NEXT: andl $63, %edx 3668; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) 3669; AVX2-NEXT: movzbl 424(%rbp), %edx 3670; AVX2-NEXT: movzbl %dl, %edx 3671; AVX2-NEXT: andl $1, %edx 3672; AVX2-NEXT: addl %ecx, %edx 3673; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3674; AVX2-NEXT: andl $63, %ecx 3675; AVX2-NEXT: vmovss %xmm7, (%rsp,%rcx,4) 3676; AVX2-NEXT: movzbl 432(%rbp), %ecx 3677; AVX2-NEXT: movzbl %cl, %ecx 3678; AVX2-NEXT: andl $1, %ecx 3679; AVX2-NEXT: addl %edx, %ecx 3680; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3681; AVX2-NEXT: 
andl $63, %edx 3682; AVX2-NEXT: vextractps $1, %xmm7, (%rsp,%rdx,4) 3683; AVX2-NEXT: movzbl 440(%rbp), %edx 3684; AVX2-NEXT: movzbl %dl, %edx 3685; AVX2-NEXT: andl $1, %edx 3686; AVX2-NEXT: addl %ecx, %edx 3687; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3688; AVX2-NEXT: andl $63, %ecx 3689; AVX2-NEXT: vextractps $2, %xmm7, (%rsp,%rcx,4) 3690; AVX2-NEXT: movzbl 448(%rbp), %ecx 3691; AVX2-NEXT: movzbl %cl, %ecx 3692; AVX2-NEXT: andl $1, %ecx 3693; AVX2-NEXT: addl %edx, %ecx 3694; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3695; AVX2-NEXT: andl $63, %edx 3696; AVX2-NEXT: vextractps $3, %xmm7, (%rsp,%rdx,4) 3697; AVX2-NEXT: movzbl 456(%rbp), %edx 3698; AVX2-NEXT: movzbl %dl, %edx 3699; AVX2-NEXT: andl $1, %edx 3700; AVX2-NEXT: addl %ecx, %edx 3701; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3702; AVX2-NEXT: andl $63, %ecx 3703; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm0 3704; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) 3705; AVX2-NEXT: movzbl 464(%rbp), %ecx 3706; AVX2-NEXT: movzbl %cl, %ecx 3707; AVX2-NEXT: andl $1, %ecx 3708; AVX2-NEXT: addl %edx, %ecx 3709; AVX2-NEXT: # kill: def $edx killed $edx def $rdx 3710; AVX2-NEXT: andl $63, %edx 3711; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) 3712; AVX2-NEXT: movzbl 472(%rbp), %edx 3713; AVX2-NEXT: movzbl %dl, %edx 3714; AVX2-NEXT: andl $1, %edx 3715; AVX2-NEXT: addl %ecx, %edx 3716; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx 3717; AVX2-NEXT: andl $63, %ecx 3718; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) 3719; AVX2-NEXT: andl $63, %edx 3720; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) 3721; AVX2-NEXT: vmovaps (%rsp), %ymm0 3722; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 3723; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2 3724; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3 3725; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4 3726; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5 3727; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6 3728; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7 3729; AVX2-NEXT: vmovaps %ymm7, 
224(%rdi) 3730; AVX2-NEXT: vmovaps %ymm6, 192(%rdi) 3731; AVX2-NEXT: vmovaps %ymm5, 160(%rdi) 3732; AVX2-NEXT: vmovaps %ymm4, 128(%rdi) 3733; AVX2-NEXT: vmovaps %ymm3, 96(%rdi) 3734; AVX2-NEXT: vmovaps %ymm2, 64(%rdi) 3735; AVX2-NEXT: vmovaps %ymm1, 32(%rdi) 3736; AVX2-NEXT: vmovaps %ymm0, (%rdi) 3737; AVX2-NEXT: movq %rbp, %rsp 3738; AVX2-NEXT: popq %rbp 3739; AVX2-NEXT: vzeroupper 3740; AVX2-NEXT: retq 3741; 3742; AVX512F-LABEL: test_compress_large: 3743; AVX512F: # %bb.0: 3744; AVX512F-NEXT: pushq %rbp 3745; AVX512F-NEXT: movq %rsp, %rbp 3746; AVX512F-NEXT: andq $-64, %rsp 3747; AVX512F-NEXT: subq $640, %rsp # imm = 0x280 3748; AVX512F-NEXT: movzbl 352(%rbp), %eax 3749; AVX512F-NEXT: andl $1, %eax 3750; AVX512F-NEXT: kmovw %eax, %k0 3751; AVX512F-NEXT: movzbl 360(%rbp), %eax 3752; AVX512F-NEXT: kmovw %eax, %k1 3753; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3754; AVX512F-NEXT: kshiftrw $14, %k1, %k1 3755; AVX512F-NEXT: korw %k1, %k0, %k0 3756; AVX512F-NEXT: movw $-5, %ax 3757; AVX512F-NEXT: kmovw %eax, %k1 3758; AVX512F-NEXT: kandw %k1, %k0, %k0 3759; AVX512F-NEXT: kmovw %k1, %k3 3760; AVX512F-NEXT: movzbl 368(%rbp), %eax 3761; AVX512F-NEXT: kmovw %eax, %k1 3762; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3763; AVX512F-NEXT: kshiftrw $13, %k1, %k1 3764; AVX512F-NEXT: korw %k1, %k0, %k0 3765; AVX512F-NEXT: movw $-9, %ax 3766; AVX512F-NEXT: kmovw %eax, %k7 3767; AVX512F-NEXT: kandw %k7, %k0, %k0 3768; AVX512F-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3769; AVX512F-NEXT: movzbl 376(%rbp), %eax 3770; AVX512F-NEXT: kmovw %eax, %k1 3771; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3772; AVX512F-NEXT: kshiftrw $12, %k1, %k1 3773; AVX512F-NEXT: korw %k1, %k0, %k0 3774; AVX512F-NEXT: movw $-17, %ax 3775; AVX512F-NEXT: kmovw %eax, %k5 3776; AVX512F-NEXT: kandw %k5, %k0, %k0 3777; AVX512F-NEXT: movzbl 384(%rbp), %eax 3778; AVX512F-NEXT: kmovw %eax, %k1 3779; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3780; AVX512F-NEXT: kshiftrw $11, %k1, %k1 3781; AVX512F-NEXT: korw %k1, %k0, %k0 
3782; AVX512F-NEXT: movw $-33, %ax 3783; AVX512F-NEXT: kmovw %eax, %k6 3784; AVX512F-NEXT: kandw %k6, %k0, %k0 3785; AVX512F-NEXT: movzbl 392(%rbp), %eax 3786; AVX512F-NEXT: kmovw %eax, %k1 3787; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3788; AVX512F-NEXT: kshiftrw $10, %k1, %k1 3789; AVX512F-NEXT: korw %k1, %k0, %k0 3790; AVX512F-NEXT: movw $-65, %ax 3791; AVX512F-NEXT: kmovw %eax, %k1 3792; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3793; AVX512F-NEXT: kandw %k1, %k0, %k0 3794; AVX512F-NEXT: movzbl 400(%rbp), %eax 3795; AVX512F-NEXT: kmovw %eax, %k1 3796; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3797; AVX512F-NEXT: kshiftrw $9, %k1, %k1 3798; AVX512F-NEXT: korw %k1, %k0, %k0 3799; AVX512F-NEXT: movw $-129, %ax 3800; AVX512F-NEXT: kmovw %eax, %k1 3801; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3802; AVX512F-NEXT: kandw %k1, %k0, %k0 3803; AVX512F-NEXT: movzbl 408(%rbp), %eax 3804; AVX512F-NEXT: kmovw %eax, %k1 3805; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3806; AVX512F-NEXT: kshiftrw $8, %k1, %k1 3807; AVX512F-NEXT: korw %k1, %k0, %k0 3808; AVX512F-NEXT: movw $-257, %ax # imm = 0xFEFF 3809; AVX512F-NEXT: kmovw %eax, %k1 3810; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3811; AVX512F-NEXT: kandw %k1, %k0, %k0 3812; AVX512F-NEXT: movzbl 416(%rbp), %eax 3813; AVX512F-NEXT: kmovw %eax, %k1 3814; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3815; AVX512F-NEXT: kshiftrw $7, %k1, %k1 3816; AVX512F-NEXT: korw %k1, %k0, %k0 3817; AVX512F-NEXT: movw $-513, %ax # imm = 0xFDFF 3818; AVX512F-NEXT: kmovw %eax, %k1 3819; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3820; AVX512F-NEXT: kandw %k1, %k0, %k0 3821; AVX512F-NEXT: movzbl 424(%rbp), %eax 3822; AVX512F-NEXT: kmovw %eax, %k1 3823; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3824; AVX512F-NEXT: kshiftrw $6, %k1, %k1 3825; AVX512F-NEXT: korw %k1, %k0, %k0 3826; AVX512F-NEXT: movw $-1025, %ax # imm = 0xFBFF 3827; AVX512F-NEXT: kmovw %eax, %k1 3828; AVX512F-NEXT: 
kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3829; AVX512F-NEXT: kandw %k1, %k0, %k0 3830; AVX512F-NEXT: movzbl 432(%rbp), %eax 3831; AVX512F-NEXT: kmovw %eax, %k1 3832; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3833; AVX512F-NEXT: kshiftrw $5, %k1, %k1 3834; AVX512F-NEXT: korw %k1, %k0, %k0 3835; AVX512F-NEXT: movw $-2049, %ax # imm = 0xF7FF 3836; AVX512F-NEXT: kmovw %eax, %k1 3837; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3838; AVX512F-NEXT: kandw %k1, %k0, %k0 3839; AVX512F-NEXT: movzbl 440(%rbp), %eax 3840; AVX512F-NEXT: kmovw %eax, %k1 3841; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3842; AVX512F-NEXT: kshiftrw $4, %k1, %k1 3843; AVX512F-NEXT: korw %k1, %k0, %k0 3844; AVX512F-NEXT: movw $-4097, %ax # imm = 0xEFFF 3845; AVX512F-NEXT: kmovw %eax, %k1 3846; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3847; AVX512F-NEXT: kandw %k1, %k0, %k0 3848; AVX512F-NEXT: movzbl 448(%rbp), %eax 3849; AVX512F-NEXT: kmovw %eax, %k1 3850; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3851; AVX512F-NEXT: kshiftrw $3, %k1, %k1 3852; AVX512F-NEXT: korw %k1, %k0, %k0 3853; AVX512F-NEXT: movw $-8193, %ax # imm = 0xDFFF 3854; AVX512F-NEXT: kmovw %eax, %k1 3855; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3856; AVX512F-NEXT: kandw %k1, %k0, %k0 3857; AVX512F-NEXT: movzbl 456(%rbp), %eax 3858; AVX512F-NEXT: kmovw %eax, %k1 3859; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3860; AVX512F-NEXT: kshiftrw $2, %k1, %k1 3861; AVX512F-NEXT: korw %k1, %k0, %k1 3862; AVX512F-NEXT: movw $-16385, %ax # imm = 0xBFFF 3863; AVX512F-NEXT: kmovw %eax, %k4 3864; AVX512F-NEXT: kandw %k4, %k1, %k1 3865; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3866; AVX512F-NEXT: movzbl 464(%rbp), %eax 3867; AVX512F-NEXT: kmovw %eax, %k2 3868; AVX512F-NEXT: kshiftlw $14, %k2, %k2 3869; AVX512F-NEXT: korw %k2, %k1, %k1 3870; AVX512F-NEXT: kshiftlw $1, %k1, %k1 3871; AVX512F-NEXT: kshiftrw $1, %k1, %k1 3872; AVX512F-NEXT: movzbl 472(%rbp), %eax 3873; 
AVX512F-NEXT: kmovw %eax, %k2 3874; AVX512F-NEXT: kshiftlw $15, %k2, %k2 3875; AVX512F-NEXT: korw %k2, %k1, %k1 3876; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3877; AVX512F-NEXT: movzbl 224(%rbp), %eax 3878; AVX512F-NEXT: andl $1, %eax 3879; AVX512F-NEXT: movzbl 232(%rbp), %r10d 3880; AVX512F-NEXT: kmovw %r10d, %k1 3881; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3882; AVX512F-NEXT: kshiftrw $14, %k1, %k1 3883; AVX512F-NEXT: kmovw %eax, %k2 3884; AVX512F-NEXT: korw %k1, %k2, %k1 3885; AVX512F-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3886; AVX512F-NEXT: kandw %k3, %k1, %k1 3887; AVX512F-NEXT: movzbl 240(%rbp), %eax 3888; AVX512F-NEXT: kmovw %eax, %k2 3889; AVX512F-NEXT: kshiftlw $15, %k2, %k2 3890; AVX512F-NEXT: kshiftrw $13, %k2, %k2 3891; AVX512F-NEXT: korw %k2, %k1, %k1 3892; AVX512F-NEXT: kandw %k7, %k1, %k1 3893; AVX512F-NEXT: movzbl 248(%rbp), %eax 3894; AVX512F-NEXT: kmovw %eax, %k2 3895; AVX512F-NEXT: kshiftlw $15, %k2, %k2 3896; AVX512F-NEXT: kshiftrw $12, %k2, %k2 3897; AVX512F-NEXT: korw %k2, %k1, %k1 3898; AVX512F-NEXT: kandw %k5, %k1, %k1 3899; AVX512F-NEXT: movzbl 256(%rbp), %eax 3900; AVX512F-NEXT: kmovw %eax, %k2 3901; AVX512F-NEXT: kshiftlw $15, %k2, %k2 3902; AVX512F-NEXT: kshiftrw $11, %k2, %k2 3903; AVX512F-NEXT: korw %k2, %k1, %k1 3904; AVX512F-NEXT: kandw %k6, %k1, %k1 3905; AVX512F-NEXT: movzbl 264(%rbp), %eax 3906; AVX512F-NEXT: kmovw %eax, %k2 3907; AVX512F-NEXT: kshiftlw $15, %k2, %k2 3908; AVX512F-NEXT: kshiftrw $10, %k2, %k2 3909; AVX512F-NEXT: korw %k2, %k1, %k1 3910; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 3911; AVX512F-NEXT: kandw %k7, %k1, %k1 3912; AVX512F-NEXT: movzbl 272(%rbp), %eax 3913; AVX512F-NEXT: kmovw %eax, %k2 3914; AVX512F-NEXT: kshiftlw $15, %k2, %k2 3915; AVX512F-NEXT: kshiftrw $9, %k2, %k2 3916; AVX512F-NEXT: korw %k2, %k1, %k0 3917; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3918; AVX512F-NEXT: movzbl 280(%rbp), %eax 3919; AVX512F-NEXT: 
kmovw %eax, %k1 3920; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3921; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3922; AVX512F-NEXT: kshiftrw $8, %k1, %k1 3923; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3924; AVX512F-NEXT: kandw %k2, %k0, %k2 3925; AVX512F-NEXT: korw %k1, %k2, %k1 3926; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 3927; AVX512F-NEXT: kandw %k0, %k1, %k1 3928; AVX512F-NEXT: movzbl 288(%rbp), %eax 3929; AVX512F-NEXT: kmovw %eax, %k0 3930; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3931; AVX512F-NEXT: kshiftlw $15, %k0, %k2 3932; AVX512F-NEXT: kshiftrw $7, %k2, %k2 3933; AVX512F-NEXT: korw %k2, %k1, %k1 3934; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 3935; AVX512F-NEXT: kandw %k0, %k1, %k1 3936; AVX512F-NEXT: movzbl 296(%rbp), %eax 3937; AVX512F-NEXT: kmovw %eax, %k2 3938; AVX512F-NEXT: kshiftlw $15, %k2, %k0 3939; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3940; AVX512F-NEXT: kshiftrw $6, %k0, %k2 3941; AVX512F-NEXT: korw %k2, %k1, %k1 3942; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 3943; AVX512F-NEXT: kandw %k0, %k1, %k1 3944; AVX512F-NEXT: movzbl 304(%rbp), %eax 3945; AVX512F-NEXT: kmovw %eax, %k2 3946; AVX512F-NEXT: kshiftlw $15, %k2, %k0 3947; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3948; AVX512F-NEXT: kshiftrw $5, %k0, %k2 3949; AVX512F-NEXT: korw %k2, %k1, %k1 3950; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 3951; AVX512F-NEXT: kandw %k0, %k1, %k1 3952; AVX512F-NEXT: movzbl 312(%rbp), %eax 3953; AVX512F-NEXT: kmovw %eax, %k2 3954; AVX512F-NEXT: kshiftlw $15, %k2, %k0 3955; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3956; AVX512F-NEXT: kshiftrw $4, %k0, %k2 3957; AVX512F-NEXT: korw %k2, %k1, %k1 3958; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 3959; AVX512F-NEXT: kandw %k0, %k1, %k1 3960; AVX512F-NEXT: 
movzbl 320(%rbp), %eax 3961; AVX512F-NEXT: kmovw %eax, %k2 3962; AVX512F-NEXT: kshiftlw $15, %k2, %k0 3963; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3964; AVX512F-NEXT: kshiftrw $3, %k0, %k2 3965; AVX512F-NEXT: korw %k2, %k1, %k1 3966; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 3967; AVX512F-NEXT: kandw %k0, %k1, %k1 3968; AVX512F-NEXT: movzbl 328(%rbp), %eax 3969; AVX512F-NEXT: kmovw %eax, %k2 3970; AVX512F-NEXT: kshiftlw $15, %k2, %k2 3971; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3972; AVX512F-NEXT: kshiftrw $2, %k2, %k2 3973; AVX512F-NEXT: korw %k2, %k1, %k1 3974; AVX512F-NEXT: kandw %k4, %k1, %k1 3975; AVX512F-NEXT: movzbl 336(%rbp), %eax 3976; AVX512F-NEXT: kmovw %eax, %k2 3977; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3978; AVX512F-NEXT: kshiftlw $14, %k2, %k2 3979; AVX512F-NEXT: korw %k2, %k1, %k1 3980; AVX512F-NEXT: kshiftlw $1, %k1, %k1 3981; AVX512F-NEXT: kshiftrw $1, %k1, %k1 3982; AVX512F-NEXT: movzbl 344(%rbp), %eax 3983; AVX512F-NEXT: kmovw %eax, %k2 3984; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3985; AVX512F-NEXT: kshiftlw $15, %k2, %k2 3986; AVX512F-NEXT: korw %k2, %k1, %k1 3987; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3988; AVX512F-NEXT: movzbl 96(%rbp), %eax 3989; AVX512F-NEXT: andl $1, %eax 3990; AVX512F-NEXT: movzbl 104(%rbp), %r10d 3991; AVX512F-NEXT: kmovw %r10d, %k1 3992; AVX512F-NEXT: kshiftlw $15, %k1, %k1 3993; AVX512F-NEXT: kshiftrw $14, %k1, %k1 3994; AVX512F-NEXT: kmovw %eax, %k2 3995; AVX512F-NEXT: korw %k1, %k2, %k1 3996; AVX512F-NEXT: kandw %k3, %k1, %k1 3997; AVX512F-NEXT: movzbl 112(%rbp), %eax 3998; AVX512F-NEXT: kmovw %eax, %k2 3999; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4000; AVX512F-NEXT: kshiftrw $13, %k2, %k2 4001; AVX512F-NEXT: korw %k2, %k1, %k1 4002; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4003; AVX512F-NEXT: kandw %k4, %k1, %k1 4004; AVX512F-NEXT: movzbl 
120(%rbp), %eax 4005; AVX512F-NEXT: kmovw %eax, %k2 4006; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4007; AVX512F-NEXT: kshiftrw $12, %k2, %k2 4008; AVX512F-NEXT: korw %k2, %k1, %k1 4009; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4010; AVX512F-NEXT: kandw %k5, %k1, %k1 4011; AVX512F-NEXT: movzbl 128(%rbp), %eax 4012; AVX512F-NEXT: kmovw %eax, %k2 4013; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4014; AVX512F-NEXT: kshiftrw $11, %k2, %k2 4015; AVX512F-NEXT: korw %k2, %k1, %k1 4016; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4017; AVX512F-NEXT: kandw %k6, %k1, %k1 4018; AVX512F-NEXT: movzbl 136(%rbp), %eax 4019; AVX512F-NEXT: kmovw %eax, %k2 4020; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4021; AVX512F-NEXT: kshiftrw $10, %k2, %k2 4022; AVX512F-NEXT: korw %k2, %k1, %k1 4023; AVX512F-NEXT: kandw %k7, %k1, %k1 4024; AVX512F-NEXT: movzbl 144(%rbp), %eax 4025; AVX512F-NEXT: kmovw %eax, %k2 4026; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4027; AVX512F-NEXT: kshiftrw $9, %k2, %k2 4028; AVX512F-NEXT: korw %k2, %k1, %k1 4029; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4030; AVX512F-NEXT: kandw %k2, %k1, %k1 4031; AVX512F-NEXT: movzbl 152(%rbp), %eax 4032; AVX512F-NEXT: kmovw %eax, %k2 4033; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4034; AVX512F-NEXT: kshiftrw $8, %k2, %k2 4035; AVX512F-NEXT: korw %k2, %k1, %k1 4036; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4037; AVX512F-NEXT: kandw %k3, %k1, %k1 4038; AVX512F-NEXT: movzbl 160(%rbp), %eax 4039; AVX512F-NEXT: kmovw %eax, %k2 4040; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4041; AVX512F-NEXT: kshiftrw $7, %k2, %k2 4042; AVX512F-NEXT: korw %k2, %k1, %k1 4043; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4044; AVX512F-NEXT: kandw %k7, %k1, %k1 4045; AVX512F-NEXT: movzbl 168(%rbp), %eax 4046; AVX512F-NEXT: kmovw %eax, %k2 4047; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4048; AVX512F-NEXT: kshiftrw $6, %k2, %k2 4049; AVX512F-NEXT: korw %k2, %k1, %k1 
4050; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4051; AVX512F-NEXT: kandw %k2, %k1, %k1 4052; AVX512F-NEXT: movzbl 176(%rbp), %eax 4053; AVX512F-NEXT: kmovw %eax, %k2 4054; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4055; AVX512F-NEXT: kshiftrw $5, %k2, %k2 4056; AVX512F-NEXT: korw %k2, %k1, %k1 4057; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4058; AVX512F-NEXT: kandw %k2, %k1, %k1 4059; AVX512F-NEXT: movzbl 184(%rbp), %eax 4060; AVX512F-NEXT: kmovw %eax, %k2 4061; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4062; AVX512F-NEXT: kshiftrw $4, %k2, %k2 4063; AVX512F-NEXT: korw %k2, %k1, %k1 4064; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4065; AVX512F-NEXT: kandw %k2, %k1, %k1 4066; AVX512F-NEXT: movzbl 192(%rbp), %eax 4067; AVX512F-NEXT: kmovw %eax, %k2 4068; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4069; AVX512F-NEXT: kshiftrw $3, %k2, %k2 4070; AVX512F-NEXT: korw %k2, %k1, %k1 4071; AVX512F-NEXT: kandw %k0, %k1, %k1 4072; AVX512F-NEXT: movzbl 200(%rbp), %eax 4073; AVX512F-NEXT: kmovw %eax, %k2 4074; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4075; AVX512F-NEXT: kshiftrw $2, %k2, %k2 4076; AVX512F-NEXT: korw %k2, %k1, %k1 4077; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4078; AVX512F-NEXT: kandw %k2, %k1, %k1 4079; AVX512F-NEXT: movzbl 208(%rbp), %eax 4080; AVX512F-NEXT: kmovw %eax, %k2 4081; AVX512F-NEXT: kshiftlw $14, %k2, %k2 4082; AVX512F-NEXT: korw %k2, %k1, %k1 4083; AVX512F-NEXT: kshiftlw $1, %k1, %k1 4084; AVX512F-NEXT: kshiftrw $1, %k1, %k1 4085; AVX512F-NEXT: movzbl 216(%rbp), %eax 4086; AVX512F-NEXT: kmovw %eax, %k2 4087; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4088; AVX512F-NEXT: korw %k2, %k1, %k1 4089; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4090; AVX512F-NEXT: andl $1, %edi 4091; AVX512F-NEXT: kmovw %esi, %k1 4092; AVX512F-NEXT: kshiftlw $15, %k1, %k1 4093; AVX512F-NEXT: kshiftrw $14, %k1, %k1 4094; AVX512F-NEXT: kmovw %edi, %k2 4095; AVX512F-NEXT: korw %k1, 
%k2, %k1 4096; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4097; AVX512F-NEXT: kandw %k2, %k1, %k1 4098; AVX512F-NEXT: kmovw %edx, %k2 4099; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4100; AVX512F-NEXT: kshiftrw $13, %k2, %k2 4101; AVX512F-NEXT: korw %k2, %k1, %k1 4102; AVX512F-NEXT: kandw %k4, %k1, %k1 4103; AVX512F-NEXT: kmovw %ecx, %k2 4104; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4105; AVX512F-NEXT: kshiftrw $12, %k2, %k2 4106; AVX512F-NEXT: korw %k2, %k1, %k1 4107; AVX512F-NEXT: kandw %k5, %k1, %k1 4108; AVX512F-NEXT: kmovw %r8d, %k2 4109; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4110; AVX512F-NEXT: kshiftrw $11, %k2, %k2 4111; AVX512F-NEXT: korw %k2, %k1, %k1 4112; AVX512F-NEXT: kandw %k6, %k1, %k1 4113; AVX512F-NEXT: kmovw %r9d, %k2 4114; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4115; AVX512F-NEXT: kshiftrw $10, %k2, %k2 4116; AVX512F-NEXT: korw %k2, %k1, %k1 4117; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4118; AVX512F-NEXT: kandw %k2, %k1, %k1 4119; AVX512F-NEXT: movzbl 16(%rbp), %eax 4120; AVX512F-NEXT: kmovw %eax, %k2 4121; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4122; AVX512F-NEXT: kshiftrw $9, %k2, %k2 4123; AVX512F-NEXT: korw %k2, %k1, %k2 4124; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4125; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4126; AVX512F-NEXT: kandw %k1, %k2, %k1 4127; AVX512F-NEXT: movzbl 24(%rbp), %eax 4128; AVX512F-NEXT: kmovw %eax, %k2 4129; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4130; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4131; AVX512F-NEXT: kshiftrw $8, %k2, %k2 4132; AVX512F-NEXT: korw %k2, %k1, %k1 4133; AVX512F-NEXT: kandw %k3, %k1, %k1 4134; AVX512F-NEXT: movzbl 32(%rbp), %eax 4135; AVX512F-NEXT: kmovw %eax, %k2 4136; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4137; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4138; AVX512F-NEXT: kshiftrw $7, %k2, %k2 4139; AVX512F-NEXT: korw %k2, %k1, %k1 4140; AVX512F-NEXT: kandw %k7, 
%k1, %k1 4141; AVX512F-NEXT: movzbl 40(%rbp), %eax 4142; AVX512F-NEXT: kmovw %eax, %k2 4143; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4144; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4145; AVX512F-NEXT: kshiftrw $6, %k2, %k2 4146; AVX512F-NEXT: korw %k2, %k1, %k1 4147; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4148; AVX512F-NEXT: kandw %k2, %k1, %k1 4149; AVX512F-NEXT: movzbl 48(%rbp), %eax 4150; AVX512F-NEXT: kmovw %eax, %k2 4151; AVX512F-NEXT: kshiftlw $15, %k2, %k5 4152; AVX512F-NEXT: kshiftrw $5, %k5, %k2 4153; AVX512F-NEXT: korw %k2, %k1, %k1 4154; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4155; AVX512F-NEXT: kandw %k2, %k1, %k1 4156; AVX512F-NEXT: movzbl 56(%rbp), %eax 4157; AVX512F-NEXT: kmovw %eax, %k2 4158; AVX512F-NEXT: kshiftlw $15, %k2, %k4 4159; AVX512F-NEXT: kshiftrw $4, %k4, %k2 4160; AVX512F-NEXT: korw %k2, %k1, %k1 4161; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4162; AVX512F-NEXT: kandw %k2, %k1, %k1 4163; AVX512F-NEXT: movzbl 64(%rbp), %eax 4164; AVX512F-NEXT: kmovw %eax, %k2 4165; AVX512F-NEXT: kshiftlw $15, %k2, %k3 4166; AVX512F-NEXT: kshiftrw $3, %k3, %k2 4167; AVX512F-NEXT: korw %k2, %k1, %k1 4168; AVX512F-NEXT: kandw %k0, %k1, %k1 4169; AVX512F-NEXT: movzbl 72(%rbp), %eax 4170; AVX512F-NEXT: kmovw %eax, %k2 4171; AVX512F-NEXT: kshiftlw $15, %k2, %k2 4172; AVX512F-NEXT: kshiftrw $2, %k2, %k0 4173; AVX512F-NEXT: korw %k0, %k1, %k0 4174; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4175; AVX512F-NEXT: kandw %k1, %k0, %k0 4176; AVX512F-NEXT: movzbl 80(%rbp), %eax 4177; AVX512F-NEXT: kmovw %eax, %k1 4178; AVX512F-NEXT: kshiftlw $14, %k1, %k7 4179; AVX512F-NEXT: korw %k7, %k0, %k0 4180; AVX512F-NEXT: kshiftlw $1, %k0, %k0 4181; AVX512F-NEXT: kshiftrw $1, %k0, %k7 4182; AVX512F-NEXT: movzbl 88(%rbp), %eax 4183; AVX512F-NEXT: kmovw %eax, %k0 4184; AVX512F-NEXT: kshiftlw $15, %k0, %k6 4185; AVX512F-NEXT: korw %k6, %k7, %k6 4186; 
AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4187; AVX512F-NEXT: movw $-3, %ax 4188; AVX512F-NEXT: kmovw %eax, %k6 4189; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4190; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4191; AVX512F-NEXT: kandw %k6, %k7, %k6 4192; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4193; AVX512F-NEXT: kshiftrw $14, %k7, %k7 4194; AVX512F-NEXT: korw %k7, %k6, %k6 4195; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4196; AVX512F-NEXT: kandw %k7, %k6, %k6 4197; AVX512F-NEXT: kshiftrw $13, %k5, %k5 4198; AVX512F-NEXT: korw %k5, %k6, %k5 4199; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4200; AVX512F-NEXT: kandw %k6, %k5, %k5 4201; AVX512F-NEXT: kshiftrw $12, %k4, %k4 4202; AVX512F-NEXT: korw %k4, %k5, %k4 4203; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 4204; AVX512F-NEXT: kandw %k5, %k4, %k4 4205; AVX512F-NEXT: kshiftrw $11, %k3, %k3 4206; AVX512F-NEXT: korw %k3, %k4, %k3 4207; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4208; AVX512F-NEXT: kandw %k4, %k3, %k3 4209; AVX512F-NEXT: kshiftrw $10, %k2, %k2 4210; AVX512F-NEXT: korw %k2, %k3, %k2 4211; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4212; AVX512F-NEXT: kandw %k3, %k2, %k2 4213; AVX512F-NEXT: kshiftlw $6, %k1, %k1 4214; AVX512F-NEXT: korw %k1, %k2, %k1 4215; AVX512F-NEXT: kshiftlw $9, %k1, %k1 4216; AVX512F-NEXT: kshiftrw $9, %k1, %k1 4217; AVX512F-NEXT: kshiftlw $7, %k0, %k0 4218; AVX512F-NEXT: korw %k0, %k1, %k0 4219; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4220; AVX512F-NEXT: kshiftlw $9, %k1, %k1 4221; AVX512F-NEXT: kshiftrw $9, %k1, %k1 4222; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4223; AVX512F-NEXT: kshiftlw $7, %k2, %k2 4224; AVX512F-NEXT: korw %k2, %k1, %k1 4225; AVX512F-NEXT: kxorw %k0, %k1, %k0 4226; AVX512F-NEXT: kshiftrw $4, %k0, %k1 
4227; AVX512F-NEXT: kxorw %k1, %k0, %k0 4228; AVX512F-NEXT: kshiftrw $2, %k0, %k1 4229; AVX512F-NEXT: kxorw %k1, %k0, %k0 4230; AVX512F-NEXT: kshiftrw $1, %k0, %k1 4231; AVX512F-NEXT: kxorw %k1, %k0, %k0 4232; AVX512F-NEXT: kmovw %k0, %eax 4233; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4234; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k1} {z} 4235; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 4236; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4237; AVX512F-NEXT: kandw %k1, %k0, %k0 4238; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4239; AVX512F-NEXT: kshiftrw $14, %k1, %k1 4240; AVX512F-NEXT: korw %k1, %k0, %k0 4241; AVX512F-NEXT: kandw %k7, %k0, %k0 4242; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4243; AVX512F-NEXT: kshiftrw $13, %k1, %k1 4244; AVX512F-NEXT: korw %k1, %k0, %k0 4245; AVX512F-NEXT: kandw %k6, %k0, %k0 4246; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4247; AVX512F-NEXT: kshiftrw $12, %k1, %k1 4248; AVX512F-NEXT: korw %k1, %k0, %k0 4249; AVX512F-NEXT: kandw %k5, %k0, %k0 4250; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4251; AVX512F-NEXT: kshiftrw $11, %k1, %k1 4252; AVX512F-NEXT: korw %k1, %k0, %k0 4253; AVX512F-NEXT: kandw %k4, %k0, %k0 4254; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4255; AVX512F-NEXT: kshiftrw $10, %k1, %k1 4256; AVX512F-NEXT: korw %k1, %k0, %k0 4257; AVX512F-NEXT: kandw %k3, %k0, %k0 4258; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4259; AVX512F-NEXT: kshiftlw $6, %k1, %k1 4260; AVX512F-NEXT: korw %k1, %k0, %k0 4261; AVX512F-NEXT: kshiftlw $9, %k0, %k0 4262; AVX512F-NEXT: kshiftrw $9, %k0, %k0 4263; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4264; AVX512F-NEXT: kshiftlw $7, %k1, %k1 4265; AVX512F-NEXT: korw %k1, %k0, %k0 4266; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4267; 
AVX512F-NEXT: kshiftlw $9, %k1, %k1 4268; AVX512F-NEXT: kshiftrw $9, %k1, %k1 4269; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4270; AVX512F-NEXT: kshiftlw $7, %k2, %k2 4271; AVX512F-NEXT: korw %k2, %k1, %k1 4272; AVX512F-NEXT: kxorw %k0, %k1, %k0 4273; AVX512F-NEXT: kshiftrw $4, %k0, %k1 4274; AVX512F-NEXT: kxorw %k1, %k0, %k0 4275; AVX512F-NEXT: kshiftrw $2, %k0, %k1 4276; AVX512F-NEXT: kxorw %k1, %k0, %k0 4277; AVX512F-NEXT: kshiftrw $1, %k0, %k1 4278; AVX512F-NEXT: kxorw %k1, %k0, %k0 4279; AVX512F-NEXT: kmovw %k0, %ecx 4280; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4281; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k1} {z} 4282; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4283; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k2} {z} 4284; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4285; AVX512F-NEXT: vpcompressd %zmm1, %zmm1 {%k1} {z} 4286; AVX512F-NEXT: kxorw %k1, %k2, %k0 4287; AVX512F-NEXT: kshiftrw $8, %k0, %k1 4288; AVX512F-NEXT: kxorw %k1, %k0, %k0 4289; AVX512F-NEXT: kshiftrw $4, %k0, %k1 4290; AVX512F-NEXT: kxorw %k1, %k0, %k0 4291; AVX512F-NEXT: kshiftrw $2, %k0, %k1 4292; AVX512F-NEXT: kxorw %k1, %k0, %k0 4293; AVX512F-NEXT: kshiftrw $1, %k0, %k1 4294; AVX512F-NEXT: kxorw %k1, %k0, %k0 4295; AVX512F-NEXT: kmovw %k0, %edx 4296; AVX512F-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) 4297; AVX512F-NEXT: andl $31, %eax 4298; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rsp,%rax,4) 4299; AVX512F-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp) 4300; AVX512F-NEXT: andl $31, %ecx 4301; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsp,%rcx,4) 4302; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 4303; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1 4304; AVX512F-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) 4305; AVX512F-NEXT: andl $63, %edx 4306; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 4307; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2 4308; AVX512F-NEXT: vmovaps %zmm0, 320(%rsp,%rdx,4) 4309; AVX512F-NEXT: vmovaps 
%zmm1, {{[0-9]+}}(%rsp) 4310; AVX512F-NEXT: vmovaps %zmm2, 384(%rsp,%rdx,4) 4311; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 4312; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1 4313; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2 4314; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm3 4315; AVX512F-NEXT: movq %rbp, %rsp 4316; AVX512F-NEXT: popq %rbp 4317; AVX512F-NEXT: retq 4318; 4319; AVX512VL-LABEL: test_compress_large: 4320; AVX512VL: # %bb.0: 4321; AVX512VL-NEXT: pushq %rbp 4322; AVX512VL-NEXT: movq %rsp, %rbp 4323; AVX512VL-NEXT: andq $-64, %rsp 4324; AVX512VL-NEXT: subq $576, %rsp # imm = 0x240 4325; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0 4326; AVX512VL-NEXT: vpmovb2m %zmm0, %k1 4327; AVX512VL-NEXT: kshiftrq $48, %k1, %k3 4328; AVX512VL-NEXT: kshiftrq $32, %k1, %k4 4329; AVX512VL-NEXT: kshiftrq $16, %k1, %k2 4330; AVX512VL-NEXT: vpcompressd %zmm1, %zmm0 {%k1} {z} 4331; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp) 4332; AVX512VL-NEXT: kshiftrq $8, %k1, %k0 4333; AVX512VL-NEXT: kxorw %k0, %k1, %k0 4334; AVX512VL-NEXT: kshiftrw $4, %k0, %k5 4335; AVX512VL-NEXT: kxorw %k5, %k0, %k0 4336; AVX512VL-NEXT: kshiftrw $2, %k0, %k5 4337; AVX512VL-NEXT: kxorw %k5, %k0, %k0 4338; AVX512VL-NEXT: kshiftrw $1, %k0, %k5 4339; AVX512VL-NEXT: kxorw %k5, %k0, %k0 4340; AVX512VL-NEXT: kmovd %k0, %eax 4341; AVX512VL-NEXT: andl $31, %eax 4342; AVX512VL-NEXT: vpcompressd %zmm2, %zmm0 {%k2} {z} 4343; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp,%rax,4) 4344; AVX512VL-NEXT: vpcompressd %zmm3, %zmm0 {%k4} {z} 4345; AVX512VL-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) 4346; AVX512VL-NEXT: kshiftrq $40, %k1, %k0 4347; AVX512VL-NEXT: kxorw %k0, %k4, %k0 4348; AVX512VL-NEXT: kshiftrw $4, %k0, %k4 4349; AVX512VL-NEXT: kxorw %k4, %k0, %k0 4350; AVX512VL-NEXT: kshiftrw $2, %k0, %k4 4351; AVX512VL-NEXT: kxorw %k4, %k0, %k0 4352; AVX512VL-NEXT: kshiftrw $1, %k0, %k4 4353; AVX512VL-NEXT: kxorw %k4, %k0, %k0 4354; AVX512VL-NEXT: kmovd %k0, %eax 4355; AVX512VL-NEXT: andl $31, %eax 4356; AVX512VL-NEXT: vpcompressd 
%zmm4, %zmm0 {%k3} {z} 4357; AVX512VL-NEXT: vmovdqa64 %zmm0, 128(%rsp,%rax,4) 4358; AVX512VL-NEXT: vmovaps (%rsp), %zmm0 4359; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1 4360; AVX512VL-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) 4361; AVX512VL-NEXT: kxorw %k2, %k1, %k0 4362; AVX512VL-NEXT: kshiftrw $8, %k0, %k1 4363; AVX512VL-NEXT: kxorw %k1, %k0, %k0 4364; AVX512VL-NEXT: kshiftrw $4, %k0, %k1 4365; AVX512VL-NEXT: kxorw %k1, %k0, %k0 4366; AVX512VL-NEXT: kshiftrw $2, %k0, %k1 4367; AVX512VL-NEXT: kxorw %k1, %k0, %k0 4368; AVX512VL-NEXT: kshiftrw $1, %k0, %k1 4369; AVX512VL-NEXT: kxorw %k1, %k0, %k0 4370; AVX512VL-NEXT: kmovd %k0, %eax 4371; AVX512VL-NEXT: andl $63, %eax 4372; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 4373; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2 4374; AVX512VL-NEXT: vmovaps %zmm0, 256(%rsp,%rax,4) 4375; AVX512VL-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) 4376; AVX512VL-NEXT: vmovaps %zmm2, 320(%rsp,%rax,4) 4377; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 4378; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1 4379; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2 4380; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm3 4381; AVX512VL-NEXT: movq %rbp, %rsp 4382; AVX512VL-NEXT: popq %rbp 4383; AVX512VL-NEXT: retq 4384 %out = call <64 x i32> @llvm.experimental.vector.compress(<64 x i32> %vec, <64 x i1> %mask, <64 x i32> undef) 4385 ret <64 x i32> %out 4386} 4387 4388define <4 x i32> @test_compress_all_const() nounwind { 4389; AVX2-LABEL: test_compress_all_const: 4390; AVX2: # %bb.0: 4391; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = [5,9,0,0] 4392; AVX2-NEXT: retq 4393; 4394; AVX512F-LABEL: test_compress_all_const: 4395; AVX512F: # %bb.0: 4396; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,9,0,0] 4397; AVX512F-NEXT: retq 4398; 4399; AVX512VL-LABEL: test_compress_all_const: 4400; AVX512VL: # %bb.0: 4401; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,9,0,0] 4402; AVX512VL-NEXT: retq 4403 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> <i32 3, i32 5, 
i32 7, i32 9>, 4404 <4 x i1> <i1 0, i1 1, i1 0, i1 1>, 4405 <4 x i32> undef) 4406 ret <4 x i32> %out 4407} 4408 4409define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) nounwind { 4410; CHECK-LABEL: test_compress_const_mask: 4411; CHECK: # %bb.0: 4412; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 4413; CHECK-NEXT: retq 4414 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 undef, i1 0, i1 1>, <4 x i32> undef) 4415 ret <4 x i32> %out 4416} 4417 4418define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) nounwind { 4419; CHECK-LABEL: test_compress_const_mask_passthrough: 4420; CHECK: # %bb.0: 4421; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,3] 4422; CHECK-NEXT: retq 4423 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 undef, i1 0, i1 1>, <4 x i32> %passthru) 4424 ret <4 x i32> %out 4425} 4426 4427define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) nounwind { 4428; CHECK-LABEL: test_compress_const_mask_const_passthrough: 4429; CHECK: # %bb.0: 4430; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 4431; CHECK-NEXT: movl $7, %eax 4432; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 4433; CHECK-NEXT: movl $8, %eax 4434; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 4435; CHECK-NEXT: retq 4436 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i32> <i32 5, i32 6, i32 7, i32 8>) 4437 ret <4 x i32> %out 4438} 4439 4440; The const_splat*_mask and undef_mask tests below take a leading placeholder argument (%ignore) so that %vec does not arrive in the return register; this lets us check that they are converted to a no-op by simply copying 4441; the relevant vector input register (%vec or %passthru) to the return register or doing nothing. 
; Constant/undef-mask folds. The expected output is the same for every llc configuration in this file, so these
; tests use the shared check prefix instead of the per-feature ones:
;   all-ones mask                 -> %vec is returned as-is (a single register copy into the return register).
;   all-zeros or undef mask       -> nothing but the return is emitted when the passthru is undef.
;   all-zeros mask with %passthru -> %passthru is returned (a single register copy into the return register).
; NOTE(review): test_compress_const_splat0_mask and test_compress_const_splat0_mask_without_passthru have the same
; arguments, the same call and the same expected output; one of them looks redundant - confirm before removing.
4442define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> %vec) nounwind { 4443; CHECK-LABEL: test_compress_const_splat1_mask: 4444; CHECK: # %bb.0: 4445; CHECK-NEXT: vmovaps %xmm1, %xmm0 4446; CHECK-NEXT: retq 4447 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1), <4 x i32> undef) 4448 ret <4 x i32> %out 4449} 4450define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) nounwind { 4451; CHECK-LABEL: test_compress_const_splat0_mask: 4452; CHECK: # %bb.0: 4453; CHECK-NEXT: retq 4454 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) 4455 ret <4 x i32> %out 4456} 4457define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) nounwind { 4458; CHECK-LABEL: test_compress_undef_mask: 4459; CHECK: # %bb.0: 4460; CHECK-NEXT: retq 4461 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> undef, <4 x i32> undef) 4462 ret <4 x i32> %out 4463} 4464define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignore, <4 x i32> %vec, <4 x i32> %passthru) nounwind { 4465; CHECK-LABEL: test_compress_const_splat0_mask_with_passthru: 4466; CHECK: # %bb.0: 4467; CHECK-NEXT: vmovaps %xmm2, %xmm0 4468; CHECK-NEXT: retq 4469 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> %passthru) 4470 ret <4 x i32> %out 4471} 4472define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> %ignore, <4 x i32> %vec) nounwind { 4473; CHECK-LABEL: test_compress_const_splat0_mask_without_passthru: 4474; CHECK: # %bb.0: 4475; CHECK-NEXT: retq 4476 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) 4477 ret <4 x i32> %out 4478} 4479 4480define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind { 4481; AVX2-LABEL: test_compress_small: 
4482; AVX2: # %bb.0: 4483; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4484; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 4485; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 4486; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 4487; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 4488; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp) 4489; AVX2-NEXT: vmovd %xmm1, %eax 4490; AVX2-NEXT: andl $1, %eax 4491; AVX2-NEXT: vpextrb $1, %xmm0, -24(%rsp,%rax) 4492; AVX2-NEXT: vpextrb $1, %xmm1, %ecx 4493; AVX2-NEXT: andl $1, %ecx 4494; AVX2-NEXT: addq %rax, %rcx 4495; AVX2-NEXT: vpextrb $2, %xmm0, -24(%rsp,%rcx) 4496; AVX2-NEXT: vpextrb $2, %xmm1, %eax 4497; AVX2-NEXT: andl $1, %eax 4498; AVX2-NEXT: addq %rcx, %rax 4499; AVX2-NEXT: vpextrb $3, %xmm0, -24(%rsp,%rax) 4500; AVX2-NEXT: vpextrb $3, %xmm1, %ecx 4501; AVX2-NEXT: andl $1, %ecx 4502; AVX2-NEXT: addq %rax, %rcx 4503; AVX2-NEXT: vpextrb $4, %xmm0, -24(%rsp,%rcx) 4504; AVX2-NEXT: vpextrb $4, %xmm1, %eax 4505; AVX2-NEXT: andl $1, %eax 4506; AVX2-NEXT: addq %rcx, %rax 4507; AVX2-NEXT: vpextrb $5, %xmm1, %ecx 4508; AVX2-NEXT: andl $1, %ecx 4509; AVX2-NEXT: addq %rax, %rcx 4510; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 4511; AVX2-NEXT: andl $15, %eax 4512; AVX2-NEXT: vpextrb $5, %xmm0, -24(%rsp,%rax) 4513; AVX2-NEXT: vpextrb $6, %xmm1, %eax 4514; AVX2-NEXT: andl $1, %eax 4515; AVX2-NEXT: addq %rcx, %rax 4516; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 4517; AVX2-NEXT: andl $15, %ecx 4518; AVX2-NEXT: vpextrb $6, %xmm0, -24(%rsp,%rcx) 4519; AVX2-NEXT: vpextrb $7, %xmm1, %ecx 4520; AVX2-NEXT: andl $1, %ecx 4521; AVX2-NEXT: addq %rax, %rcx 4522; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 4523; AVX2-NEXT: andl $15, %eax 4524; AVX2-NEXT: vpextrb $7, %xmm0, -24(%rsp,%rax) 4525; AVX2-NEXT: vpextrb $8, %xmm1, %eax 4526; AVX2-NEXT: andl $1, %eax 4527; AVX2-NEXT: addq %rcx, %rax 4528; AVX2-NEXT: # kill: def $ecx killed $ecx 
killed $rcx def $rcx 4529; AVX2-NEXT: andl $15, %ecx 4530; AVX2-NEXT: vpextrb $8, %xmm0, -24(%rsp,%rcx) 4531; AVX2-NEXT: vpextrb $9, %xmm1, %ecx 4532; AVX2-NEXT: andl $1, %ecx 4533; AVX2-NEXT: addq %rax, %rcx 4534; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 4535; AVX2-NEXT: andl $15, %eax 4536; AVX2-NEXT: vpextrb $9, %xmm0, -24(%rsp,%rax) 4537; AVX2-NEXT: vpextrb $10, %xmm1, %eax 4538; AVX2-NEXT: andl $1, %eax 4539; AVX2-NEXT: addq %rcx, %rax 4540; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 4541; AVX2-NEXT: andl $15, %ecx 4542; AVX2-NEXT: vpextrb $10, %xmm0, -24(%rsp,%rcx) 4543; AVX2-NEXT: vpextrb $11, %xmm1, %ecx 4544; AVX2-NEXT: andl $1, %ecx 4545; AVX2-NEXT: addq %rax, %rcx 4546; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 4547; AVX2-NEXT: andl $15, %eax 4548; AVX2-NEXT: vpextrb $11, %xmm0, -24(%rsp,%rax) 4549; AVX2-NEXT: vpextrb $12, %xmm1, %eax 4550; AVX2-NEXT: andl $1, %eax 4551; AVX2-NEXT: addq %rcx, %rax 4552; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 4553; AVX2-NEXT: andl $15, %ecx 4554; AVX2-NEXT: vpextrb $12, %xmm0, -24(%rsp,%rcx) 4555; AVX2-NEXT: vpextrb $13, %xmm1, %ecx 4556; AVX2-NEXT: andl $1, %ecx 4557; AVX2-NEXT: addq %rax, %rcx 4558; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax 4559; AVX2-NEXT: andl $15, %eax 4560; AVX2-NEXT: vpextrb $13, %xmm0, -24(%rsp,%rax) 4561; AVX2-NEXT: vpextrb $14, %xmm1, %eax 4562; AVX2-NEXT: andl $1, %eax 4563; AVX2-NEXT: addl %ecx, %eax 4564; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx 4565; AVX2-NEXT: andl $15, %ecx 4566; AVX2-NEXT: vpextrb $14, %xmm0, -24(%rsp,%rcx) 4567; AVX2-NEXT: andl $15, %eax 4568; AVX2-NEXT: vpextrb $15, %xmm0, -24(%rsp,%rax) 4569; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 4570; AVX2-NEXT: retq 4571; 4572; AVX512F-LABEL: test_compress_small: 4573; AVX512F: # %bb.0: 4574; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1 4575; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 4576; AVX512F-NEXT: kshiftlw $12, 
%k0, %k0 4577; AVX512F-NEXT: kshiftrw $12, %k0, %k1 4578; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 4579; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z} 4580; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 4581; AVX512F-NEXT: vzeroupper 4582; AVX512F-NEXT: retq 4583; 4584; AVX512VL-LABEL: test_compress_small: 4585; AVX512VL: # %bb.0: 4586; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1 4587; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 4588; AVX512VL-NEXT: vpcompressb %xmm0, %xmm0 {%k1} {z} 4589; AVX512VL-NEXT: retq 4590 %out = call <4 x i8> @llvm.experimental.vector.compress(<4 x i8> %vec, <4 x i1> %mask, <4 x i8> undef) 4591 ret <4 x i8> %out 4592} 4593 4594define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mask) nounwind { 4595; AVX2-LABEL: test_compress_illegal_element_type: 4596; AVX2: # %bb.0: 4597; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 4598; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 4599; AVX2-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) 4600; AVX2-NEXT: vmovd %xmm1, %eax 4601; AVX2-NEXT: andl $1, %eax 4602; AVX2-NEXT: vextractps $1, %xmm0, -24(%rsp,%rax,4) 4603; AVX2-NEXT: vpextrd $1, %xmm1, %ecx 4604; AVX2-NEXT: subl %ecx, %eax 4605; AVX2-NEXT: leal (,%rax,4), %ecx 4606; AVX2-NEXT: vextractps $2, %xmm0, -24(%rsp,%rcx) 4607; AVX2-NEXT: vpextrd $2, %xmm1, %ecx 4608; AVX2-NEXT: subl %ecx, %eax 4609; AVX2-NEXT: andl $3, %eax 4610; AVX2-NEXT: vextractps $3, %xmm0, -24(%rsp,%rax,4) 4611; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 4612; AVX2-NEXT: retq 4613; 4614; AVX512F-LABEL: test_compress_illegal_element_type: 4615; AVX512F: # %bb.0: 4616; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 4617; 
AVX512F-NEXT: vpslld $31, %xmm1, %xmm1 4618; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 4619; AVX512F-NEXT: kshiftlw $12, %k0, %k0 4620; AVX512F-NEXT: kshiftrw $12, %k0, %k1 4621; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z} 4622; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 4623; AVX512F-NEXT: vzeroupper 4624; AVX512F-NEXT: retq 4625; 4626; AVX512VL-LABEL: test_compress_illegal_element_type: 4627; AVX512VL: # %bb.0: 4628; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1 4629; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 4630; AVX512VL-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z} 4631; AVX512VL-NEXT: retq 4632 %out = call <4 x i4> @llvm.experimental.vector.compress(<4 x i4> %vec, <4 x i1> %mask, <4 x i4> undef) 4633 ret <4 x i4> %out 4634} 4635 4636define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) nounwind { 4637; AVX2-LABEL: test_compress_narrow: 4638; AVX2: # %bb.0: 4639; AVX2-NEXT: vmovd %edi, %xmm1 4640; AVX2-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 4641; AVX2-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 4642; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 4643; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 4644; AVX2-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) 4645; AVX2-NEXT: vmovd %xmm1, %eax 4646; AVX2-NEXT: andl $1, %eax 4647; AVX2-NEXT: vextractps $1, %xmm0, -24(%rsp,%rax,4) 4648; AVX2-NEXT: vpextrd $1, %xmm1, %ecx 4649; AVX2-NEXT: subl %ecx, %eax 4650; AVX2-NEXT: leal (,%rax,4), %ecx 4651; AVX2-NEXT: vextractps $2, %xmm0, -24(%rsp,%rcx) 4652; AVX2-NEXT: vpextrd $2, %xmm1, %ecx 4653; AVX2-NEXT: subl %ecx, %eax 4654; AVX2-NEXT: andl $3, %eax 4655; AVX2-NEXT: vextractps $3, %xmm0, -24(%rsp,%rax,4) 4656; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 4657; AVX2-NEXT: retq 4658; 4659; AVX512F-LABEL: test_compress_narrow: 4660; AVX512F: # %bb.0: 4661; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 4662; AVX512F-NEXT: andl $1, %edi 4663; AVX512F-NEXT: kmovw %edi, %k0 4664; AVX512F-NEXT: kmovw %esi, %k1 4665; AVX512F-NEXT: kshiftlw $15, %k1, %k1 4666; AVX512F-NEXT: kshiftrw 
$14, %k1, %k1
; AVX512F-NEXT:    korw %k1, %k0, %k0
; AVX512F-NEXT:    movw $-5, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    kandw %k1, %k0, %k0
; AVX512F-NEXT:    kmovw %edx, %k1
; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
; AVX512F-NEXT:    korw %k1, %k0, %k0
; AVX512F-NEXT:    movb $7, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    kandw %k1, %k0, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_narrow:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    andl $1, %edi
; AVX512VL-NEXT:    kmovw %edi, %k0
; AVX512VL-NEXT:    kmovd %esi, %k1
; AVX512VL-NEXT:    kshiftlw $15, %k1, %k1
; AVX512VL-NEXT:    kshiftrw $14, %k1, %k1
; AVX512VL-NEXT:    korw %k1, %k0, %k0
; AVX512VL-NEXT:    movw $-5, %ax
; AVX512VL-NEXT:    kmovd %eax, %k1
; AVX512VL-NEXT:    kandw %k1, %k0, %k0
; AVX512VL-NEXT:    kmovd %edx, %k1
; AVX512VL-NEXT:    kshiftlw $15, %k1, %k1
; AVX512VL-NEXT:    kshiftrw $13, %k1, %k1
; AVX512VL-NEXT:    korw %k1, %k0, %k0
; AVX512VL-NEXT:    movb $7, %al
; AVX512VL-NEXT:    kmovd %eax, %k1
; AVX512VL-NEXT:    kandw %k1, %k0, %k1
; AVX512VL-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %out = call <3 x i32> @llvm.experimental.vector.compress(<3 x i32> %vec, <3 x i1> %mask, <3 x i32> undef)
  ret <3 x i32> %out
}

; Compress of a <3 x i3> vector under a <3 x i1> mask with an undef passthru,
; i.e. a non-power-of-two lane count combined with a 3-bit element type (the
; "illegal element type" of the test name). In the expected code the three
; elements are read from %edi/%esi/%edx, the mask bits from %ecx/%r8d/%r9d
; (the +avx2 sequence only reads %ecx and %r8d), and the three result lanes
; are left in %al/%dl/%cl before the return. Both AVX-512 configurations are
; expected to use a zero-masked vpcompressd; the +avx2 configuration goes
; through a stack slot instead.
define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i1> %mask) nounwind {
; AVX2-LABEL: test_compress_narrow_illegal_element_type:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %ecx, %xmm0
; AVX2-NEXT:    vpinsrd $1, %r8d, %xmm0, %xmm0
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    movl %esi, -24(%rsp,%rax,4)
; AVX2-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX2-NEXT:    subl %ecx, %eax
; AVX2-NEXT:    shll $2, %eax
; AVX2-NEXT:    movl %edx, -24(%rsp,%rax)
; AVX2-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    # kill: def $dl killed $dl killed $edx
; AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_narrow_illegal_element_type:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    andl $1, %ecx
; AVX512F-NEXT:    kmovw %ecx, %k0
; AVX512F-NEXT:    kmovw %r8d, %k1
; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
; AVX512F-NEXT:    korw %k1, %k0, %k0
; AVX512F-NEXT:    movw $-5, %ax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    kandw %k1, %k0, %k0
; AVX512F-NEXT:    kmovw %r9d, %k1
; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
; AVX512F-NEXT:    korw %k1, %k0, %k0
; AVX512F-NEXT:    movb $7, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    kandw %k1, %k0, %k0
; AVX512F-NEXT:    vmovd %edi, %xmm0
; AVX512F-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
; AVX512F-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    vpextrb $4, %xmm0, %edx
; AVX512F-NEXT:    vpextrb $8, %xmm0, %ecx
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    # kill: def $dl killed $dl killed $edx
; AVX512F-NEXT:    # kill: def $cl killed $cl killed $ecx
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_narrow_illegal_element_type:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    andl $1, %ecx
; AVX512VL-NEXT:    kmovw %ecx, %k0
; AVX512VL-NEXT:    kmovd %r8d, %k1
; AVX512VL-NEXT:    kshiftlw $15, %k1, %k1
; AVX512VL-NEXT:    kshiftrw $14, %k1, %k1
; AVX512VL-NEXT:    korw %k1, %k0, %k0
; AVX512VL-NEXT:    movw $-5, %ax
; AVX512VL-NEXT:    kmovd %eax, %k1
; AVX512VL-NEXT:    kandw %k1, %k0, %k0
; AVX512VL-NEXT:    kmovd %r9d, %k1
; AVX512VL-NEXT:    kshiftlw $15, %k1, %k1
; AVX512VL-NEXT:    kshiftrw $13, %k1, %k1
; AVX512VL-NEXT:    korw %k1, %k0, %k0
; AVX512VL-NEXT:    movb $7, %al
; AVX512VL-NEXT:    kmovd %eax, %k1
; AVX512VL-NEXT:    kandw %k1, %k0, %k1
; AVX512VL-NEXT:    vmovd %edi, %xmm0
; AVX512VL-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
; AVX512VL-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; AVX512VL-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    vmovd %xmm0, %eax
; AVX512VL-NEXT:    vpextrb $4, %xmm0, %edx
; AVX512VL-NEXT:    vpextrb $8, %xmm0, %ecx
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    # kill: def $dl killed $dl killed $edx
; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $ecx
; AVX512VL-NEXT:    retq
  %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
  ret <3 x i3> %out
}

; Compress of <4 x i32> under a <4 x i1> mask with a zeroinitializer passthru.
; Both AVX-512 configurations are expected to use the zero-masking ({z}) form
; of vpcompressd, writing the result in place over the input register. The
; +avx2 configuration is expected to zero a stack slot (vxorps + vmovaps),
; store the lanes into it at offsets derived from the mask bits, and reload
; the slot as the result.
define <4 x i32> @test_compress_v4i32_zero_passthru(<4 x i32> %vec, <4 x i1> %mask) nounwind {
; AVX2-LABEL: test_compress_v4i32_zero_passthru:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovd %xmm1, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    vextractps $1, %xmm0, -24(%rsp,%rax,4)
; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    vextractps $2, %xmm0, -24(%rsp,%rcx,4)
; AVX2-NEXT:    vpextrd $2, %xmm1, %eax
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    addq %rax, %rcx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT:    andl $3, %eax
; AVX2-NEXT:    vextractps $3, %xmm0, -24(%rsp,%rax,4)
; AVX2-NEXT:    xorl %eax, %eax
; AVX2-NEXT:    cmpq $3, %rcx
; AVX2-NEXT:    movl $3, %edx
; AVX2-NEXT:    cmovbq %rcx, %rdx
; AVX2-NEXT:    vextractps $3, %xmm0, %ecx
; AVX2-NEXT:    cmovbel %eax, %ecx
; AVX2-NEXT:    movl %ecx, -24(%rsp,%rdx,4)
; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_compress_v4i32_zero_passthru:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
; AVX512F-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_compress_v4i32_zero_passthru:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
  %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> zeroinitializer)
  ret <4 x i32> %out
}
