; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c

define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X86-LABEL: test_mm512_kunpackd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
; X86-NEXT:    vpcmpneqb 8(%ebp), %zmm2, %k1
; X86-NEXT:    vpcmpneqb 72(%ebp), %zmm3, %k2
; X86-NEXT:    kandd %k0, %k2, %k0
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kshiftrq $32, %k2, %k0
; X86-NEXT:    kandd %k1, %k0, %k0
; X86-NEXT:    kmovd %k0, %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
; X64-NEXT:    vpcmpneqb %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckdq %k0, %k1, %k1
; X64-NEXT:    vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <64 x i8>
  %1 = bitcast <8 x i64> %__F to <64 x i8>
  %2 = bitcast <8 x i64> %__B to <64 x i8>
  %3 = bitcast <8 x i64> %__A to <64 x i8>
  %4 = icmp ne <64 x i8> %2, %3
  %5 = bitcast <8 x i64> %__C to <64 x i8>
  %6 = bitcast <8 x i64> %__D to <64 x i8>
  %7 = icmp ne <64 x i8> %5, %6
  %8 = shufflevector <64 x i1> %4, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %9 = shufflevector <64 x i1> %7, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %10 = shufflevector <32 x i1> %8, <32 x i1> %9, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %11 = icmp ne <64 x i8> %0, %1
  %12 = and <64 x i1> %11, %10
  %13 = bitcast <64 x i1> %12 to i64
  ret i64 %13
}

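; As with test_mm512_kunpackd above, the shuffles below concatenate the low
; halves of two compare masks (16+16 bits here); the expectation is that this
; folds to a single kunpckwd feeding the final masked compare.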
define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X86-LABEL: test_mm512_kunpackw:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
; X86-NEXT:    vpcmpneqw 8(%ebp), %zmm2, %k1
; X86-NEXT:    kunpckwd %k0, %k1, %k1
; X86-NEXT:    vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackw:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
; X64-NEXT:    vpcmpneqw %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckwd %k0, %k1, %k1
; X64-NEXT:    vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <32 x i16>
  %1 = bitcast <8 x i64> %__F to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = bitcast <8 x i64> %__A to <32 x i16>
  %4 = icmp ne <32 x i16> %2, %3
  %5 = bitcast <8 x i64> %__C to <32 x i16>
  %6 = bitcast <8 x i64> %__D to <32 x i16>
  %7 = icmp ne <32 x i16> %5, %6
  %8 = shufflevector <32 x i1> %4, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = shufflevector <32 x i1> %7, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %10 = shufflevector <16 x i1> %8, <16 x i1> %9, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %11 = icmp ne <32 x i16> %0, %1
  %12 = and <32 x i1> %11, %10
  %13 = bitcast <32 x i1> %12 to i32
  ret i32 %13
}

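; Note: the 32-bit target has no kmovq from a GPR, so the i64 mask below is
; assembled from two 32-bit halves (two kmovd loads joined by kunpckdq)
; rather than a single 64-bit mask move; see also the mmask64 TODO further down.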
define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
; X86-LABEL: test_mm512_mask_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kunpckdq %k1, %k0, %k1
; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <64 x i8>
  %1 = bitcast i64 %__M to <64 x i1>
  %2 = select <64 x i1> %1, <64 x i8> %vecinit63.i.i, <64 x i8> %0
  %3 = bitcast <64 x i8> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kunpckdq %k1, %k0, %k1
; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast i64 %__M to <64 x i1>
  %1 = select <64 x i1> %0, <64 x i8> %vecinit63.i.i, <64 x i8> zeroinitializer
  %2 = bitcast <64 x i8> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi16(<8 x i64> %__O, i32 %__M, i16 signext %__A) {
; X86-LABEL: test_mm512_mask_set1_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <32 x i16>
  %1 = bitcast i32 %__M to <32 x i1>
  %2 = select <32 x i1> %1, <32 x i16> %vecinit31.i.i, <32 x i16> %0
  %3 = bitcast <32 x i16> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi16(i32 %__M, i16 signext %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
  %0 = bitcast i32 %__M to <32 x i1>
  %1 = select <32 x i1> %0, <32 x i16> %vecinit31.i.i, <32 x i16> zeroinitializer
  %2 = bitcast <32 x i16> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastb_epi8(<8 x i64> %a0, ptr %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastb_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastb_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = load <64 x i1>, ptr %a1
  %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = select <64 x i1> %arg1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastb_epi8(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastb_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastb_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = load <64 x i1>, ptr %a0
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = select <64 x i1> %arg0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastw_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastw_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastw_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastw_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_bslli_epi128(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_bsrli_epi128(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_unpackhi_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

; TODO - improve support for i64 -> mmask64 on 32-bit targets
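; The <64 x i1> mask loads below already fold into a single kmovq from memory;
; the TODO presumably refers to i64 register/stack masks, which still go
; through the two-kmovd-plus-kunpckdq sequence shown in test_mm512_mask_set1_epi8.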
define <8 x i64> @test_mm512_mask_unpackhi_epi8(<8 x i64> %a0, ptr %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %sel1 = load <64 x i1>, ptr %a1
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi8(ptr %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X64-NEXT:    retq
  %sel0 = load <64 x i1>, ptr %a0
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpackhi_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpacklo_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi8(<8 x i64> %a0, ptr %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %sel1 = load <64 x i1>, ptr %a1
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi8(ptr %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X64-NEXT:    retq
  %sel0 = load <64 x i1>, ptr %a0
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpacklo_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

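; The test/testn intrinsics AND their operands together and compare the result
; against zero; the and+icmp pair is expected to fold into a single
; mask-producing vptestmb/vptestnmb (or the word variants below).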
define i64 @test_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_test_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_test_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp ne <64 x i8> %0, zeroinitializer
  %2 = bitcast <64 x i1> %1 to i64
  ret i64 %2
}

define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vptestmb %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp ne <64 x i8> %0, zeroinitializer
  %2 = bitcast i64 %__U to <64 x i1>
  %3 = and <64 x i1> %1, %2
  %4 = bitcast <64 x i1> %3 to i64
  ret i64 %4
}

define i32 @test_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_test_epi16_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmw %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp ne <32 x i16> %0, zeroinitializer
  %2 = bitcast <32 x i1> %1 to i32
  ret i32 %2
}

define i32 @test_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi16_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi16_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp ne <32 x i16> %0, zeroinitializer
  %2 = bitcast i32 %__U to <32 x i1>
  %3 = and <32 x i1> %1, %2
  %4 = bitcast <32 x i1> %3 to i32
  ret i32 %4
}

define i64 @test_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_testn_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_testn_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp eq <64 x i8> %0, zeroinitializer
  %2 = bitcast <64 x i1> %1 to i64
  ret i64 %2
}

define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vptestnmb %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp eq <64 x i8> %0, zeroinitializer
  %2 = bitcast i64 %__U to <64 x i1>
  %3 = and <64 x i1> %1, %2
  %4 = bitcast <64 x i1> %3 to i64
  ret i64 %4
}

define i32 @test_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi16_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmw %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp eq <32 x i16> %0, zeroinitializer
  %2 = bitcast <32 x i1> %1 to i32
  ret i32 %2
}

define i32 @test_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi16_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi16_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp eq <32 x i16> %0, zeroinitializer
  %2 = bitcast i32 %__U to <32 x i1>
  %3 = and <32 x i1> %1, %2
  %4 = bitcast <32 x i1> %3 to i32
  ret i32 %4
}

define <4 x i64> @test_mm512_cvtepi16_epi8(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi16_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovwb %zmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast <32 x i8> %conv.i to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm512_mask_cvtepi16_epi8(<4 x i64> %__O, i32 %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi16_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi16_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast <4 x i64> %__O to <32 x i8>
  %2 = bitcast i32 %__M to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i8> %conv.i.i, <32 x i8> %1
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm512_maskz_cvtepi16_epi8(i32 %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi16_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi16_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast i32 %__M to <32 x i1>
  %2 = select <32 x i1> %1, <32 x i8> %conv.i.i, <32 x i8> zeroinitializer
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

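; mask2_permutex2var blends against the index operand (%__I), so it lowers to
; vpermi2w with the result moved out of %zmm1; the mask/maskz variants below
; blend against the first data operand and use vpermt2w in place instead.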
define <8 x i64> @test_mm512_mask2_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, i32 %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %1
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast <32 x i16> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_mask_permutex2var_epi16(<8 x i64> %__A, i32 %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %0
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_permutex2var_epi16(i32 %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}

declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)

!0 = !{i32 1}