; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X64

; 256-bit dword intersect with both vectors passed by value. The <4 x i64>
; arguments are bitcast to <8 x i32>, fed to vp2intersect.d.256, and each
; <8 x i1> member of the returned pair is stored as-is: member 0 to %m0,
; member 1 to %m1.
define void @test_mm256_2intersect_epi32(<4 x i64> %a, <4 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X64-NEXT:    movb %cl, (%rdi) # encoding: [0x88,0x0f]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %a.v8i32 = bitcast <4 x i64> %a to <8 x i32>
  %b.v8i32 = bitcast <4 x i64> %b to <8 x i32>
  %pair = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %a.v8i32, <8 x i32> %b.v8i32)
  %mask0 = extractvalue { <8 x i1>, <8 x i1> } %pair, 0
  store <8 x i1> %mask0, ptr %m0, align 8
  %mask1 = extractvalue { <8 x i1>, <8 x i1> } %pair, 1
  store <8 x i1> %mask1, ptr %m1, align 8
  ret void
}

; 256-bit qword intersect with both vectors passed by value. Each <4 x i1>
; result is widened to <8 x i1> (shuffle indices 4-7 pick lanes of the
; zeroinitializer operand), bitcast to i8 and stored as a single byte.
define void @test_mm256_2intersect_epi64(<4 x i64> %a, <4 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %pair = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %a, <4 x i64> %b)
  %mask0 = extractvalue { <4 x i1>, <4 x i1> } %pair, 0
  %mask0.wide = shufflevector <4 x i1> %mask0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask0.byte = bitcast <8 x i1> %mask0.wide to i8
  store i8 %mask0.byte, ptr %m0, align 1
  %mask1 = extractvalue { <4 x i1>, <4 x i1> } %pair, 1
  %mask1.wide = shufflevector <4 x i1> %mask1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask1.byte = bitcast <8 x i1> %mask1.wide to i8
  store i8 %mask1.byte, ptr %m1, align 1
  ret void
}

; 256-bit dword intersect with both vectors loaded from memory (align 32).
; The checks expect the load of %b to appear as the memory operand of
; vp2intersectd while %a is loaded with a separate vmovdqa.
define void @test_mm256_2intersect_epi32_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT:    vmovdqa (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x02]
; X86-NEXT:    vp2intersectd (%ecx), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x01]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovdqa (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07]
; X64-NEXT:    vp2intersectd (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x06]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %va = load <8 x i32>, ptr %a, align 32
  %vb = load <8 x i32>, ptr %b, align 32
  %pair = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %va, <8 x i32> %vb)
  %mask0 = extractvalue { <8 x i1>, <8 x i1> } %pair, 0
  store <8 x i1> %mask0, ptr %m0, align 8
  %mask1 = extractvalue { <8 x i1>, <8 x i1> } %pair, 1
  store <8 x i1> %mask1, ptr %m1, align 8
  ret void
}

; 256-bit qword intersect with both vectors loaded from memory (align 32).
; Each <4 x i1> result is zero-widened to 8 lanes and stored as one i8.
define void @test_mm256_2intersect_epi64_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovdqa (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x06]
; X86-NEXT:    vp2intersectq (%edx), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovdqa (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07]
; X64-NEXT:    vp2intersectq (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %va = load <4 x i64>, ptr %a, align 32
  %vb = load <4 x i64>, ptr %b, align 32
  %pair = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %va, <4 x i64> %vb)
  %mask0 = extractvalue { <4 x i1>, <4 x i1> } %pair, 0
  %mask0.wide = shufflevector <4 x i1> %mask0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask0.byte = bitcast <8 x i1> %mask0.wide to i8
  store i8 %mask0.byte, ptr %m0, align 1
  %mask1 = extractvalue { <4 x i1>, <4 x i1> } %pair, 1
  %mask1.wide = shufflevector <4 x i1> %mask1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask1.byte = bitcast <8 x i1> %mask1.wide to i8
  store i8 %mask1.byte, ptr %m1, align 1
  ret void
}

; 256-bit dword intersect where each operand is a scalar i32 load splatted to
; all 8 lanes (insertelement into lane 0, then an all-zero shuffle mask). The
; checks expect the %b splat to become a {1to8} embedded-broadcast memory
; operand and the %a splat a vpbroadcastd.
define void @test_mm256_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT:    vpbroadcastd (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x02]
; X86-NEXT:    vp2intersectd (%ecx){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x01]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpbroadcastd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x07]
; X64-NEXT:    vp2intersectd (%rsi){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x06]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %a.elt = load i32, ptr %a, align 4
  %a.ins = insertelement <8 x i32> undef, i32 %a.elt, i32 0
  %a.splat = shufflevector <8 x i32> %a.ins, <8 x i32> undef, <8 x i32> zeroinitializer
  %b.elt = load i32, ptr %b, align 4
  %b.ins = insertelement <8 x i32> undef, i32 %b.elt, i32 0
  %b.splat = shufflevector <8 x i32> %b.ins, <8 x i32> undef, <8 x i32> zeroinitializer
  %pair = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %a.splat, <8 x i32> %b.splat)
  %mask0 = extractvalue { <8 x i1>, <8 x i1> } %pair, 0
  store <8 x i1> %mask0, ptr %m0, align 8
  %mask1 = extractvalue { <8 x i1>, <8 x i1> } %pair, 1
  store <8 x i1> %mask1, ptr %m1, align 8
  ret void
}

; 256-bit qword intersect where each operand is a scalar i64 load splatted to
; all 4 lanes. The checks expect a {1to4} embedded broadcast for %b; each
; <4 x i1> result is zero-widened to 8 lanes and stored as one i8.
define void @test_mm256_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vpbroadcastq (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x06]
; X86-NEXT:    vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpbroadcastq (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x07]
; X64-NEXT:    vp2intersectq (%rsi){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %a.elt = load i64, ptr %a, align 8
  %a.ins = insertelement <4 x i64> undef, i64 %a.elt, i32 0
  %a.splat = shufflevector <4 x i64> %a.ins, <4 x i64> undef, <4 x i32> zeroinitializer
  %b.elt = load i64, ptr %b, align 8
  %b.ins = insertelement <4 x i64> undef, i64 %b.elt, i32 0
  %b.splat = shufflevector <4 x i64> %b.ins, <4 x i64> undef, <4 x i32> zeroinitializer
  %pair = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %a.splat, <4 x i64> %b.splat)
  %mask0 = extractvalue { <4 x i1>, <4 x i1> } %pair, 0
  %mask0.wide = shufflevector <4 x i1> %mask0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask0.byte = bitcast <8 x i1> %mask0.wide to i8
  store i8 %mask0.byte, ptr %m0, align 1
  %mask1 = extractvalue { <4 x i1>, <4 x i1> } %pair, 1
  %mask1.wide = shufflevector <4 x i1> %mask1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask1.byte = bitcast <8 x i1> %mask1.wide to i8
  store i8 %mask1.byte, ptr %m1, align 1
  ret void
}

; 128-bit dword intersect with both vectors passed by value. The <2 x i64>
; arguments are bitcast to <4 x i32>; each <4 x i1> result is zero-widened to
; 8 lanes and stored as one i8.
define void @test_mm_2intersect_epi32(<2 x i64> %a, <2 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %a.v4i32 = bitcast <2 x i64> %a to <4 x i32>
  %b.v4i32 = bitcast <2 x i64> %b to <4 x i32>
  %pair = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %a.v4i32, <4 x i32> %b.v4i32)
  %mask0 = extractvalue { <4 x i1>, <4 x i1> } %pair, 0
  %mask0.wide = shufflevector <4 x i1> %mask0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask0.byte = bitcast <8 x i1> %mask0.wide to i8
  store i8 %mask0.byte, ptr %m0, align 1
  %mask1 = extractvalue { <4 x i1>, <4 x i1> } %pair, 1
  %mask1.wide = shufflevector <4 x i1> %mask1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask1.byte = bitcast <8 x i1> %mask1.wide to i8
  store i8 %mask1.byte, ptr %m1, align 1
  ret void
}

; 128-bit qword intersect with both vectors passed by value. Each <2 x i1>
; result is widened to <8 x i1>: shuffle indices 0-1 take the result lanes and
; every later index (2 or 3) takes a lane of the zeroinitializer operand. The
; widened mask is bitcast to i8 and stored.
define void @test_mm_2intersect_epi64(<2 x i64> %a, <2 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %pair = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %a, <2 x i64> %b)
  %mask0 = extractvalue { <2 x i1>, <2 x i1> } %pair, 0
  %mask0.wide = shufflevector <2 x i1> %mask0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %mask0.byte = bitcast <8 x i1> %mask0.wide to i8
  store i8 %mask0.byte, ptr %m0, align 1
  %mask1 = extractvalue { <2 x i1>, <2 x i1> } %pair, 1
  %mask1.wide = shufflevector <2 x i1> %mask1, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %mask1.byte = bitcast <8 x i1> %mask1.wide to i8
  store i8 %mask1.byte, ptr %m1, align 1
  ret void
}

; 128-bit dword intersect with both vectors loaded from memory (align 16).
; Each <4 x i1> result is zero-widened to 8 lanes and stored as one i8.
define void @test_mm_2intersect_epi32_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovdqa (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x06]
; X86-NEXT:    vp2intersectd (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07]
; X64-NEXT:    vp2intersectd (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %va = load <4 x i32>, ptr %a, align 16
  %vb = load <4 x i32>, ptr %b, align 16
  %pair = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %va, <4 x i32> %vb)
  %mask0 = extractvalue { <4 x i1>, <4 x i1> } %pair, 0
  %mask0.wide = shufflevector <4 x i1> %mask0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask0.byte = bitcast <8 x i1> %mask0.wide to i8
  store i8 %mask0.byte, ptr %m0, align 1
  %mask1 = extractvalue { <4 x i1>, <4 x i1> } %pair, 1
  %mask1.wide = shufflevector <4 x i1> %mask1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask1.byte = bitcast <8 x i1> %mask1.wide to i8
  store i8 %mask1.byte, ptr %m1, align 1
  ret void
}

; 128-bit qword intersect with both vectors loaded from memory (align 16).
; Each <2 x i1> result is zero-widened to 8 lanes and stored as one i8.
define void @test_mm_2intersect_epi64_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovdqa (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x06]
; X86-NEXT:    vp2intersectq (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x02]
; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07]
; X64-NEXT:    vp2intersectq (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x06]
; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %va = load <2 x i64>, ptr %a, align 16
  %vb = load <2 x i64>, ptr %b, align 16
  %pair = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %va, <2 x i64> %vb)
  %mask0 = extractvalue { <2 x i1>, <2 x i1> } %pair, 0
  %mask0.wide = shufflevector <2 x i1> %mask0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %mask0.byte = bitcast <8 x i1> %mask0.wide to i8
  store i8 %mask0.byte, ptr %m0, align 1
  %mask1 = extractvalue { <2 x i1>, <2 x i1> } %pair, 1
  %mask1.wide = shufflevector <2 x i1> %mask1, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %mask1.byte = bitcast <8 x i1> %mask1.wide to i8
  store i8 %mask1.byte, ptr %m1, align 1
  ret void
}

; 128-bit dword intersect where each operand is a scalar i32 load splatted to
; all 4 lanes. The checks expect a {1to4} embedded broadcast for %b; each
; <4 x i1> result is zero-widened to 8 lanes and stored as one i8.
define void @test_mm_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vpbroadcastd (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x06]
; X86-NEXT:    vp2intersectd (%edx){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpbroadcastd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x07]
; X64-NEXT:    vp2intersectd (%rsi){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %a.elt = load i32, ptr %a, align 4
  %a.ins = insertelement <4 x i32> undef, i32 %a.elt, i32 0
  %a.splat = shufflevector <4 x i32> %a.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %b.elt = load i32, ptr %b, align 4
  %b.ins = insertelement <4 x i32> undef, i32 %b.elt, i32 0
  %b.splat = shufflevector <4 x i32> %b.ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %pair = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %a.splat, <4 x i32> %b.splat)
  %mask0 = extractvalue { <4 x i1>, <4 x i1> } %pair, 0
  %mask0.wide = shufflevector <4 x i1> %mask0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask0.byte = bitcast <8 x i1> %mask0.wide to i8
  store i8 %mask0.byte, ptr %m0, align 1
  %mask1 = extractvalue { <4 x i1>, <4 x i1> } %pair, 1
  %mask1.wide = shufflevector <4 x i1> %mask1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mask1.byte = bitcast <8 x i1> %mask1.wide to i8
  store i8 %mask1.byte, ptr %m1, align 1
  ret void
}

; 128-bit qword intersect where each operand is a scalar i64 load splatted to
; both lanes. The checks expect a {1to2} embedded broadcast for %b; each
; <2 x i1> result is zero-widened to 8 lanes and stored as one i8.
define void @test_mm_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vpbroadcastq (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x06]
; X86-NEXT:    vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02]
; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpbroadcastq (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x07]
; X64-NEXT:    vp2intersectq (%rsi){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x06]
; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %a.elt = load i64, ptr %a, align 8
  %a.ins = insertelement <2 x i64> undef, i64 %a.elt, i32 0
  %a.splat = shufflevector <2 x i64> %a.ins, <2 x i64> undef, <2 x i32> zeroinitializer
  %b.elt = load i64, ptr %b, align 8
  %b.ins = insertelement <2 x i64> undef, i64 %b.elt, i32 0
  %b.splat = shufflevector <2 x i64> %b.ins, <2 x i64> undef, <2 x i32> zeroinitializer
  %pair = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %a.splat, <2 x i64> %b.splat)
  %mask0 = extractvalue { <2 x i1>, <2 x i1> } %pair, 0
  %mask0.wide = shufflevector <2 x i1> %mask0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %mask0.byte = bitcast <8 x i1> %mask0.wide to i8
  store i8 %mask0.byte, ptr %m0, align 1
  %mask1 = extractvalue { <2 x i1>, <2 x i1> } %pair, 1
  %mask1.wide = shufflevector <2 x i1> %mask1, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %mask1.byte = bitcast <8 x i1> %mask1.wide to i8
  store i8 %mask1.byte, ptr %m1, align 1
  ret void
}

; Intrinsics under test: each returns a pair of i1 mask vectors, one lane per
; input element.
declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>)
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>)
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>)
declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>)