; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vp2intersect --show-mc-encoding | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect --show-mc-encoding | FileCheck %s --check-prefix=X64

define void @test_mm512_2intersect_epi32(<8 x i64> %a, <8 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectd %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0xc1]
; X86-NEXT:    kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01]
; X86-NEXT:    kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectd %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0xc1]
; X64-NEXT:    kmovw %k0, (%rdi) # encoding: [0xc5,0xf8,0x91,0x07]
; X64-NEXT:    kmovw %k1, (%rsi) # encoding: [0xc5,0xf8,0x91,0x0e]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <8 x i64> %a to <16 x i32>
  %1 = bitcast <8 x i64> %b to <16 x i32>
  %2 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = extractvalue { <16 x i1>, <16 x i1> } %2, 0
  store <16 x i1> %3, ptr %m0, align 16
  %4 = extractvalue { <16 x i1>, <16 x i1> } %2, 1
  store <16 x i1> %4, ptr %m1, align 16
  ret void
}

define void @test_mm512_2intersect_epi64(<8 x i64> %a, <8 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vp2intersectq %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0xc1]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectq %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0xc1]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X64-NEXT:    movb %cl, (%rdi) # encoding: [0x88,0x0f]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %a, <8 x i64> %b)
  %1 = extractvalue { <8 x i1>, <8 x i1> } %0, 0
  store <8 x i1> %1, ptr %m0, align 8
  %2 = extractvalue { <8 x i1>, <8 x i1> } %0, 1
  store <8 x i1> %2, ptr %m1, align 8
  ret void
}

define void @test_mm512_2intersect_epi32_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi32_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovdqa64 (%esi), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x06]
; X86-NEXT:    vp2intersectd (%edx), %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0x02]
; X86-NEXT:    kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01]
; X86-NEXT:    kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi32_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovdqa64 (%rdi), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07]
; X64-NEXT:    vp2intersectd (%rsi), %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0x06]
; X64-NEXT:    kmovw %k0, (%rdx) # encoding: [0xc5,0xf8,0x91,0x02]
; X64-NEXT:    kmovw %k1, (%rcx) # encoding: [0xc5,0xf8,0x91,0x09]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load <16 x i32>, ptr %a, align 64
  %1 = load <16 x i32>, ptr %b, align 64
  %2 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = extractvalue { <16 x i1>, <16 x i1> } %2, 0
  store <16 x i1> %3, ptr %m0, align 16
  %4 = extractvalue { <16 x i1>, <16 x i1> } %2, 1
  store <16 x i1> %4, ptr %m1, align 16
  ret void
}

define void @test_mm512_2intersect_epi64_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi64_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT:    vmovdqa64 (%edx), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x02]
; X86-NEXT:    vp2intersectq (%ecx), %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0x01]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi64_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovdqa64 (%rdi), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07]
; X64-NEXT:    vp2intersectq (%rsi), %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0x06]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load <8 x i64>, ptr %a, align 64
  %1 = load <8 x i64>, ptr %b, align 64
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %0, <8 x i64> %1)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  store <8 x i1> %3, ptr %m0, align 8
  %4 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  store <8 x i1> %4, ptr %m1, align 8
  ret void
}

define void @test_mm512_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi32_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vpbroadcastd (%esi), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x58,0x06]
; X86-NEXT:    vp2intersectd (%edx){1to16}, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x58,0x68,0x02]
; X86-NEXT:    kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01]
; X86-NEXT:    kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi32_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpbroadcastd (%rdi), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x58,0x07]
; X64-NEXT:    vp2intersectd (%rsi){1to16}, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x58,0x68,0x06]
; X64-NEXT:    kmovw %k0, (%rdx) # encoding: [0xc5,0xf8,0x91,0x02]
; X64-NEXT:    kmovw %k1, (%rcx) # encoding: [0xc5,0xf8,0x91,0x09]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i32, ptr %a, align 4
  %vecinit.i = insertelement <16 x i32> undef, i32 %0, i32 0
  %vecinit15.i = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %1 = load i32, ptr %b, align 4
  %vecinit.i2 = insertelement <16 x i32> undef, i32 %1, i32 0
  %vecinit15.i3 = shufflevector <16 x i32> %vecinit.i2, <16 x i32> undef, <16 x i32> zeroinitializer
  %2 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %vecinit15.i, <16 x i32> %vecinit15.i3)
  %3 = extractvalue { <16 x i1>, <16 x i1> } %2, 0
  store <16 x i1> %3, ptr %m0, align 16
  %4 = extractvalue { <16 x i1>, <16 x i1> } %2, 1
  store <16 x i1> %4, ptr %m1, align 16
  ret void
}

define void @test_mm512_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi64_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT:    vpbroadcastq (%edx), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x02]
; X86-NEXT:    vp2intersectq (%ecx){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x01]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi64_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpbroadcastq (%rdi), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x07]
; X64-NEXT:    vp2intersectq (%rsi){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x06]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i64, ptr %a, align 8
  %vecinit.i = insertelement <8 x i64> undef, i64 %0, i32 0
  %vecinit7.i = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %1 = load i64, ptr %b, align 8
  %vecinit.i2 = insertelement <8 x i64> undef, i64 %1, i32 0
  %vecinit7.i3 = shufflevector <8 x i64> %vecinit.i2, <8 x i64> undef, <8 x i32> zeroinitializer
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %vecinit7.i, <8 x i64> %vecinit7.i3)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  store <8 x i1> %3, ptr %m0, align 8
  %4 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  store <8 x i1> %4, ptr %m1, align 8
  ret void
}

declare { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32>, <16 x i32>)
declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64>, <8 x i64>)