; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X64

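; vp2intersectd on 256-bit register operands. The instruction writes a mask
; register pair (%k0 explicit, %k1 implicit); each <8 x i1> result fills a
; full mask byte, so both registers can be stored directly.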
define void @test_mm256_2intersect_epi32(<4 x i64> %a, <4 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X64-NEXT:    movb %cl, (%rdi) # encoding: [0x88,0x0f]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <4 x i64> %a to <8 x i32>
  %1 = bitcast <4 x i64> %b to <8 x i32>
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  store <8 x i1> %3, ptr %m0, align 8
  %4 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  store <8 x i1> %4, ptr %m1, align 8
  ret void
}

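; vp2intersectq on 256-bit register operands. The <4 x i1> results occupy
; only the low four mask bits, so each is zero-extended with a
; kshiftlw/kshiftrw pair by 12 before being stored as a byte.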
define void @test_mm256_2intersect_epi64(<4 x i64> %a, <4 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %a, <4 x i64> %b)
  %1 = extractvalue { <4 x i1>, <4 x i1> } %0, 0
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  store i8 %3, ptr %m0, align 1
  %4 = extractvalue { <4 x i1>, <4 x i1> } %0, 1
  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %6 = bitcast <8 x i1> %5 to i8
  store i8 %6, ptr %m1, align 1
  ret void
}

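; Memory-operand variant of the 256-bit epi32 test: the second source is
; folded as a load into vp2intersectd.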
define void @test_mm256_2intersect_epi32_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT:    vmovdqa (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x02]
; X86-NEXT:    vp2intersectd (%ecx), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x01]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovdqa (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07]
; X64-NEXT:    vp2intersectd (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x06]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load <8 x i32>, ptr %a, align 32
  %1 = load <8 x i32>, ptr %b, align 32
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  store <8 x i1> %3, ptr %m0, align 8
  %4 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  store <8 x i1> %4, ptr %m1, align 8
  ret void
}

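; Memory-operand variant of the 256-bit epi64 test; the folded load does not
; change the kshift sequence needed to isolate the <4 x i1> results.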
define void @test_mm256_2intersect_epi64_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovdqa (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x06]
; X86-NEXT:    vp2intersectq (%edx), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovdqa (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07]
; X64-NEXT:    vp2intersectq (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load <4 x i64>, ptr %a, align 32
  %1 = load <4 x i64>, ptr %b, align 32
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %0, <4 x i64> %1)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

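; Broadcast variant: a scalar i32 splat is folded as an embedded {1to8}
; broadcast operand of vp2intersectd.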
define void @test_mm256_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT:    vpbroadcastd (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x02]
; X86-NEXT:    vp2intersectd (%ecx){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x01]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpbroadcastd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x07]
; X64-NEXT:    vp2intersectd (%rsi){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x06]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i32, ptr %a, align 4
  %vecinit.i.i = insertelement <8 x i32> undef, i32 %0, i32 0
  %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %1 = load i32, ptr %b, align 4
  %vecinit.i.i2 = insertelement <8 x i32> undef, i32 %1, i32 0
  %vecinit7.i.i3 = shufflevector <8 x i32> %vecinit.i.i2, <8 x i32> undef, <8 x i32> zeroinitializer
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %vecinit7.i.i, <8 x i32> %vecinit7.i.i3)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  store <8 x i1> %3, ptr %m0, align 8
  %4 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  store <8 x i1> %4, ptr %m1, align 8
  ret void
}

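; Broadcast variant of the 256-bit epi64 test, using an embedded {1to4}
; broadcast of a scalar i64.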
define void @test_mm256_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vpbroadcastq (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x06]
; X86-NEXT:    vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpbroadcastq (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x07]
; X64-NEXT:    vp2intersectq (%rsi){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i64, ptr %a, align 8
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %0, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %1 = load i64, ptr %b, align 8
  %vecinit.i.i2 = insertelement <4 x i64> undef, i64 %1, i32 0
  %vecinit3.i.i3 = shufflevector <4 x i64> %vecinit.i.i2, <4 x i64> undef, <4 x i32> zeroinitializer
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %vecinit3.i.i, <4 x i64> %vecinit3.i.i3)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

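; vp2intersectd on 128-bit register operands; the <4 x i1> results are
; zero-extended via kshift by 12, and no vzeroupper is needed for xmm-only
; code.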
define void @test_mm_2intersect_epi32(<2 x i64> %a, <2 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x i64> %a to <4 x i32>
  %1 = bitcast <2 x i64> %b to <4 x i32>
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

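; vp2intersectq on 128-bit register operands. With <2 x i1> results the
; kshift count becomes 14, clearing all but the low two mask bits.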
define void @test_mm_2intersect_epi64(<2 x i64> %a, <2 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %a, <2 x i64> %b)
  %1 = extractvalue { <2 x i1>, <2 x i1> } %0, 0
  %2 = shufflevector <2 x i1> %1, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %3 = bitcast <8 x i1> %2 to i8
  store i8 %3, ptr %m0, align 1
  %4 = extractvalue { <2 x i1>, <2 x i1> } %0, 1
  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %6 = bitcast <8 x i1> %5 to i8
  store i8 %6, ptr %m1, align 1
  ret void
}

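; Memory-operand variant of the 128-bit epi32 test.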
define void @test_mm_2intersect_epi32_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovdqa (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x06]
; X86-NEXT:    vp2intersectd (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07]
; X64-NEXT:    vp2intersectd (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load <4 x i32>, ptr %a, align 16
  %1 = load <4 x i32>, ptr %b, align 16
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

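; Memory-operand variant of the 128-bit epi64 test.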
define void @test_mm_2intersect_epi64_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovdqa (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x06]
; X86-NEXT:    vp2intersectq (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x02]
; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07]
; X64-NEXT:    vp2intersectq (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x06]
; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load <2 x i64>, ptr %a, align 16
  %1 = load <2 x i64>, ptr %b, align 16
  %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %0, <2 x i64> %1)
  %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
  %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
  %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

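; Embedded {1to4} broadcast variant of the 128-bit epi32 test.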
define void @test_mm_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vpbroadcastd (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x06]
; X86-NEXT:    vp2intersectd (%edx){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpbroadcastd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x07]
; X64-NEXT:    vp2intersectd (%rsi){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i32, ptr %a, align 4
  %vecinit.i.i = insertelement <4 x i32> undef, i32 %0, i32 0
  %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = load i32, ptr %b, align 4
  %vecinit.i.i2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %vecinit3.i.i3 = shufflevector <4 x i32> %vecinit.i.i2, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %vecinit3.i.i, <4 x i32> %vecinit3.i.i3)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

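; Embedded {1to2} broadcast variant of the 128-bit epi64 test.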
define void @test_mm_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vpbroadcastq (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x06]
; X86-NEXT:    vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02]
; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpbroadcastq (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x07]
; X64-NEXT:    vp2intersectq (%rsi){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x06]
; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i64, ptr %a, align 8
  %vecinit.i.i = insertelement <2 x i64> undef, i64 %0, i32 0
  %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %1 = load i64, ptr %b, align 8
  %vecinit.i.i2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %vecinit1.i.i3 = shufflevector <2 x i64> %vecinit.i.i2, <2 x i64> undef, <2 x i32> zeroinitializer
  %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %vecinit1.i.i, <2 x i64> %vecinit1.i.i3)
  %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
  %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
  %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>)
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>)
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>)
declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>)