; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vp2intersect -verify-machineinstrs | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect -verify-machineinstrs | FileCheck %s --check-prefix=X64

; Test with more than four live mask pairs
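; Each vp2intersectd writes its two result masks into an adjacent even/odd pair
; of mask registers (k0/k1, k2/k3, ...), and k0-k7 provide only four such pairs.
; With five result pairs kept live across the call to @dummy, the pairs have to
; be spilled to the stack and reloaded after the call, which is what the
; generated CHECK lines below exercise.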

define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1, <16 x i32> %a2, <16 x i32> %b2, <16 x i32> %a3, <16 x i32> %b3, <16 x i32> %a4, <16 x i32> %b4, ptr nocapture %m0, ptr nocapture %m1) nounwind {
; X86-LABEL: test:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    movl 456(%ebp), %esi
; X86-NEXT:    vmovdqa64 328(%ebp), %zmm3
; X86-NEXT:    vmovdqa64 200(%ebp), %zmm4
; X86-NEXT:    vmovdqa64 72(%ebp), %zmm5
; X86-NEXT:    vp2intersectd %zmm1, %zmm0, %k0
; X86-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; X86-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; X86-NEXT:    vp2intersectd 8(%ebp), %zmm2, %k0
; X86-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; X86-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; X86-NEXT:    vp2intersectd 136(%ebp), %zmm5, %k0
; X86-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; X86-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; X86-NEXT:    vp2intersectd 264(%ebp), %zmm4, %k0
; X86-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; X86-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; X86-NEXT:    vp2intersectd 392(%ebp), %zmm3, %k0
; X86-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; X86-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll dummy@PLT
; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
; X86-NEXT:    kmovw %k0, %edx
; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
; X86-NEXT:    kmovw %k0, %ecx
; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
; X86-NEXT:    kmovw %k0, %edi
; X86-NEXT:    addl %edi, %eax
; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 # 2-byte Reload
; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 # 2-byte Reload
; X86-NEXT:    kmovw %k2, %edi
; X86-NEXT:    addl %edx, %ecx
; X86-NEXT:    kmovw %k1, %edx
; X86-NEXT:    addl %edi, %edx
; X86-NEXT:    addl %edx, %eax
; X86-NEXT:    addl %ecx, %eax
; X86-NEXT:    movw %ax, (%esi)
; X86-NEXT:    leal -8(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: test:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pushq %rbp
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    pushq %rbx
; X64-NEXT:    andq $-64, %rsp
; X64-NEXT:    subq $64, %rsp
; X64-NEXT:    movq %rdi, %rbx
; X64-NEXT:    vmovdqa64 16(%rbp), %zmm8
; X64-NEXT:    vp2intersectd %zmm1, %zmm0, %k0
; X64-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; X64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; X64-NEXT:    vp2intersectd %zmm3, %zmm2, %k0
; X64-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; X64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; X64-NEXT:    vp2intersectd %zmm5, %zmm4, %k0
; X64-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; X64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; X64-NEXT:    vp2intersectd %zmm7, %zmm6, %k0
; X64-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; X64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; X64-NEXT:    vp2intersectd 80(%rbp), %zmm8, %k0
; X64-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; X64-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; X64-NEXT:    vzeroupper
; X64-NEXT:    callq dummy@PLT
; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; X64-NEXT:    kmovw %k0, %ecx
; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; X64-NEXT:    kmovw %k0, %edx
; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; X64-NEXT:    kmovw %k0, %esi
; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; X64-NEXT:    kmovw %k0, %edi
; X64-NEXT:    kmovw %k1, %r8d
; X64-NEXT:    addl %edi, %eax
; X64-NEXT:    addl %ecx, %edx
; X64-NEXT:    addl %eax, %edx
; X64-NEXT:    addl %r8d, %edx
; X64-NEXT:    addl %esi, %edx
; X64-NEXT:    movw %dx, (%rbx)
; X64-NEXT:    leaq -8(%rbp), %rsp
; X64-NEXT:    popq %rbx
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
entry:
  %0 = call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %a0, <16 x i32> %b0)
  %1 = call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %a1, <16 x i32> %b1)
  %2 = call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %a2, <16 x i32> %b2)
  %3 = call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %a3, <16 x i32> %b3)
  %4 = call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %a4, <16 x i32> %b4)

  %5 = extractvalue { <16 x i1>, <16 x i1> } %0, 0
  %6 = extractvalue { <16 x i1>, <16 x i1> } %1, 0
  %7 = extractvalue { <16 x i1>, <16 x i1> } %2, 0
  %8 = extractvalue { <16 x i1>, <16 x i1> } %3, 0
  %9 = extractvalue { <16 x i1>, <16 x i1> } %4, 0
  %10 = extractvalue { <16 x i1>, <16 x i1> } %0, 1
  %11 = extractvalue { <16 x i1>, <16 x i1> } %1, 1

  call void @dummy()

  %12 = bitcast <16 x i1> %5 to i16
  %13 = bitcast <16 x i1> %6 to i16
  %14 = bitcast <16 x i1> %7 to i16
  %15 = bitcast <16 x i1> %8 to i16
  %16 = bitcast <16 x i1> %9 to i16
  %17 = bitcast <16 x i1> %10 to i16
  %18 = bitcast <16 x i1> %11 to i16

  %19 = add i16 %12, %13
  %20 = add i16 %14, %15
  %21 = add i16 %16, %17
  %22 = add i16 %19, %21
  %23 = add i16 %22, %20

  store i16 %23, ptr %m0, align 16
  ret void
}

declare { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32>, <16 x i32>)
declare void @dummy()