xref: /llvm-project/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll (revision 0eb17a9d8672c3503c76a808b0773235b042f5a9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
3
; Two zero-initialized 1024-byte global arrays, 16-byte aligned. @test_api
; loads tiles from @buf (if.then arm) or @buf2 (if.else arm) and stores its
; final tile to @buf.
4@buf = dso_local global [1024 x i8] zeroinitializer, align 16
5@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
6
; @test_api(%cond, %row, %col)
;
; Branches on whether %cond is zero. Each arm issues three tileloadd64 calls
; against one global buffer (@buf in if.then, @buf2 in if.else) with a stride
; operand of 32; the leading i16 operand pairs are (%row, 8), (8, %col) and
; (%row, %col). if.end merges the three tiles through x86_amx phis, passes
; them to tdpbssd and stores the result to @buf.
;
; The assertion lines directly below the define are autogenerated (see the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
7define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) nounwind {
8; AVX512-LABEL: test_api:
9; AVX512:       # %bb.0: # %entry
10; AVX512-NEXT:    pushq %rbp
11; AVX512-NEXT:    movq %rsp, %rbp
12; AVX512-NEXT:    andq $-1024, %rsp # imm = 0xFC00
13; AVX512-NEXT:    subq $8192, %rsp # imm = 0x2000
14; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
15; AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
16; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
17; AVX512-NEXT:    movw %dx, %ax
18; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
19; AVX512-NEXT:    movw %si, %ax
20; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
21; AVX512-NEXT:    cmpl $0, %edi
22; AVX512-NEXT:    je .LBB0_2
23; AVX512-NEXT:  # %bb.1: # %if.then
24; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
25; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
26; AVX512-NEXT:    movl $buf, %esi
27; AVX512-NEXT:    movl $32, %edi
28; AVX512-NEXT:    movw $8, %dx
29; AVX512-NEXT:    # implicit-def: $al
30; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
31; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
32; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
33; AVX512-NEXT:    tileloadd (%rsi,%rdi), %tmm0
34; AVX512-NEXT:    movl $64, %edi
35; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
36; AVX512-NEXT:    movw $8, %dx
37; AVX512-NEXT:    tilestored %tmm0, (%rsi,%rdi)
38; AVX512-NEXT:    movl $buf, %esi
39; AVX512-NEXT:    movl $32, %edi
40; AVX512-NEXT:    movw $8, %dx
41; AVX512-NEXT:    # implicit-def: $al
42; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
43; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
44; AVX512-NEXT:    # implicit-def: $dl
45; AVX512-NEXT:    movb %dl, {{[0-9]+}}(%rsp)
46; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
47; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
48; AVX512-NEXT:    tileloadd (%rsi,%rdi), %tmm0
49; AVX512-NEXT:    movl $64, %edi
50; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
51; AVX512-NEXT:    movw $8, %dx
52; AVX512-NEXT:    tilestored %tmm0, (%rsi,%rdi)
53; AVX512-NEXT:    movl $buf, %edx
54; AVX512-NEXT:    movl $32, %esi
55; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
56; AVX512-NEXT:    movl $64, %esi
57; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
58; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
59; AVX512-NEXT:    jmp .LBB0_3
60; AVX512-NEXT:  .LBB0_2: # %if.else
61; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
62; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
63; AVX512-NEXT:    movl $buf2, %esi
64; AVX512-NEXT:    movl $32, %edi
65; AVX512-NEXT:    movw $8, %dx
66; AVX512-NEXT:    # implicit-def: $al
67; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
68; AVX512-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
69; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
70; AVX512-NEXT:    tileloadd (%rsi,%rdi), %tmm0
71; AVX512-NEXT:    movl $64, %edi
72; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
73; AVX512-NEXT:    movw $8, %dx
74; AVX512-NEXT:    tilestored %tmm0, (%rsi,%rdi)
75; AVX512-NEXT:    movl $buf2, %esi
76; AVX512-NEXT:    movl $32, %edi
77; AVX512-NEXT:    movw $8, %dx
78; AVX512-NEXT:    # implicit-def: $al
79; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
80; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
81; AVX512-NEXT:    # implicit-def: $dl
82; AVX512-NEXT:    movb %dl, {{[0-9]+}}(%rsp)
83; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
84; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
85; AVX512-NEXT:    tileloadd (%rsi,%rdi), %tmm0
86; AVX512-NEXT:    movl $64, %edi
87; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
88; AVX512-NEXT:    movw $8, %dx
89; AVX512-NEXT:    tilestored %tmm0, (%rsi,%rdi)
90; AVX512-NEXT:    movl $buf2, %edx
91; AVX512-NEXT:    movl $32, %esi
92; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
93; AVX512-NEXT:    movl $64, %esi
94; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
95; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
96; AVX512-NEXT:  .LBB0_3: # %if.end
97; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
98; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
99; AVX512-NEXT:    movl $64, %edi
100; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
101; AVX512-NEXT:    movw $8, %si
102; AVX512-NEXT:    # implicit-def: $al
103; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
104; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
105; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
106; AVX512-NEXT:    tileloadd (%rdx,%rdi), %tmm0
107; AVX512-NEXT:    movabsq $64, %rdx
108; AVX512-NEXT:    tilestored %tmm0, 1024(%rsp,%rdx) # 1024-byte Folded Spill
109; AVX512-NEXT:    movl $64, %r8d
110; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
111; AVX512-NEXT:    movw $8, %dx
112; AVX512-NEXT:    # implicit-def: $al
113; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
114; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
115; AVX512-NEXT:    # implicit-def: $al
116; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
117; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
118; AVX512-NEXT:    # implicit-def: $al
119; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
120; AVX512-NEXT:    movw %si, {{[0-9]+}}(%rsp)
121; AVX512-NEXT:    # implicit-def: $al
122; AVX512-NEXT:    movb %al, {{[0-9]+}}(%rsp)
123; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
124; AVX512-NEXT:    # implicit-def: $dl
125; AVX512-NEXT:    movb %dl, {{[0-9]+}}(%rsp)
126; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
127; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
128; AVX512-NEXT:    tileloadd (%rdi,%r8), %tmm2
129; AVX512-NEXT:    movl $64, %edi
130; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
131; AVX512-NEXT:    tileloadd (%rdx,%rdi), %tmm0
132; AVX512-NEXT:    movw $8, %dx
133; AVX512-NEXT:    movabsq $64, %rdi
134; AVX512-NEXT:    tileloadd 1024(%rsp,%rdi), %tmm1 # 1024-byte Folded Reload
135; AVX512-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
136; AVX512-NEXT:    movl $64, %esi
137; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
138; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
139; AVX512-NEXT:    movl $64, %esi
140; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
141; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
142; AVX512-NEXT:    movl $buf, %edx
143; AVX512-NEXT:    movl $32, %esi
144; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
145; AVX512-NEXT:    movq %rbp, %rsp
146; AVX512-NEXT:    popq %rbp
147; AVX512-NEXT:    tilerelease
148; AVX512-NEXT:    vzeroupper
149; AVX512-NEXT:    retq
150entry:
; A zero %cond takes the if.else arm; any non-zero value takes if.then.
151  %tobool.not = icmp eq i32 %cond, 0
152  br i1 %tobool.not, label %if.else, label %if.then
153
154if.then:                                          ; preds = %entry
; Three tile loads from @buf with stride operand 32; only the two leading
; i16 operands differ between the calls.
155  %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
156  %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
157  %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf, i64 32)
158  br label %if.end
159
160if.else:                                          ; preds = %entry
; Same three loads as if.then, but reading from @buf2.
161  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
162  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
163  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
164  br label %if.end
165
166if.end:                                           ; preds = %if.else, %if.then
; Each phi picks the tile produced by whichever arm ran: %a from the first
; load, %b from the second, %c from the third.
167  %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
168  %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
169  %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
; tdpbssd receives (%row, %col, 8) followed by the %c, %a and %b tiles, in
; that order; its result is stored to @buf with stride operand 32.
170  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
171  tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 32, x86_amx %6)
172  ret void
173}
174
; Declarations of the three AMX intrinsics called by @test_api: the tile
; load and tdpbssd both return an x86_amx value; the tile store takes one as
; its last operand and returns void.
175declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
176declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
177declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
178