; xref: /llvm-project/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll (revision 08ddbab866cb76619f0f4952dc11bab2a9ee1147)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16,+avx512f -verify-machineinstrs | FileCheck %s

; Basic smoke test for the AMX tile intrinsics: tile config, tilezero,
; tileloadd/tileloaddt1, all four tdpb* int8 dot-products, tdpbf16ps, and
; tilestored. CHECK lines are autogenerated by update_llc_test_checks.py.
define void @test_amx(ptr %pointer, ptr %base, i64 %stride) {
; CHECK-LABEL: test_amx:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm1
; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbsud %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbusd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbuud %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tdpbf16ps %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tileloaddt1 (%rsi,%rdx), %tmm1
; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rdx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride)
  %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride)
  %d0 = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
  %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b)
  %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
  %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
  %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
  %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, ptr %base, i64 %stride)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %pointer, i64 %stride, x86_amx %d4)

  ret void
}

; Declarations for the AMX intrinsics exercised by @test_amx.
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)

; Reduced from PR90954: nested loops mixing tile/vector casts with tdpbf16ps
; and tilestored, forcing a 1024-byte tile spill/reload inside the inner loop.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
; CHECK-LABEL: PR90954:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    andq $-1024, %rsp # imm = 0xFC00
; CHECK-NEXT:    subq $5120, %rsp # imm = 0x1400
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    shll $4, %edx
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movw $64, %cx
; CHECK-NEXT:    movw $16, %di
; CHECK-NEXT:    movb $1, %r8b
; CHECK-NEXT:    movl $64, %r9d
; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %r11
; CHECK-NEXT:    xorl %ebx, %ebx
; CHECK-NEXT:    xorl %r14d, %r14d
; CHECK-NEXT:    jmp .LBB1_1
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB1_5: # in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT:    incq %r14
; CHECK-NEXT:    addl %edx, %ebx
; CHECK-NEXT:  .LBB1_1: # =>This Loop Header: Depth=1
; CHECK-NEXT:    # Child Loop BB1_2 Depth 2
; CHECK-NEXT:    movslq %ebx, %r15
; CHECK-NEXT:    leaq (%rsi,%r15,4), %r15
; CHECK-NEXT:    xorl %r12d, %r12d
; CHECK-NEXT:    xorl %r13d, %r13d
; CHECK-NEXT:    jmp .LBB1_2
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB1_4: # in Loop: Header=BB1_2 Depth=2
; CHECK-NEXT:    tilestored %tmm1, (%r15,%rax)
; CHECK-NEXT:    incq %r13
; CHECK-NEXT:    addq $64, %r15
; CHECK-NEXT:    decq %r12
; CHECK-NEXT:    je .LBB1_5
; CHECK-NEXT:  .LBB1_2: # Parent Loop BB1_1 Depth=1
; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilezero %tmm1
; CHECK-NEXT:    testb %r8b, %r8b
; CHECK-NEXT:    jne .LBB1_4
; CHECK-NEXT:  # %bb.3: # in Loop: Header=BB1_2 Depth=2
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    tileloadd (%r10,%r9), %tmm1
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    tileloadd (%r11,%r9), %tmm2
; CHECK-NEXT:    tdpbf16ps %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    tileloadd 3072(%rsp,%rax), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    jmp .LBB1_4
  %4 = shl i32 %2, 4
  %5 = icmp eq i64 0, 0
  br label %6

6:                                                ; preds = %31, %3
  %7 = phi i64 [ 0, %3 ], [ %32, %31 ]
  %8 = trunc nuw nsw i64 %7 to i32
  %9 = mul i32 %4, %8
  %10 = mul i32 0, %8
  %11 = sext i32 %9 to i64
  %12 = getelementptr inbounds i32, ptr %1, i64 %11
  br label %13

13:                                               ; preds = %25, %6
  %14 = phi i64 [ %29, %25 ], [ 0, %6 ]
  %15 = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  %16 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %15)
  %17 = shl nsw i64 %14, 4
  %18 = getelementptr i32, ptr %0, i64 %17
  br i1 %5, label %25, label %19

19:                                               ; preds = %13
  %20 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16)
  %21 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
  %22 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
  %23 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %20, x86_amx %21, x86_amx %22)
  %24 = tail call noundef <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %23)
  br label %25

25:                                               ; preds = %19, %13
  %26 = phi <256 x i32> [ undef, %13 ], [ %24, %19 ]
  %27 = getelementptr inbounds i32, ptr %12, i64 %17
  %28 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %26)
  tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %27, i64 0, x86_amx %28)
  %29 = add nuw nsw i64 %14, 1
  %30 = icmp eq i64 %29, 0
  br i1 %30, label %31, label %13

31:                                               ; preds = %25
  %32 = add nuw nsw i64 %7, 1
  br label %6
}

; A tilezero result with multiple uses: the tile must be spilled and reloaded
; (1024-byte folded spill/reload) so both tdpbf16ps uses see the zero tile.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @multi_use() nounwind {
; CHECK-LABEL: multi_use:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    subq $2928, %rsp # imm = 0xB70
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $64, %ax
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $16, %cx
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movabsq $64, %rbp
; CHECK-NEXT:    tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
; CHECK-NEXT:    tileloadd 896(%rsp,%rbp), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT:    tdpbf16ps %tmm0, %tmm0, %tmm1
; CHECK-NEXT:    tdpbf16ps %tmm0, %tmm0, %tmm0
; CHECK-NEXT:    addq $2928, %rsp # imm = 0xB70
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  %2 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
  %3 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
  ret void
}

; Declarations for the tile/vector cast intrinsics used by @PR90954.
declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)