xref: /llvm-project/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll (revision e6bf48d11047e970cb24554a01b65b566d6b5d22)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
; Scratch buffer shared by every test below: 3072 zero-initialized bytes that
; the tile intrinsics address at byte offsets 0, 1024 and 2048.
3@buf = dso_local global [3072 x i8] zeroinitializer, align 16
4
; test1: straight-line function. Three tiles are loaded from @buf (offsets 0,
; 1024, 2048), combined with tdpbssd, stored back to @buf+2048, and then @foo
; is tail-called. A llvm.dbg.value that refers to the x86_amx result sits
; between the dot-product and the store.
; The expected assembly builds the tile configuration in stack memory (a
; zeroing vmovups followed by byte/word stores), issues a single ldtilecfg
; before the first tileloadd, and emits tilerelease before the tail jump.
; NOTE(review): the dbg.value presumably checks that a debug use of an AMX
; value does not disturb config insertion - confirm against the original commit.
5define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
6; CHECK-LABEL: test1:
7; CHECK:       # %bb.0:
8; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
9; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
10; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
11; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
12; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
13; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
14; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
15; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
16; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
17; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
18; CHECK-NEXT:    movl $buf, %eax
19; CHECK-NEXT:    movl $32, %ecx
20; CHECK-NEXT:    movw $8, %dx
21; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm0
22; CHECK-NEXT:    movl $buf+1024, %eax
23; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm1
24; CHECK-NEXT:    movl $buf+2048, %eax
25; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
26; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
27; CHECK-NEXT:    tilestored %tmm2, (%rax,%rcx)
28; CHECK-NEXT:    tilerelease
29; CHECK-NEXT:    vzeroupper
30; CHECK-NEXT:    jmp foo # TAILCALL
  ; Tile shapes mix the two arguments with the constant 8; every load uses a
  ; stride operand of 32.
31  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32)
32  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
33  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
34  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
  ; Debug-only use of the AMX value; it has no counterpart in the expected assembly.
35  call void @llvm.dbg.value(metadata x86_amx %6, metadata !DILocalVariable(name: "1", scope: !2), metadata !DIExpression()), !dbg !3
36  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %6)
37  tail call void @foo()
38  ret void
39}
40
; test2: @foo is called first, then an undef-conditioned branch selects one of
; two arms that each run a tile sequence ending in a store to @buf+2048.
; The expected assembly copies both arguments into %ebx/%ebp before the call,
; fills the configuration (at positive offsets from %rsp, after subq $72) ahead
; of the call, and places ldtilecfg immediately after `callq foo`. Both arms
; share one tilerelease in the common exit block.
41define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
42; CHECK-LABEL: test2:
43; CHECK:       # %bb.0:
44; CHECK-NEXT:    pushq %rbp
45; CHECK-NEXT:    pushq %rbx
46; CHECK-NEXT:    subq $72, %rsp
47; CHECK-NEXT:    movl %esi, %ebx
48; CHECK-NEXT:    movl %edi, %ebp
49; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
50; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
51; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
52; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
53; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
54; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
55; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
56; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
57; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
58; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
59; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
60; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
61; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
62; CHECK-NEXT:    vzeroupper
63; CHECK-NEXT:    callq foo
64; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
65; CHECK-NEXT:    xorl %eax, %eax
66; CHECK-NEXT:    testb %al, %al
67; CHECK-NEXT:    jne .LBB1_3
68; CHECK-NEXT:  # %bb.1: # %if.true
69; CHECK-NEXT:    movw $8, %ax
70; CHECK-NEXT:    tilezero %tmm0
71; CHECK-NEXT:    movl $32, %ecx
72; CHECK-NEXT:    movl $buf+1024, %edx
73; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm1
74; CHECK-NEXT:    movl $buf+2048, %edx
75; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm2
76; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
77; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rcx)
78; CHECK-NEXT:    jmp .LBB1_2
79; CHECK-NEXT:  .LBB1_3: # %if.false
80; CHECK-NEXT:    movl $buf, %eax
81; CHECK-NEXT:    movl $32, %ecx
82; CHECK-NEXT:    movw $8, %dx
83; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm3
84; CHECK-NEXT:    movl $buf+1024, %eax
85; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm4
86; CHECK-NEXT:    movl $buf+2048, %eax
87; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
88; CHECK-NEXT:    tdpbssd %tmm2, %tmm4, %tmm3
89; CHECK-NEXT:    tilestored %tmm3, (%rax,%rcx)
90; CHECK-NEXT:  .LBB1_2: # %if.true
91; CHECK-NEXT:    addq $72, %rsp
92; CHECK-NEXT:    popq %rbx
93; CHECK-NEXT:    popq %rbp
94; CHECK-NEXT:    tilerelease
95; CHECK-NEXT:    retq
  ; The call precedes every AMX intrinsic in the function.
96  call void @foo()
97  br i1 undef, label %if.true, label %if.false
98
; Arm 1: the accumulator tile comes from tilezero rather than from memory.
99if.true:
100  %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
101  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
102  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
103  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
104  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
105  br label %exit
106
; Arm 2: all three tiles are loaded from @buf.
107if.false:
108  %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32)
109  %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
110  %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
111  %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
112  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
113  br label %exit
114
115exit:
116  ret void
117}
118
; test3: the first shape argument of the tile is a phi. An undef-conditioned
; branch computes %0 + 1 or %0 - 1, and the merged value is used by tilezero
; and tilestored in %exit.
; The expected assembly zeroes the configuration and stores the argument-
; derived fields in the entry block, rewrites one byte from %dil after the
; incl/decl in the exit block, and only then issues ldtilecfg there.
119define dso_local void @test3(i16 signext %0, i16 signext %1) nounwind {
120; CHECK-LABEL: test3:
121; CHECK:       # %bb.0:
122; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
123; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
124; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
125; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
126; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
127; CHECK-NEXT:    xorl %eax, %eax
128; CHECK-NEXT:    testb %al, %al
129; CHECK-NEXT:    jne .LBB2_2
130; CHECK-NEXT:  # %bb.1: # %if.true
131; CHECK-NEXT:    incl %edi
132; CHECK-NEXT:    jmp .LBB2_3
133; CHECK-NEXT:  .LBB2_2: # %if.false
134; CHECK-NEXT:    decl %edi
135; CHECK-NEXT:  .LBB2_3: # %exit
136; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
137; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
138; CHECK-NEXT:    tilezero %tmm0
139; CHECK-NEXT:    movl $buf, %eax
140; CHECK-NEXT:    movl $32, %ecx
141; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
142; CHECK-NEXT:    tilerelease
143; CHECK-NEXT:    vzeroupper
144; CHECK-NEXT:    retq
145  br i1 undef, label %if.true, label %if.false
146
147if.true:
148  %3 = add i16 %0, 1
149  br label %exit
150
151if.false:
152  %4 = sub i16 %0, 1
153  br label %exit
154
155exit:
  ; %5 is only known after the merge, so the tile shape depends on the path taken.
156  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
157  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
158  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, ptr @buf, i64 32, x86_amx %6)
159  ret void
160}
161
; test4: two levels of undef-conditioned branches. %if.true and %if.false each
; compute a shape value (%0 + 1 / %0 - 1) and can both reach %amx1 and %amx2,
; each of which merges the value through its own phi before using it as a
; tile shape argument.
; The expected assembly places the shape-byte store and ldtilecfg in each of
; the two predecessor blocks (%if.true and %if.false) rather than in the AMX
; blocks, and each AMX block ends with its own tilerelease/vzeroupper/retq.
162define dso_local void @test4(i16 signext %0, i16 signext %1) nounwind {
163; CHECK-LABEL: test4:
164; CHECK:       # %bb.0:
165; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
166; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
167; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
168; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
169; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
170; CHECK-NEXT:    xorl %eax, %eax
171; CHECK-NEXT:    testb %al, %al
172; CHECK-NEXT:    jne .LBB3_3
173; CHECK-NEXT:  # %bb.1: # %if.true
174; CHECK-NEXT:    incl %edi
175; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
176; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
177; CHECK-NEXT:    xorl %eax, %eax
178; CHECK-NEXT:    testb %al, %al
179; CHECK-NEXT:    jne .LBB3_4
180; CHECK-NEXT:  .LBB3_2: # %amx2
181; CHECK-NEXT:    movl $32, %eax
182; CHECK-NEXT:    movl $buf+1024, %ecx
183; CHECK-NEXT:    tileloadd (%rcx,%rax), %tmm0
184; CHECK-NEXT:    movl $buf, %ecx
185; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rax)
186; CHECK-NEXT:    tilerelease
187; CHECK-NEXT:    vzeroupper
188; CHECK-NEXT:    retq
189; CHECK-NEXT:  .LBB3_3: # %if.false
190; CHECK-NEXT:    decl %edi
191; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
192; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
193; CHECK-NEXT:    xorl %eax, %eax
194; CHECK-NEXT:    testb %al, %al
195; CHECK-NEXT:    jne .LBB3_2
196; CHECK-NEXT:  .LBB3_4: # %amx1
197; CHECK-NEXT:    tilezero %tmm0
198; CHECK-NEXT:    movl $buf, %eax
199; CHECK-NEXT:    movl $32, %ecx
200; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
201; CHECK-NEXT:    tilerelease
202; CHECK-NEXT:    vzeroupper
203; CHECK-NEXT:    retq
204  br i1 undef, label %if.true, label %if.false
205
206if.true:
207  %3 = add i16 %0, 1
208  br i1 undef, label %amx1, label %amx2
209
210if.false:
211  %4 = sub i16 %0, 1
  ; Successor order is swapped relative to %if.true.
212  br i1 undef, label %amx2, label %amx1
213
; Zero a tile of the merged shape and store it to @buf.
214amx1:
215  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
216  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
217  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, ptr @buf, i64 32, x86_amx %6)
218  br label %exit
219
; Load a tile of the merged shape from @buf+1024 and store it to @buf.
220amx2:
221  %7 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
222  %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %7, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
223  tail call void @llvm.x86.tilestored64.internal(i16 %7, i16 %1, ptr @buf, i64 32, x86_amx %8)
224  br label %exit
225
226exit:
227  ret void
228}
229
; test5: loop whose tile shape is a loop-carried phi. %2 starts as %1 and is
; replaced by %5 on each back edge; %if.true uses it as the second shape
; argument of tilezero/tilestored, while %if.false produces %1 - 1 instead.
; The loop repeats while %5 equals 7.
; The expected assembly initialises the configuration before the loop, updates
; its word field from %r8w in %if.false, and issues ldtilecfg at the top of
; the loop header (%loop.bb1), i.e. once per iteration. tilerelease appears
; only in %exit.
230define dso_local void @test5(i16 signext %0, i16 signext %1) nounwind {
231; CHECK-LABEL: test5:
232; CHECK:       # %bb.0: # %entry
233; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
234; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
235; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
236; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
237; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
238; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
239; CHECK-NEXT:    xorl %eax, %eax
240; CHECK-NEXT:    movl $buf, %ecx
241; CHECK-NEXT:    movl $32, %edx
242; CHECK-NEXT:    leal -1(%rsi), %r8d
243; CHECK-NEXT:    jmp .LBB4_1
244; CHECK-NEXT:    .p2align 4
245; CHECK-NEXT:  .LBB4_3: # %if.false
246; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
247; CHECK-NEXT:    movl %r8d, %esi
248; CHECK-NEXT:    movw %r8w, -{{[0-9]+}}(%rsp)
249; CHECK-NEXT:    cmpw $7, %si
250; CHECK-NEXT:    jne .LBB4_5
251; CHECK-NEXT:  .LBB4_1: # %loop.bb1
252; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
253; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
254; CHECK-NEXT:    testb %al, %al
255; CHECK-NEXT:    jne .LBB4_3
256; CHECK-NEXT:  # %bb.2: # %if.true
257; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
258; CHECK-NEXT:    tilezero %tmm0
259; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rdx)
260; CHECK-NEXT:    cmpw $7, %si
261; CHECK-NEXT:    je .LBB4_1
262; CHECK-NEXT:  .LBB4_5: # %exit
263; CHECK-NEXT:    tilerelease
264; CHECK-NEXT:    vzeroupper
265; CHECK-NEXT:    retq
266entry:
267  br label %loop.bb1
268
269loop.bb1:
  ; Loop-carried shape value: %1 on entry, %5 on every later iteration.
270  %2 = phi i16 [ %1, %entry ], [ %5, %loop.bb2 ]
271  br i1 undef, label %if.true, label %if.false
272
273if.true:
274  %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %2)
275  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %2, ptr @buf, i64 32, x86_amx %3)
276  br label %loop.bb2
277
278if.false:
  ; Derived from the original argument %1, not from the loop-carried %2.
279  %4 = sub i16 %1, 1
280  br label %loop.bb2
281
282loop.bb2:
283  %5 = phi i16 [ %2, %if.true ], [ %4, %if.false ]
284  %6 = icmp eq i16 %5, 7
285  br i1 %6, label %loop.bb1, label %exit
286
287exit:
288  ret void
289}
290
; test6: loop in which the tile shape is computed in the same block that uses
; it. A counter phi %1 (starting at 0) is incremented or decremented on an
; undef-conditioned branch; %loop.bb2 merges the result, forms %6 = %0 + %4,
; and uses %6 as the second shape argument of tilezero/tilestored. The loop
; repeats while %4 equals 7.
; The expected assembly stores the freshly computed word (%r8w) into the
; configuration and issues ldtilecfg inside %loop.bb2, directly ahead of
; tilezero. tilerelease appears only in %exit.
291define dso_local void @test6(i16 signext %0) nounwind {
292; CHECK-LABEL: test6:
293; CHECK:       # %bb.0: # %entry
294; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
295; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
296; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
297; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
298; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
299; CHECK-NEXT:    xorl %eax, %eax
300; CHECK-NEXT:    movl $buf, %ecx
301; CHECK-NEXT:    movl $32, %edx
302; CHECK-NEXT:    xorl %esi, %esi
303; CHECK-NEXT:    jmp .LBB5_1
304; CHECK-NEXT:    .p2align 4
305; CHECK-NEXT:  .LBB5_3: # %if.false
306; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
307; CHECK-NEXT:    decl %esi
308; CHECK-NEXT:  .LBB5_4: # %loop.bb2
309; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
310; CHECK-NEXT:    leal (%rdi,%rsi), %r8d
311; CHECK-NEXT:    movw %r8w, -{{[0-9]+}}(%rsp)
312; CHECK-NEXT:    cmpw $7, %si
313; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
314; CHECK-NEXT:    tilezero %tmm0
315; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rdx)
316; CHECK-NEXT:    jne .LBB5_5
317; CHECK-NEXT:  .LBB5_1: # %loop.bb1
318; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
319; CHECK-NEXT:    testb %al, %al
320; CHECK-NEXT:    jne .LBB5_3
321; CHECK-NEXT:  # %bb.2: # %if.true
322; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
323; CHECK-NEXT:    incl %esi
324; CHECK-NEXT:    jmp .LBB5_4
325; CHECK-NEXT:  .LBB5_5: # %exit
326; CHECK-NEXT:    tilerelease
327; CHECK-NEXT:    vzeroupper
328; CHECK-NEXT:    retq
329entry:
330  br label %loop.bb1
331
332loop.bb1:
333  %1 = phi i16 [ 0, %entry ], [ %4, %loop.bb2 ]
334  br i1 undef, label %if.true, label %if.false
335
336if.true:
337  %2 = add i16 %1, 1
338  br label %loop.bb2
339
340if.false:
341  %3 = sub i16 %1, 1
342  br label %loop.bb2
343
344loop.bb2:
345  %4 = phi i16 [ %2, %if.true ], [ %3, %if.false ]
346  %5 = icmp eq i16 %4, 7
  ; The shape value is defined immediately before its first AMX use.
347  %6 = add i16 %0, %4
348  %7 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %6)
349  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %6, ptr @buf, i64 32, x86_amx %7)
350  br i1 %5, label %loop.bb1, label %exit
351
352exit:
353  ret void
354}
355
356
; External declarations used by the tests above.
; @foo is an opaque callee (called in test1 and test2). The *.internal AMX
; intrinsics take their tile shape as leading i16 arguments and pass tile
; values as x86_amx.
357declare dso_local void @foo() nounwind
358declare void @llvm.dbg.value(metadata, metadata, metadata)
359declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
360declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
361declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
362declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
363
; Minimal debug-info metadata: it supplies the scope (!2) and location (!3)
; referenced by the llvm.dbg.value call in test1.
364!llvm.dbg.cu = !{!0}
365!llvm.module.flags = !{!1}
366
367!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !DIFile(filename: "1", directory: "1"))
368!1 = !{i32 2, !"Debug Info Version", i32 3}
369!2 = distinct !DISubprogram(unit: !0)
370!3 = !DILocation(line: 1, column: 1, scope: !2)
371
371