; xref: /llvm-project/llvm/test/CodeGen/X86/AMX/amx-config.ll (revision a21abc782a8e1cb718a10c471a3b634f3102fc1c)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+avx2 -verify-machineinstrs | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+avx -verify-machineinstrs | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -verify-machineinstrs | FileCheck %s --check-prefix=SSE2

; Two zero-initialized, 64-byte-aligned 1024-byte buffers; @test_api loads its
; tiles from one of them and stores the result tile to @buf.
@buf = dso_local global [1024 x i8] zeroinitializer, align 64
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 64

; test_api: checks the tile-configuration sequence llc emits ahead of AMX tile
; instructions, once per vector feature level selected by the RUN lines.
;
; What the expected output pins down:
;   * The 64-byte stack area that ldtilecfg reads is zero-filled with the
;     widest vector store available: one zmm store with +avx512f, two ymm
;     stores with +avx2, and four xmm stores with +avx or with no AVX at all.
;   * The shape values derived from %1 and %2 are written into that area and
;     ldtilecfg is issued once in the entry block, before the conditional
;     branch.
;   * vzeroupper precedes the return only in the two variants that touched
;     ymm/zmm registers.
;
; Arguments: %0 picks the buffer the tiles are loaded from (@buf when non-zero,
; @buf2 when zero); %1 and %2 supply the tile shape values; %xmm0 is returned
; unchanged.
define <4 x i32> @test_api(i32 %0, i16 signext %1, i16 signext %2, <4 x i32> %xmm0) {
; AVX512-LABEL: test_api:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    testl %edi, %edi
; AVX512-NEXT:    movsbl %sil, %eax
; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    je .LBB0_2
; AVX512-NEXT:  # %bb.1:
; AVX512-NEXT:    movl $buf, %ecx
; AVX512-NEXT:    jmp .LBB0_3
; AVX512-NEXT:  .LBB0_2:
; AVX512-NEXT:    movl $buf2, %ecx
; AVX512-NEXT:  .LBB0_3:
; AVX512-NEXT:    movl $32, %edi
; AVX512-NEXT:    tileloadd (%rcx,%rdi), %tmm0
; AVX512-NEXT:    tileloadd (%rcx,%rdi), %tmm2
; AVX512-NEXT:    tileloadd (%rcx,%rdi), %tmm1
; AVX512-NEXT:    tdpbssd %tmm2, %tmm0, %tmm1
; AVX512-NEXT:    movl $buf, %ecx
; AVX512-NEXT:    movl $32, %esi
; AVX512-NEXT:    tilestored %tmm1, (%rcx,%rsi)
; AVX512-NEXT:    tilerelease
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX2-LABEL: test_api:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    testl %edi, %edi
; AVX2-NEXT:    movsbl %sil, %eax
; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    je .LBB0_2
; AVX2-NEXT:  # %bb.1:
; AVX2-NEXT:    movl $buf, %ecx
; AVX2-NEXT:    jmp .LBB0_3
; AVX2-NEXT:  .LBB0_2:
; AVX2-NEXT:    movl $buf2, %ecx
; AVX2-NEXT:  .LBB0_3:
; AVX2-NEXT:    movl $32, %edi
; AVX2-NEXT:    tileloadd (%rcx,%rdi), %tmm0
; AVX2-NEXT:    tileloadd (%rcx,%rdi), %tmm2
; AVX2-NEXT:    tileloadd (%rcx,%rdi), %tmm1
; AVX2-NEXT:    tdpbssd %tmm2, %tmm0, %tmm1
; AVX2-NEXT:    movl $buf, %ecx
; AVX2-NEXT:    movl $32, %esi
; AVX2-NEXT:    tilestored %tmm1, (%rcx,%rsi)
; AVX2-NEXT:    tilerelease
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX1-LABEL: test_api:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    testl %edi, %edi
; AVX1-NEXT:    movsbl %sil, %eax
; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    je .LBB0_2
; AVX1-NEXT:  # %bb.1:
; AVX1-NEXT:    movl $buf, %ecx
; AVX1-NEXT:    jmp .LBB0_3
; AVX1-NEXT:  .LBB0_2:
; AVX1-NEXT:    movl $buf2, %ecx
; AVX1-NEXT:  .LBB0_3:
; AVX1-NEXT:    movl $32, %edi
; AVX1-NEXT:    tileloadd (%rcx,%rdi), %tmm0
; AVX1-NEXT:    tileloadd (%rcx,%rdi), %tmm2
; AVX1-NEXT:    tileloadd (%rcx,%rdi), %tmm1
; AVX1-NEXT:    tdpbssd %tmm2, %tmm0, %tmm1
; AVX1-NEXT:    movl $buf, %ecx
; AVX1-NEXT:    movl $32, %esi
; AVX1-NEXT:    tilestored %tmm1, (%rcx,%rsi)
; AVX1-NEXT:    tilerelease
; AVX1-NEXT:    retq
;
; SSE2-LABEL: test_api:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    testl %edi, %edi
; SSE2-NEXT:    movsbl %sil, %eax
; SSE2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    je .LBB0_2
; SSE2-NEXT:  # %bb.1:
; SSE2-NEXT:    movl $buf, %ecx
; SSE2-NEXT:    jmp .LBB0_3
; SSE2-NEXT:  .LBB0_2:
; SSE2-NEXT:    movl $buf2, %ecx
; SSE2-NEXT:  .LBB0_3:
; SSE2-NEXT:    movl $32, %edi
; SSE2-NEXT:    tileloadd (%rcx,%rdi), %tmm0
; SSE2-NEXT:    tileloadd (%rcx,%rdi), %tmm2
; SSE2-NEXT:    tileloadd (%rcx,%rdi), %tmm1
; SSE2-NEXT:    tdpbssd %tmm2, %tmm0, %tmm1
; SSE2-NEXT:    movl $buf, %ecx
; SSE2-NEXT:    movl $32, %esi
; SSE2-NEXT:    tilestored %tmm1, (%rcx,%rsi)
; SSE2-NEXT:    tilerelease
; SSE2-NEXT:    retq
  ; %6 is the low byte of %1 sign-extended back to i16 (shl 8 / ashr 8); it is
  ; passed as the first shape operand of every AMX intrinsic call below.
  %4 = icmp eq i32 %0, 0
  %5 = shl i16 %1, 8
  %6 = ashr exact i16 %5, 8
  br i1 %4, label %11, label %7

; %0 != 0: load all three tiles from @buf.
7:                                                ; preds = %3
  %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, ptr @buf, i64 32)
  %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf, i64 32)
  %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf, i64 32)
  br label %15

; %0 == 0: load all three tiles from @buf2.
11:                                               ; preds = %3
  %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, ptr @buf2, i64 32)
  %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf2, i64 32)
  %14 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf2, i64 32)
  br label %15

; Join point: the tiles arrive through phis, so the tile instructions in the
; expected output appear after the branch while ldtilecfg stays in the entry
; block. The result is always stored to @buf, whichever buffer was read.
15:                                               ; preds = %11, %7
  %16 = phi x86_amx [ %12, %11 ], [ %8, %7 ]
  %17 = phi x86_amx [ %13, %11 ], [ %9, %7 ]
  %18 = phi x86_amx [ %14, %11 ], [ %10, %7 ]
  %19 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %6, i16 %2, i16 %1, x86_amx %18, x86_amx %16, x86_amx %17)
  tail call void @llvm.x86.tilestored64.internal(i16 %6, i16 %2, ptr @buf, i64 32, x86_amx %19)
  ret <4 x i32> %xmm0
}

; AMX intrinsics called by @test_api: tile load, signed-byte dot-product
; accumulate, and tile store. The leading i16 operands carry tile shape values.
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
