xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
; Codegen test for the bf16 MFMA intrinsics on AMDGPU. llc is invoked three
; times: for gfx908 with default attributes, for gfx908 with the
; mfma-inline-literal-bug attribute disabled, and for gfx90a. Both gfx908
; invocations are checked against the same set of prefixes.
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
4
; Declarations of the bf16 MFMA intrinsics exercised below. Each one takes two
; <2 x i16> operands, a float-vector accumulator whose type is also the result
; type, and three trailing i32 arguments (the tests pass the constants 1, 2, 3
; and expect them to show up as cbsz:1 abid:2 blgp:3 on the instruction).
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
; NOTE(review): not called by any test visible in this file.
declare i32 @llvm.amdgcn.workitem.id.x()
11
; 32x32x2bf16 with a <32 x float> accumulator that is loaded from %arg and whose
; result is stored back to %arg. The two <2 x i16> operands are bitcasts of the
; constants 1 and 2, expected to be materialized with v_mov_b32 and used as the
; MFMA sources; the immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; On gfx908 the accumulator AGPRs are expected to be written from VGPRs (32
; writes, order-independent) and read back with 32 v_accvgpr_read_b32 before
; the store. On gfx90a the AGPRs are expected to be written directly from
; SGPRs, with no v_accvgpr_read_b32 and 8 dwordx4 stores sourced from AGPRs.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-DAG:         s_load_dwordx16
; GCN-DAG:         s_load_dwordx16
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GCN:             v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX908:          global_store_dwordx4
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %a = bitcast i32 1 to <2 x i16>
  %b = bitcast i32 2 to <2 x i16>
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}
64
; 16x16x2bf16 with a <16 x float> accumulator that is loaded from %arg and whose
; result is stored back to %arg. The two <2 x i16> operands are bitcasts of the
; constants 1 and 2, expected to be materialized with v_mov_b32 and used as the
; MFMA sources; the immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; On gfx908 the 16 accumulator writes from VGPRs are listed as individual DAG
; checks: FileCheck has no suffix that combines DAG with COUNT, so a combined
; spelling is silently ignored and would leave these writes unchecked. On
; gfx90a the AGPRs are expected to be written directly from SGPRs, with no
; v_accvgpr_read_b32 and 4 dwordx4 stores sourced from AGPRs.
; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16:
; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-DAG:         s_load_dwordx16
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GCN:             v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX908-COUNT-16: v_accvgpr_read_b32
; GFX908:          global_store_dwordx4
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX90A-COUNT-4:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
  %a = bitcast i32 1 to <2 x i16>
  %b = bitcast i32 2 to <2 x i16>
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}
85
; 4x4x2bf16 with a <4 x float> accumulator that is loaded from %arg and whose
; result is stored back to %arg. The two <2 x i16> operands are bitcasts of the
; constants 1 and 2, expected to be materialized with v_mov_b32 and used as the
; MFMA sources; the immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; On gfx908 4 accumulator writes from VGPRs are expected before the MFMA and 4
; v_accvgpr_read_b32 after it. On gfx90a the AGPRs are expected to be written
; directly from SGPRs, with no v_accvgpr_read_b32, and the single dwordx4 store
; is expected to use the MFMA result register range itself.
; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16:
; GCN-DAG:        v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG:        v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN:            s_load_dwordx4
; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GCN:            v_mfma_f32_4x4x2bf16 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX908-COUNT-4: v_accvgpr_read_b32
; GFX908:         global_store_dwordx4
; GFX90A-NOT:     v_accvgpr_read_b32
; GFX90A:         global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
  %a = bitcast i32 1 to <2 x i16>
  %b = bitcast i32 2 to <2 x i16>
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}
106
; 32x32x4bf16 with a <16 x float> accumulator that is loaded from %arg and whose
; result is stored back to %arg. The two <2 x i16> operands are bitcasts of the
; constants 1 and 2, expected to be materialized with v_mov_b32 and used as the
; MFMA sources; the immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; On gfx908 the 16 accumulator writes from VGPRs are listed as individual DAG
; checks: FileCheck has no suffix that combines DAG with COUNT, so a combined
; spelling is silently ignored and would leave these writes unchecked. On
; gfx90a all 16 accumulator elements are expected to be written directly from
; SGPRs, with no v_accvgpr_read_b32 and 4 dwordx4 stores sourced from AGPRs.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16:
; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-DAG:         s_load_dwordx16
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GCN:             v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX908-COUNT-16: v_accvgpr_read_b32
; GFX908:          global_store_dwordx4
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX90A-COUNT-4:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
  %a = bitcast i32 1 to <2 x i16>
  %b = bitcast i32 2 to <2 x i16>
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}
127
; 16x16x8bf16 with a <4 x float> accumulator that is loaded from %arg and whose
; result is stored back to %arg. The two <2 x i16> operands are bitcasts of the
; constants 1 and 2, expected to be materialized with v_mov_b32 and used as the
; MFMA sources; the immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; On gfx908 4 accumulator writes from VGPRs are expected before the MFMA and 4
; v_accvgpr_read_b32 after it. On gfx90a the AGPRs are expected to be written
; directly from SGPRs, with no v_accvgpr_read_b32, and the single dwordx4 store
; is expected to use the MFMA result register range itself.
; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16:
; GCN-DAG:        v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG:        v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN:            s_load_dwordx4
; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GCN:            v_mfma_f32_16x16x8bf16 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX908-COUNT-4: v_accvgpr_read_b32
; GFX908:         global_store_dwordx4
; GFX90A-NOT:     v_accvgpr_read_b32
; GFX90A:         global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
  %a = bitcast i32 1 to <2 x i16>
  %b = bitcast i32 2 to <2 x i16>
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}
148
; Attribute group referenced by every kernel defined in this file; it sets the
; "amdgpu-flat-work-group-size" string attribute to "1,256".
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
150