xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
1; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
2; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_40 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940,GFX90A_40 %s
5
; Declarations of the MFMA intrinsics exercised by the tests below. Each MFMA
; intrinsic takes two source operands (float, <4 x half> or i32), an
; accumulator vector whose type equals the result type, and three trailing i32
; operands.
6declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
7declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
8declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
9declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32)
10declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32)
11declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
12declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
13declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
14declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
15declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
16declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32)
17declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32)
18declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
19declare i32 @llvm.amdgcn.workitem.id.x()
20
; Codegen test for llvm.amdgcn.mfma.f32.32x32x1f32. The <32 x float>
; accumulator is loaded from %arg, the two source operands are the constants
; 1.0 and 2.0, and the trailing immediates 1, 2, 3 are expected to appear as
; cbsz:1 abid:2 blgp:3 on the MFMA.
; Expectations per target: gfx908 writes all 32 accumulator lanes from v
; registers and reads them back with v_accvgpr_read_b32 before storing from v
; registers; gfx90a/gfx940 write the accumulator from s registers, and gfx90a
; stores straight from a registers with no v_accvgpr_read_b32.
21; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32:
22; GCN-DAG:        v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
23; GCN-DAG:        v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
24; GCN-DAG:        s_load_dwordx16
25; GCN-DAG:        s_load_dwordx16
26; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
27; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
28; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
29; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
30; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
31; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
32; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
33; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
34; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
35; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
36; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
37; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
38; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
39; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
40; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
41; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
42; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
43; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
44; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
45; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
46; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
47; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
48; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
49; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
50; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
51; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
52; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
53; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
54; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
55; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
56; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
57; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
58; GFX90A_40-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
59; GFX908_A:       v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
60; GFX940:         v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
61; GFX908-COUNT-4: v_accvgpr_read_b32
62; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
63; GFX908-COUNT-4: v_accvgpr_read_b32
64; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
65; GFX908-COUNT-4: v_accvgpr_read_b32
66; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
67; GFX908-COUNT-4: v_accvgpr_read_b32
68; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
69; GFX90A-NOT:     v_accvgpr_read_b32
70; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
71define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
72bb:
73  %in.1 = load <32 x float>, ptr addrspace(1) %arg
74  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
75  store <32 x float> %mai.1, ptr addrspace(1) %arg
76  ret void
77}
78
; Codegen test for llvm.amdgcn.mfma.f32.16x16x1f32. The <16 x float>
; accumulator is loaded from %arg, the source operands are the constants 1.0
; and 2.0, and the trailing immediates 1, 2, 3 are expected to appear as
; cbsz:1 abid:2 blgp:3 on the MFMA.
; The sixteen gfx908 accumulator writes are matched with one -DAG line each
; (FileCheck has no combined DAG/COUNT directive), so they may interleave with
; the constant moves and the scalar load. After the MFMA, gfx908 must read all
; sixteen lanes back before the four stores; gfx90a stores directly from a
; registers.
79; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32:
80; GCN-DAG:           v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
81; GCN-DAG:           v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
82; GCN-DAG:           s_load_dwordx16
83; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
84; GFX90A_40-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
85; GFX908_A:          v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
86; GFX940:            v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
87; GFX908-COUNT-16:   v_accvgpr_read_b32
88; GFX908-COUNT-4:    global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
89; GFX90A-NOT:        v_accvgpr_read_b32
90; GFX90A-COUNT-4:    global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
91define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
92bb:
93  %in.1 = load <16 x float>, ptr addrspace(1) %arg
94  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
95  store <16 x float> %mai.1, ptr addrspace(1) %arg
96  ret void
97}
98
; Codegen test for llvm.amdgcn.mfma.f32.4x4x1f32. The <4 x float> accumulator
; is loaded from %arg, the source operands are the constants 1.0 and 2.0, and
; the trailing immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; The MFMA destination tuple is captured as [[RES]]; gfx90a must store that
; tuple directly, while gfx908 is expected to read four lanes back with
; v_accvgpr_read_b32 before the store.
99; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32:
100; GCN-DAG:          v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
101; GCN-DAG:          v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
102; GCN:              s_load_dwordx4
103; GFX908-COUNT-4:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
104; GFX90A_40-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
105; GFX908_A:         v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
106; GFX940:           v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
107; GFX908-COUNT-4:   v_accvgpr_read_b32
108; GFX908:           global_store_dwordx4
109; GFX90A-NOT:       v_accvgpr_read_b32
110; GFX90A:           global_store_dwordx4 {{v[0-9]+}}, [[RES]]
111define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
112bb:
113  %in.1 = load <4 x float>, ptr addrspace(1) %arg
114  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
115  store <4 x float> %mai.1, ptr addrspace(1) %arg
116  ret void
117}
118
; Codegen test for llvm.amdgcn.mfma.f32.32x32x2f32. The <16 x float>
; accumulator is loaded from %arg, the source operands are the constants 1.0
; and 2.0, and the trailing immediates 1, 2, 3 are expected to appear as
; cbsz:1 abid:2 blgp:3 on the MFMA.
; The sixteen gfx908 accumulator writes are matched with one -DAG line each
; (FileCheck has no combined DAG/COUNT directive), so they may interleave with
; the constant moves and the scalar load. After the MFMA, gfx908 reads sixteen
; lanes back before the four stores; gfx90a stores directly from a registers.
119; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32:
120; GCN-DAG:           v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
121; GCN-DAG:           v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
122; GCN-DAG:           s_load_dwordx16
123; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
124; GFX90A_40-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
125; GFX908_A:          v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
126; GFX940:            v_mfma_f32_32x32x2_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
127; GFX908-COUNT-16:   v_accvgpr_read_b32
128; GFX908-COUNT-4:    global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
129; GFX90A-NOT:        v_accvgpr_read_b32
130; GFX90A-COUNT-4:    global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
131define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
132bb:
133  %in.1 = load <16 x float>, ptr addrspace(1) %arg
134  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
135  store <16 x float> %mai.1, ptr addrspace(1) %arg
136  ret void
137}
138
; Codegen test for llvm.amdgcn.mfma.f32.16x16x4f32. The <4 x float>
; accumulator is loaded from %arg, the source operands are the constants 1.0
; and 2.0, and the trailing immediates 1, 2, 3 are expected as
; cbsz:1 abid:2 blgp:3.
; The MFMA destination tuple is captured as [[RES]]; gfx90a must store that
; tuple directly, while gfx908 is expected to read four lanes back with
; v_accvgpr_read_b32 before the store.
139; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32:
140; GCN-DAG:          v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
141; GCN-DAG:          v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
142; GCN:              s_load_dwordx4
143; GFX908-COUNT-4:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
144; GFX90A_40-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
145; GFX908_A:         v_mfma_f32_16x16x4f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
146; GFX940:           v_mfma_f32_16x16x4_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
147; GFX908-COUNT-4:   v_accvgpr_read_b32
148; GFX908:           global_store_dwordx4
149; GFX90A-NOT:       v_accvgpr_read_b32
150; GFX90A:           global_store_dwordx4 {{v[0-9]+}}, [[RES]],
151define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
152bb:
153  %in.1 = load <4 x float>, ptr addrspace(1) %arg
154  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
155  store <4 x float> %mai.1, ptr addrspace(1) %arg
156  ret void
157}
158
; Codegen test for llvm.amdgcn.mfma.f32.32x32x4f16. The <32 x float>
; accumulator is loaded from %arg; the two <4 x half> source operands are
; loaded from %c and from the element following it (getelementptr index 1).
; The trailing immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3, and
; both source operands are expected in v register tuples.
159; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16:
160; GCN-DAG:           s_load_dwordx16
161; GCN-DAG:           s_load_dwordx16
162; GFX908-COUNT-32:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
163; GFX90A_40-COUNT-32:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
164; GFX908_A:          v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
165; GFX940:            v_mfma_f32_32x32x4_2b_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
166; GFX908-COUNT-32:   v_accvgpr_read_b32
167; GFX908:            global_store_dwordx4
168; GFX90A-NOT:        v_accvgpr_read_b32
169; GFX90A-COUNT-8:    global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
170define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
171bb:
172  %in.1 = load <32 x float>, ptr addrspace(1) %arg
173  %c.1 = load <4 x half>, ptr addrspace(1) %c
174  %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
175  %c.2 = load <4 x half>, ptr addrspace(1) %c2p
176  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3)
177  store <32 x float> %mai.1, ptr addrspace(1) %arg
178  ret void
179}
180
; Codegen test for llvm.amdgcn.mfma.f32.16x16x4f16. The <16 x float>
; accumulator is loaded from %arg; the two <4 x half> source operands are
; loaded from %c and from the element following it (getelementptr index 1).
; The trailing immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3, and
; both source operands are expected in v register tuples.
181; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16:
182; GCN:               s_load_dwordx16
183; GFX908-COUNT-16:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
184; GFX90A_40-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
185; GFX908_A:          v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
186; GFX940:            v_mfma_f32_16x16x4_4b_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
187; GFX908-COUNT-16:   v_accvgpr_read_b32
188; GFX908:            global_store_dwordx4
189; GFX90A-NOT:        v_accvgpr_read_b32
190; GFX90A-COUNT-4:    global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
191define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
192bb:
193  %in.1 = load <16 x float>, ptr addrspace(1) %arg
194  %c.1 = load <4 x half>, ptr addrspace(1) %c
195  %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
196  %c.2 = load <4 x half>, ptr addrspace(1) %c2p
197  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3)
198  store <16 x float> %mai.1, ptr addrspace(1) %arg
199  ret void
200}
201
; Codegen test for llvm.amdgcn.mfma.f32.4x4x4f16. The <4 x float> accumulator
; is loaded from %arg; the two <4 x half> source operands are loaded from %c
; and from the element following it (getelementptr index 1). The trailing
; immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; The MFMA destination tuple is captured as [[RES]] and gfx90a must store it
; directly, without v_accvgpr_read_b32.
202; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16:
203; GCN:              s_load_dwordx4
204; GCN:              s_load_dwordx4
205; GFX908-COUNT-4:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
206; GFX90A_40-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
207; GFX908_A:         v_mfma_f32_4x4x4f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
208; GFX940:           v_mfma_f32_4x4x4_16b_f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
209; GFX908-COUNT-4:   v_accvgpr_read_b32
210; GFX908:           global_store_dwordx4
211; GFX90A-NOT:       v_accvgpr_read_b32
212; GFX90A:           global_store_dwordx4 {{v[0-9]+}}, [[RES]],
213define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
214bb:
215  %in.1 = load <4 x float>, ptr addrspace(1) %arg
216  %c.1 = load <4 x half>, ptr addrspace(1) %c
217  %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
218  %c.2 = load <4 x half>, ptr addrspace(1) %c2p
219  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3)
220  store <4 x float> %mai.1, ptr addrspace(1) %arg
221  ret void
222}
223
; Codegen test for llvm.amdgcn.mfma.f32.32x32x8f16. The <16 x float>
; accumulator is loaded from %arg; the two <4 x half> source operands are
; loaded from %c and from the element following it (getelementptr index 1).
; The trailing immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; The checks additionally expect an s_waitcnt lgkmcnt(0) after the scalar
; load and, on gfx908/gfx90a, a v_mov_b32 from an s register before the
; accumulator writes.
224; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16:
225; GCN:               s_load_dwordx16
226; GCN:               s_waitcnt lgkmcnt(0)
227; GFX908_A:          v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
228; GFX908-COUNT-16:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
229; GFX90A_40-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
230; GFX908_A:          v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
231; GFX940:            v_mfma_f32_32x32x8_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
232; GFX908-COUNT-16:   v_accvgpr_read_b32
233; GFX908:            global_store_dwordx4
234; GFX90A-NOT:        v_accvgpr_read_b32
235; GFX90A-COUNT-4:    global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
236define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
237bb:
238  %in.1 = load <16 x float>, ptr addrspace(1) %arg
239  %c.1 = load <4 x half>, ptr addrspace(1) %c
240  %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
241  %c.2 = load <4 x half>, ptr addrspace(1) %c2p
242  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3)
243  store <16 x float> %mai.1, ptr addrspace(1) %arg
244  ret void
245}
246
; Codegen test for llvm.amdgcn.mfma.f32.16x16x16f16. The <4 x float>
; accumulator is loaded from %arg; the two <4 x half> source operands are
; loaded from %c and from the element following it (getelementptr index 1).
; The trailing immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; The MFMA destination tuple is captured as [[RES]] and gfx90a must store it
; directly, without v_accvgpr_read_b32.
247; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16:
248; GCN:              s_load_dwordx4
249; GCN:              s_load_dwordx4
250; GFX908-COUNT-4:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
251; GFX90A_40-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
252; GFX908_A:         v_mfma_f32_16x16x16f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
253; GFX940:           v_mfma_f32_16x16x16_f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
254; GFX908-COUNT-4:   v_accvgpr_read_b32
255; GFX908:           global_store_dwordx4
256; GFX90A-NOT:       v_accvgpr_read_b32
257; GFX90A:           global_store_dwordx4 {{v[0-9]+}}, [[RES]],
258define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
259bb:
260  %in.1 = load <4 x float>, ptr addrspace(1) %arg
261  %c.1 = load <4 x half>, ptr addrspace(1) %c
262  %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
263  %c.2 = load <4 x half>, ptr addrspace(1) %c2p
264  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3)
265  store <4 x float> %mai.1, ptr addrspace(1) %arg
266  ret void
267}
268
; Codegen test for llvm.amdgcn.mfma.i32.32x32x4i8. The <32 x i32> accumulator
; is loaded from %arg, the source operands are the integer constants 1 and 2,
; and the trailing immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; On gfx908 all 32 accumulator lanes are expected to be written from v
; registers (one -DAG line per write) and read back before the store;
; gfx90a/gfx940 write the accumulator from s registers and gfx90a stores
; directly from a registers.
269; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8:
270; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2
271; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1
272; GCN-DAG:         s_load_dwordx16
273; GCN-DAG:         s_load_dwordx16
274; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
275; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
276; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
277; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
278; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
279; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
280; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
281; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
282; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
283; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
284; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
285; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
286; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
287; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
288; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
289; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
290; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
291; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
292; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
293; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
294; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
295; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
296; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
297; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
298; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
299; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
300; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
301; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
302; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
303; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
304; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
305; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
306; GFX90A_40-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
307; GFX908_A:        v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
308; GFX940:          v_mfma_i32_32x32x4_2b_i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
309; GFX908-COUNT-32: v_accvgpr_read_b32
310; GFX908:          global_store_dwordx4
311; GFX90A-NOT:      v_accvgpr_read_b32
312; GFX90A-COUNT-8:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
313define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
314bb:
315  %in.1 = load <32 x i32>, ptr addrspace(1) %arg
316  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
317  store <32 x i32> %mai.1, ptr addrspace(1) %arg
318  ret void
319}
320
; Codegen test for llvm.amdgcn.mfma.i32.16x16x4i8. The <16 x i32> accumulator
; is loaded from %arg, the source operands are the integer constants 1 and 2,
; and the trailing immediates 1, 2, 3 are expected to appear as
; cbsz:1 abid:2 blgp:3 on the MFMA.
; The sixteen gfx908 accumulator writes are matched with one -DAG line each
; (FileCheck has no combined DAG/COUNT directive), so they may interleave with
; the constant moves and the scalar load. After the MFMA, gfx908 reads sixteen
; lanes back before storing; gfx90a stores directly from a registers.
321; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8:
322; GCN-DAG:           v_mov_b32_e32 [[TWO:v[0-9]+]], 2
323; GCN-DAG:           v_mov_b32_e32 [[ONE:v[0-9]+]], 1
324; GCN-DAG:           s_load_dwordx16
325; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
326; GFX90A_40-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
327; GFX908_A:          v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
328; GFX940:            v_mfma_i32_16x16x4_4b_i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
329; GFX908-COUNT-16:   v_accvgpr_read_b32
330; GFX908:            global_store_dwordx4
331; GFX90A-NOT:        v_accvgpr_read_b32
332; GFX90A-COUNT-4:    global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
333define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
334bb:
335  %in.1 = load <16 x i32>, ptr addrspace(1) %arg
336  %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
337  store <16 x i32> %mai.1, ptr addrspace(1) %arg
338  ret void
339}
340
; Codegen test for llvm.amdgcn.mfma.i32.4x4x4i8. The <4 x i32> accumulator is
; loaded from %arg, the source operands are the integer constants 1 and 2, and
; the trailing immediates 1, 2, 3 are expected as cbsz:1 abid:2 blgp:3.
; The MFMA destination tuple is captured as [[RES]] and gfx90a must store it
; directly, without v_accvgpr_read_b32.
341; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8:
342; GCN-DAG:          v_mov_b32_e32 [[TWO:v[0-9]+]], 2
343; GCN-DAG:          v_mov_b32_e32 [[ONE:v[0-9]+]], 1
344; GCN:              s_load_dwordx4
345; GFX908-COUNT-4:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
346; GFX90A_40-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
347; GFX908_A:         v_mfma_i32_4x4x4i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
348; GFX940:           v_mfma_i32_4x4x4_16b_i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
349; GFX908-COUNT-4:   v_accvgpr_read_b32
350; GFX908:           global_store_dwordx4
351; GFX90A-NOT:       v_accvgpr_read_b32
352; GFX90A:           global_store_dwordx4 {{v[0-9]+}}, [[RES]],
353define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
354bb:
355  %in.1 = load <4 x i32>, ptr addrspace(1) %arg
356  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
357  store <4 x i32> %mai.1, ptr addrspace(1) %arg
358  ret void
359}
360
; Two chained llvm.amdgcn.mfma.f32.32x32x1f32 calls: the result of the first
; is the accumulator operand of the second. The checks capture the first
; MFMA's destination tuple as [[MAI1]] and require the second MFMA to be the
; very next instruction (-NEXT) and to take [[MAI1]] as its accumulator.
361; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc:
362; GFX908_A:      v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
363; GFX908_A-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
364; GFX940:        v_mfma_f32_32x32x1_2b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
365; GFX940-NEXT:   v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
366define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
367bb:
368  %in.1 = load <32 x float>, ptr addrspace(1) %arg
369  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
370  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
371  store <32 x float> %mai.2, ptr addrspace(1) %arg
372  ret void
373}
374
; Two chained llvm.amdgcn.mfma.f32.16x16x1f32 calls: the result of the first
; is the accumulator operand of the second. The checks capture the first
; MFMA's destination tuple as [[MAI1]] and require the second MFMA to be the
; very next instruction (-NEXT) and to take [[MAI1]] as its accumulator.
375; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_forward_acc:
376; GFX908_A:      v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
377; GFX908_A-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
378; GFX940:        v_mfma_f32_16x16x1_4b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
379; GFX940-NEXT:   v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
380define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
381bb:
382  %in.1 = load <16 x float>, ptr addrspace(1) %arg
383  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
384  %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
385  store <16 x float> %mai.2, ptr addrspace(1) %arg
386  ret void
387}
388
; Two chained llvm.amdgcn.mfma.f32.4x4x1f32 calls: the result of the first is
; the accumulator operand of the second. The checks capture the first MFMA's
; destination tuple as [[MAI1]] and require the second MFMA to take it as its
; accumulator. On gfx908/gfx90a the second MFMA must be the very next
; instruction; on gfx940 exactly one "s_nop 1" is expected between the two.
389; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_forward_acc:
390; GFX908_A:      v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
391; GFX908_A-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
392; GFX940:        v_mfma_f32_4x4x1_16b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
393; GFX940-NEXT:   s_nop 1
394; GFX940-NEXT:   v_mfma_f32_4x4x1_16b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
395define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
396bb:
397  %in.1 = load <4 x float>, ptr addrspace(1) %arg
398  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
399  %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
400  store <4 x float> %mai.2, ptr addrspace(1) %arg
401  ret void
402}
403
; llvm.amdgcn.mfma.f32.4x4x1f32 with a constant splat-of-1.0 accumulator.
; The default gfx908 run expects the splat to be materialised with four
; v_accvgpr_write_b32 of 1.0 and passed as an a-register tuple. The gfx908 run
; with -mattr=-mfma-inline-literal-bug, as well as gfx90a and gfx940, expect
; 1.0 to appear directly as the accumulator operand of the MFMA.
404; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat:
405; GCN-DAG:        v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
406; GCN-DAG:        v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
407; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
408; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
409; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
410; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
411; NOLIT-SRCC:     v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
412; LIT-SRCC:       v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0
413; GFX90A:         v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0
414; GFX940:         v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0
415; GFX908-COUNT-4: v_accvgpr_read_b32
416; GFX908:         global_store_dwordx4
417; GFX90A-NOT:     v_accvgpr_read_b32
418; GFX90A:         global_store_dwordx4 {{v[0-9]+}}, [[RES]],
419define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
420bb:
421  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
422  store <4 x float> %mai.1, ptr addrspace(1) %arg
423  ret void
424}
425
; llvm.amdgcn.mfma.f32.16x16x1f32 with a constant splat-of-1.0 accumulator.
; The default gfx908 run expects at least one v_accvgpr_write_b32 of 1.0 and
; an a-register tuple as the accumulator operand. The gfx908 run with
; -mattr=-mfma-inline-literal-bug, as well as gfx90a and gfx940, expect 1.0 to
; appear directly as the accumulator operand of the MFMA.
426; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat:
427; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
428; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
429; NOLIT-SRCC-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
430; NOLIT-SRCC:      v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
431; LIT-SRCC:        v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
432; GFX90A:          v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
433; GFX940:          v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
434; GFX908-COUNT-16: v_accvgpr_read_b32
435; GFX908:          global_store_dwordx4
436; GFX90A-NOT:      v_accvgpr_read_b32
437; GFX90A-COUNT-4:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
438define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
439bb:
440  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
441  store <16 x float> %mai.1, ptr addrspace(1) %arg
442  ret void
443}
444
; llvm.amdgcn.mfma.f32.32x32x8f16 with constant operands: <4 x half> splats
; of 1.0 and 2.0 as sources and a <16 x float> splat of 1.0 as accumulator.
; The checks expect the half splats to be materialised as the packed constants
; 0x3c003c00 and 0x40004000 in the first register of each source tuple.
; The default gfx908 run expects the accumulator in an a-register tuple; the
; gfx908 run with -mattr=-mfma-inline-literal-bug, gfx90a and gfx940 expect
; 1.0 directly as the accumulator operand.
445; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat:
446; GCN-DAG:         v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000
447; GCN-DAG:         v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00
448; NOLIT-SRCC-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
449; NOLIT-SRCC:      v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}]
450; LIT-SRCC:        v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0
451; GFX90A:          v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0
452; GFX940:          v_mfma_f32_32x32x8_f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0
453; GFX908-COUNT-16: v_accvgpr_read_b32
454; GFX908:          global_store_dwordx4
455; GFX90A-NOT:      v_accvgpr_read_b32
456; GFX90A-COUNT-4:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
457define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %arg) #0 {
458bb:
459  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
460  store <16 x float> %mai.1, ptr addrspace(1) %arg
461  ret void
462}
463
464; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat:
465; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
466; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
467; NOLIT-SRCC-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, 0
468; NOLIT-SRCC:      v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
469; LIT-SRCC:        v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
470; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
471; GFX940:          v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
472; GFX908-COUNT-32: v_accvgpr_read_b32
473; GFX908:          global_store_dwordx4
474; GFX90A-NOT:      v_accvgpr_read_b32
475; GFX90A-COUNT-8:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
476define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
477bb:
478  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
479  store <32 x float> %mai.1, ptr addrspace(1) %arg
480  ret void
481}
482
; llvm.amdgcn.mfma.f32.4x4x1f32 with a non-splat constant accumulator
; <1.0, 2.0, 1.0, 1.0>. Every target is expected to write 1.0 and 2.0 into a
; registers; gfx908 is expected to write the two remaining 1.0 lanes as
; immediates, while gfx90a is expected to produce them with two
; v_accvgpr_mov_b32 copies. The MFMA takes an a-register tuple as accumulator.
483; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm:
484; GCN-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
485; GCN-DAG:        v_accvgpr_write_b32 a{{[0-9]+}}, 2.0
486; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
487; GFX908-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
488; GFX90A-DAG:     v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
489; GFX90A-DAG:     v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
490; GFX908_A:       v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
491; GFX940:         v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
492; GFX908-COUNT-4: v_accvgpr_read_b32
493; GFX908:         global_store_dwordx4
494; GFX90A-NOT:     v_accvgpr_read_b32
495; GFX90A:         global_store_dwordx4 {{v[0-9]+}}, [[RES]],
496define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 {
497bb:
498  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
499  store <4 x float> %mai.1, ptr addrspace(1) %arg
500  ret void
501}
502
503; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm:
504; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
505; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, 2.0
506; GFX908-COUNT-14: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
507; GFX90A-COUNT-14: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
508; GFX908_A:        v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
509; GFX940:          v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
510; GFX908-COUNT-16: v_accvgpr_read_b32
511; GFX908:          global_store_dwordx4
512; GFX90A-NOT:      v_accvgpr_read_b32
513; GFX90A-COUNT-4:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
; Kernel under test. Calls the 16x16x1f32 MFMA intrinsic with constant A/B
; operands (1.0 and 2.0) and a constant <16 x float> accumulator that is 1.0 in
; every lane except the last, which is 2.0. The result is stored to %arg.
514define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #0 {
515bb:
  ; Only the final lane (2.0) breaks the splat of 1.0.
516  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0)
517  store <16 x float> %mai.1, ptr addrspace(1) %arg
518  ret void
519}
520
521; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm:
522; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, 0
523; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
524; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
525; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
526; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
527; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
528; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
529; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
530; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
531; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
532; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
533; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
534; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
535; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
536; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
537; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
538; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
539; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
540; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
541; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
542; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
543; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
544; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
545; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
546; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
547; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
548; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
549; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
550; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
551; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
552; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
553; GFX908-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, 0
554; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
555; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
556; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
557; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
558; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
559; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
560; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
561; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
562; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
563; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
564; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
565; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
566; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
567; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
568; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
569; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
570; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
571; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
572; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
573; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
574; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
575; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
576; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
577; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
578; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
579; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
580; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
581; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
582; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
583; GFX90A-DAG:      v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
584; GFX908_A:        v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
585; GFX940:          v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
586; GFX908-COUNT-32: v_accvgpr_read_b32
587; GFX908:          global_store_dwordx4
588; GFX90A-NOT:      v_accvgpr_read_b32
589; GFX90A-COUNT-8:  global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
590define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #0 {
591bb:
592  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
593  store <32 x float> %mai.1, ptr addrspace(1) %arg
594  ret void
595}
596
597; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat:
598; GCN:            v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
599; GCN:            v_accvgpr_write_b32 [[TTMPA:a[0-9]+]], [[TMP]]
600; GFX908:         v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
601; GFX908:         v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
602; GFX908:         v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
603; GFX90A:         v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
604; GFX90A:         v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
605; GFX90A:         v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
606; GFX908_A:       v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
607; GFX940:         v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
608; GFX908-COUNT-4: v_accvgpr_read_b32
609; GFX908:         global_store_dwordx4
610; GFX90A-NOT:     v_accvgpr_read_b32
611; GFX90A:         global_store_dwordx4 {{v[0-9]+}}, [[RES]]
; Kernel under test. Calls the 4x4x1f32 MFMA intrinsic with constant A/B
; operands (1.0 and 2.0) and a <4 x float> accumulator that is a splat of 123.0,
; then stores the result to the <4 x float> slot of %arg selected by the
; workitem id. The i64 %idx parameter is not referenced anywhere in the body.
612define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %arg, i64 %idx) #0 {
613bb:
  ; Per-workitem destination address: %arg indexed by workitem.id.x.
614  %tid = call i32 @llvm.amdgcn.workitem.id.x()
615  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
  ; 123.0 is the same value in all four lanes; the check lines above expect it
  ; to be materialized as the 32-bit pattern 0x42f60000.
616  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
617  ; Disabled alternative that would store straight to %arg; the live store below writes through %gep instead.
618  store <4 x float> %mai.1, ptr addrspace(1) %gep
619  ret void
620}
621
622; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code:
623; GCN:      v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000
624; GCN:      v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]]
625; GFX90A_40-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]]
626; GFX908-NEXT:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
627; GFX908-NEXT:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
628; GFX908-NEXT:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
629; GCN: s_nop 0
630; GFX908_A:  v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
631; GFX940:    v_mfma_f32_4x4x1_16b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
632; GFX908-COUNT-4: v_accvgpr_read_b32
633; GFX908:    global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}]
634; GFX90A_40: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
; Kernel under test. Same MFMA call as the lit_splat kernel (constant 1.0/2.0
; operands, accumulator splat of 123.0), but the result is stored directly to
; %arg. %tid and %gep are still computed here, yet no instruction uses %gep.
; NOTE(review): the dead %tid/%gep pair looks deliberate for this "bad code"
; scenario -- confirm before removing it.
635define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspace(1) %arg) #0 {
636bb:
637  %tid = call i32 @llvm.amdgcn.workitem.id.x()
638  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
639
640  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
  ; The store targets the base pointer %arg, not the per-workitem %gep.
641  store <4 x float> %mai.1, ptr addrspace(1) %arg
642  ret void
643}
644
645; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg:
646; GFX90A_40-DAG:     v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
647; GFX90A_40-DAG:     v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
648; GCN-COUNT-8:       global_load_dwordx4
649; GFX908-COUNT-16:   v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
650; GFX90A_40-NOT:     v_accvgpr_write
651; GFX908-DAG:        v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
652; GFX908-DAG:        v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
653; GFX908:            v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
654; GFX90A:            v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
655; GFX940:            v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
656; GFX908:            v_accvgpr_read_b32
657; GFX908-COUNT-8:    global_store_dwordx4
658; GFX90A_40-NOT:     v_accvgpr_read_b32
659; GFX90A_40-COUNT-5: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
; Kernel under test. Loads a <32 x float> accumulator from the slot of %arg
; selected by the workitem id, passes it to the 32x32x1f32 MFMA intrinsic with
; constant A/B operands (1.0 and 2.0), and stores the result back to that slot.
; The trailing i32 modifiers 1, 2, 3 are the values the check lines above
; expect to be printed as cbsz, abid and blgp respectively.
; NOTE(review): a <32 x float> store is eight dwordx4 stores, but the
; gfx90a/gfx940 store check above only requires five -- confirm this is intended.
660define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg) #0 {
661bb:
662  %tid = call i32 @llvm.amdgcn.workitem.id.x()
663  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
  ; The accumulator comes from memory here rather than from a constant.
664  %in.1 = load <32 x float>, ptr addrspace(1) %gep
665  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
666  store <32 x float> %mai.1, ptr addrspace(1) %gep
667  ret void
668}
669
; Attribute group #0, referenced by the kernels above. It sets the
; "amdgpu-flat-work-group-size" string attribute to "1,256".
670attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
671