1; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s 2; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s 3; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_40 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940,GFX90A_40 %s 5 6declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) 7declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) 8declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) 9declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32) 10declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32) 11declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32) 12declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32) 13declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32) 14declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32) 15declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32) 16declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32) 17declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32) 18declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) 19declare i32 @llvm.amdgcn.workitem.id.x() 20 21; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: 
22; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 23; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 24; GCN-DAG: s_load_dwordx16 25; GCN-DAG: s_load_dwordx16 26; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 27; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 28; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 29; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 30; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 31; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 32; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 33; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 34; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 35; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 36; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 37; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 38; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 39; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 40; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 41; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 42; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 43; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 44; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 45; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 46; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 47; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 48; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 49; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 50; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 51; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 52; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 53; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 54; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 55; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 56; GFX908-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 57; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 58; GFX90A_40-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 59; GFX908_A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 60; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 61; GFX908-COUNT-4: v_accvgpr_read_b32 62; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 63; GFX908-COUNT-4: v_accvgpr_read_b32 64; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 65; GFX908-COUNT-4: v_accvgpr_read_b32 66; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 67; GFX908-COUNT-4: v_accvgpr_read_b32 68; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 69; GFX90A-NOT: v_accvgpr_read_b32 70; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 71define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { 72bb: 73 %in.1 = load <32 x float>, ptr addrspace(1) %arg 74 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) 75 store <32 x float> %mai.1, ptr addrspace(1) %arg 76 ret void 77} 78 79; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: 80; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 81; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 82; GCN-DAG: s_load_dwordx16 83; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 84; GFX90A_40-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 85; GFX908_A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 86; GFX940: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 87; GFX908-COUNT: v_accvgpr_read_b32 88; GFX908-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 89; GFX90A-NOT: v_accvgpr_read_b32 90; GFX90A-COUNT-4: 
global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 91define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { 92bb: 93 %in.1 = load <16 x float>, ptr addrspace(1) %arg 94 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3) 95 store <16 x float> %mai.1, ptr addrspace(1) %arg 96 ret void 97} 98 99; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: 100; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 101; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 102; GCN: s_load_dwordx4 103; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 104; GFX90A_40-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 105; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 106; GFX940: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 107; GFX908-COUNT-4: v_accvgpr_read_b32 108; GFX908: global_store_dwordx4 109; GFX90A-NOT: v_accvgpr_read_b32 110; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]] 111define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { 112bb: 113 %in.1 = load <4 x float>, ptr addrspace(1) %arg 114 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3) 115 store <4 x float> %mai.1, ptr addrspace(1) %arg 116 ret void 117} 118 119; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32: 120; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 121; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 122; GCN-DAG: s_load_dwordx16 123; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 124; GFX90A_40-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 125; GFX908_A: v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 126; GFX940: v_mfma_f32_32x32x2_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 
127; GFX908-COUNT-16: v_accvgpr_read_b32 128; GFX908-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 129; GFX90A-NOT: v_accvgpr_read_b32 130; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 131define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { 132bb: 133 %in.1 = load <16 x float>, ptr addrspace(1) %arg 134 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3) 135 store <16 x float> %mai.1, ptr addrspace(1) %arg 136 ret void 137} 138 139; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32: 140; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 141; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 142; GCN: s_load_dwordx4 143; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 144; GFX90A_40-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 145; GFX908_A: v_mfma_f32_16x16x4f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 146; GFX940: v_mfma_f32_16x16x4_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 147; GFX908-COUNT-4: v_accvgpr_read_b32 148; GFX908: global_store_dwordx4 149; GFX90A-NOT: v_accvgpr_read_b32 150; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], 151define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { 152bb: 153 %in.1 = load <4 x float>, ptr addrspace(1) %arg 154 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3) 155 store <4 x float> %mai.1, ptr addrspace(1) %arg 156 ret void 157} 158 159; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16: 160; GCN-DAG: s_load_dwordx16 161; GCN-DAG: s_load_dwordx16 162; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 163; GFX90A_40-COUNT-32:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 164; GFX908_A: v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 
a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 165; GFX940: v_mfma_f32_32x32x4_2b_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 166; GFX908-COUNT-32: v_accvgpr_read_b32 167; GFX908: global_store_dwordx4 168; GFX90A-NOT: v_accvgpr_read_b32 169; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], 170define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { 171bb: 172 %in.1 = load <32 x float>, ptr addrspace(1) %arg 173 %c.1 = load <4 x half>, ptr addrspace(1) %c 174 %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1 175 %c.2 = load <4 x half>, ptr addrspace(1) %c2p 176 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3) 177 store <32 x float> %mai.1, ptr addrspace(1) %arg 178 ret void 179} 180 181; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16: 182; GCN: s_load_dwordx16 183; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 184; GFX90A_40-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 185; GFX908_A: v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 186; GFX940: v_mfma_f32_16x16x4_4b_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 187; GFX908-COUNT-16: v_accvgpr_read_b32 188; GFX908: global_store_dwordx4 189; GFX90A-NOT: v_accvgpr_read_b32 190; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], 191define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { 192bb: 193 %in.1 = load <16 x float>, ptr addrspace(1) %arg 194 %c.1 = load <4 x half>, ptr addrspace(1) %c 195 %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1 196 %c.2 = load <4 x half>, ptr addrspace(1) %c2p 197 %mai.1 = tail call <16 x float> 
@llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3) 198 store <16 x float> %mai.1, ptr addrspace(1) %arg 199 ret void 200} 201 202; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16: 203; GCN: s_load_dwordx4 204; GCN: s_load_dwordx4 205; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 206; GFX90A_40-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 207; GFX908_A: v_mfma_f32_4x4x4f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 208; GFX940: v_mfma_f32_4x4x4_16b_f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 209; GFX908-COUNT-4: v_accvgpr_read_b32 210; GFX908: global_store_dwordx4 211; GFX90A-NOT: v_accvgpr_read_b32 212; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], 213define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { 214bb: 215 %in.1 = load <4 x float>, ptr addrspace(1) %arg 216 %c.1 = load <4 x half>, ptr addrspace(1) %c 217 %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1 218 %c.2 = load <4 x half>, ptr addrspace(1) %c2p 219 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3) 220 store <4 x float> %mai.1, ptr addrspace(1) %arg 221 ret void 222} 223 224; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16: 225; GCN: s_load_dwordx16 226; GCN: s_waitcnt lgkmcnt(0) 227; GFX908_A: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} 228; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 229; GFX90A_40-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 230; GFX908_A: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 231; GFX940: v_mfma_f32_32x32x8_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] 
cbsz:1 abid:2 blgp:3 232; GFX908-COUNT-16: v_accvgpr_read_b32 233; GFX908: global_store_dwordx4 234; GFX90A-NOT: v_accvgpr_read_b32 235; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], 236define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { 237bb: 238 %in.1 = load <16 x float>, ptr addrspace(1) %arg 239 %c.1 = load <4 x half>, ptr addrspace(1) %c 240 %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1 241 %c.2 = load <4 x half>, ptr addrspace(1) %c2p 242 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3) 243 store <16 x float> %mai.1, ptr addrspace(1) %arg 244 ret void 245} 246 247; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16: 248; GCN: s_load_dwordx4 249; GCN: s_load_dwordx4 250; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 251; GFX90A_40-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 252; GFX908_A: v_mfma_f32_16x16x16f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 253; GFX940: v_mfma_f32_16x16x16_f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 254; GFX908-COUNT-4: v_accvgpr_read_b32 255; GFX908: global_store_dwordx4 256; GFX90A-NOT: v_accvgpr_read_b32 257; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], 258define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { 259bb: 260 %in.1 = load <4 x float>, ptr addrspace(1) %arg 261 %c.1 = load <4 x half>, ptr addrspace(1) %c 262 %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1 263 %c.2 = load <4 x half>, ptr addrspace(1) %c2p 264 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3) 265 store <4 x float> %mai.1, ptr addrspace(1) %arg 266 ret void 267} 268 
269; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8: 270; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 271; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 272; GCN-DAG: s_load_dwordx16 273; GCN-DAG: s_load_dwordx16 274; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 275; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 276; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 277; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 278; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 279; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 280; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 281; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 282; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 283; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 284; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 285; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 286; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 287; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 288; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 289; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 290; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 291; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 292; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 293; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 294; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 295; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 296; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 297; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 298; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 299; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 300; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 301; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 302; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 
303; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 304; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 305; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 306; GFX90A_40-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 307; GFX908_A: v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 308; GFX940: v_mfma_i32_32x32x4_2b_i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 309; GFX908-COUNT-32: v_accvgpr_read_b32 310; GFX908: global_store_dwordx4 311; GFX90A-NOT: v_accvgpr_read_b32 312; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], 313define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { 314bb: 315 %in.1 = load <32 x i32>, ptr addrspace(1) %arg 316 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3) 317 store <32 x i32> %mai.1, ptr addrspace(1) %arg 318 ret void 319} 320 321; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8: 322; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 323; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 324; GCN-DAG: s_load_dwordx16 325; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 326; GFX90A_40-COUNT-16:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 327; GFX908_A: v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 328; GFX940: v_mfma_i32_16x16x4_4b_i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 329; GFX908-COUNT-16: v_accvgpr_read_b32 330; GFX908: global_store_dwordx4 331; GFX90A-NOT: v_accvgpr_read_b32 332; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], 333define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { 334bb: 335 %in.1 = load <16 x i32>, ptr addrspace(1) %arg 336 %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3) 
337 store <16 x i32> %mai.1, ptr addrspace(1) %arg 338 ret void 339} 340 341; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8: 342; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 343; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 344; GCN: s_load_dwordx4 345; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 346; GFX90A_40-COUNT-4:v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 347; GFX908_A: v_mfma_i32_4x4x4i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 348; GFX940: v_mfma_i32_4x4x4_16b_i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 349; GFX908-COUNT-4: v_accvgpr_read_b32 350; GFX908: global_store_dwordx4 351; GFX90A-NOT: v_accvgpr_read_b32 352; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], 353define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { 354bb: 355 %in.1 = load <4 x i32>, ptr addrspace(1) %arg 356 %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3) 357 store <4 x i32> %mai.1, ptr addrspace(1) %arg 358 ret void 359} 360 361; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc: 362; GFX908_A: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 363; GFX908_A-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] 364; GFX940: v_mfma_f32_32x32x1_2b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 365; GFX940-NEXT: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] 366define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) %arg) #0 { 367bb: 368 %in.1 = load <32 x float>, ptr addrspace(1) %arg 369 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) 370 %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, 
i32 0, i32 0) 371 store <32 x float> %mai.2, ptr addrspace(1) %arg 372 ret void 373} 374 375; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_forward_acc: 376; GFX908_A: v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 377; GFX908_A-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] 378; GFX940: v_mfma_f32_16x16x1_4b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 379; GFX940-NEXT: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] 380define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) %arg) #0 { 381bb: 382 %in.1 = load <16 x float>, ptr addrspace(1) %arg 383 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) 384 %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) 385 store <16 x float> %mai.2, ptr addrspace(1) %arg 386 ret void 387} 388 389; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_forward_acc: 390; GFX908_A: v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 391; GFX908_A-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] 392; GFX940: v_mfma_f32_4x4x1_16b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 393; GFX940-NEXT: s_nop 1 394; GFX940-NEXT: v_mfma_f32_4x4x1_16b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] 395define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %arg) #0 { 396bb: 397 %in.1 = load <4 x float>, ptr addrspace(1) %arg 398 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0) 399 %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0) 400 store <4 x 
float> %mai.2, ptr addrspace(1) %arg 401 ret void 402} 403 404; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat: 405; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 406; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 407; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 408; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 409; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 410; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 411; NOLIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] 412; LIT-SRCC: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0 413; GFX90A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0 414; GFX940: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0 415; GFX908-COUNT-4: v_accvgpr_read_b32 416; GFX908: global_store_dwordx4 417; GFX90A-NOT: v_accvgpr_read_b32 418; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], 419define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %arg) #0 { 420bb: 421 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) 422 store <4 x float> %mai.1, ptr addrspace(1) %arg 423 ret void 424} 425 426; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat: 427; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 428; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 429; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 430; NOLIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] 431; LIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 432; GFX90A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 433; GFX940: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 434; GFX908-COUNT-16: v_accvgpr_read_b32 435; GFX908: global_store_dwordx4 436; GFX90A-NOT: v_accvgpr_read_b32 437; GFX90A-COUNT-4: global_store_dwordx4 
{{v[0-9]+}}, a[{{[0-9:]+}}], 438define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %arg) #0 { 439bb: 440 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) 441 store <16 x float> %mai.1, ptr addrspace(1) %arg 442 ret void 443} 444 445; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat: 446; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000 447; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00 448; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 449; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}] 450; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0 451; GFX90A: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0 452; GFX940: v_mfma_f32_32x32x8_f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0 453; GFX908-COUNT-16: v_accvgpr_read_b32 454; GFX908: global_store_dwordx4 455; GFX90A-NOT: v_accvgpr_read_b32 456; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], 457define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %arg) #0 { 458bb: 459 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) 460 store <16 x float> %mai.1, ptr addrspace(1) %arg 461 ret void 462} 463 464; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat: 465; GCN-DAG: 
v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 466; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 467; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 468; NOLIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] 469; LIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 470; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 471; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 472; GFX908-COUNT-32: v_accvgpr_read_b32 473; GFX908: global_store_dwordx4 474; GFX90A-NOT: v_accvgpr_read_b32 475; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], 476define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %arg) #0 { 477bb: 478 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0) 479 store <32 x float> %mai.1, ptr addrspace(1) %arg 480 ret void 481} 482 483; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm: 484; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 485; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 486; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 487; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 488; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 489; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 490; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 491; GFX940: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 492; GFX908-COUNT-4: v_accvgpr_read_b32 493; GFX908: global_store_dwordx4 494; GFX90A-NOT: 
v_accvgpr_read_b32 495; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], 496define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 { 497bb: 498 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) 499 store <4 x float> %mai.1, ptr addrspace(1) %arg 500 ret void 501} 502 503; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm: 504; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 505; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 506; GFX908-COUNT-14: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 507; GFX90A-COUNT-14: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 508; GFX908_A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 509; GFX940: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 510; GFX908-COUNT-16: v_accvgpr_read_b32 511; GFX908: global_store_dwordx4 512; GFX90A-NOT: v_accvgpr_read_b32 513; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], 514define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #0 { 515bb: 516 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0) 517 store <16 x float> %mai.1, ptr addrspace(1) %arg 518 ret void 519} 520 521; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm: 522; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 523; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 524; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 525; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 526; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 527; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 528; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 529; GFX908-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, 0 530; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 531; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 532; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 533; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 534; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 535; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 536; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 537; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 538; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 539; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 540; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 541; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 542; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 543; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 544; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 545; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 546; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 547; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 548; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 549; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 550; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 551; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 552; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 553; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 554; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 555; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 556; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 557; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 558; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 559; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 560; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 561; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 562; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 563; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 564; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 565; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, 
a{{[0-9]+}} 566; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 567; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 568; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 569; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 570; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 571; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 572; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 573; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 574; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 575; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 576; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 577; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 578; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 579; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 580; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 581; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 582; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 583; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} 584; GFX908_A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 585; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 586; GFX908-COUNT-32: v_accvgpr_read_b32 587; GFX908: global_store_dwordx4 588; GFX90A-NOT: v_accvgpr_read_b32 589; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], 590define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #0 { 591bb: 592 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, 
float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Splat of a non-inline literal as the MFMA accumulator operand.
; 0x42f60000 is the IEEE-754 single-precision bit pattern of 123.0, the value
; used for all four accumulator lanes in the call below. The checks expect the
; literal to be materialized once into a VGPR and written to one AGPR; the
; gfx908 run then expects three more writes from that same VGPR, while the
; gfx90a run expects three AGPR-to-AGPR moves from the first AGPR.
; The result is stored through %gep, i.e. at an address indexed by the
; workitem id.
; NOTE(review): the i64 %idx kernel argument is not referenced in the body.
; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat:
; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
; GCN: v_accvgpr_write_b32 [[TTMPA:a[0-9]+]], [[TMP]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
; GFX940: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
; GFX908-COUNT-4: v_accvgpr_read_b32
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]]
define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %arg, i64 %idx) #0 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
  ;store <4 x float> %mai.1, ptr addrspace(1) %arg
  store <4 x float> %mai.1, ptr addrspace(1) %gep
  ret void
}

; Same 123.0 splat accumulator as the previous test, but the result is stored
; directly through %arg instead of through the workitem-indexed pointer.
; The gfx908 run uses -NEXT checks, so the three extra AGPR writes must follow
; the first one on consecutive lines; every run expects an s_nop 0 before the
; MFMA instruction.
; NOTE(review): %tid and %gep are computed below but never used - presumably
; deliberate given the test name; confirm before removing.
; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code:
; GCN: v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000
; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]]
; GFX90A_40-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]]
; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
; GCN: s_nop 0
; GFX908_A: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
; GFX940: v_mfma_f32_4x4x1_16b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
; GFX908-COUNT-4: v_accvgpr_read_b32
; GFX908: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}]
; GFX90A_40: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspace(1) %arg) #0 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid

  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Accumulator operand loaded from memory rather than built from constants.
; The <32 x float> input is read from, and the result written back to, the
; workitem-indexed slot %gep. The trailing i32 1, i32 2, i32 3 immediates of
; the intrinsic call are expected to appear as cbsz:1 abid:2 blgp:3 on the
; MFMA instruction. The gfx908 run expects the loaded values to be copied
; into AGPRs with v_accvgpr_write_b32 and read back before the stores; the
; gfx90a and gfx940 runs expect no v_accvgpr_write / v_accvgpr_read_b32 and
; stores that take their data straight from AGPRs.
; NOTE(review): the gfx90a/gfx940 store check uses -COUNT-5 while the gfx908
; one uses -COUNT-8 for the same <32 x float> result - confirm whether the
; smaller count is intentional.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg:
; GFX90A_40-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
; GFX90A_40-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN-COUNT-8: global_load_dwordx4
; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A_40-NOT: v_accvgpr_write
; GFX908-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
; GFX908-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX908: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4
; GFX90A_40-NOT: v_accvgpr_read_b32
; GFX90A_40-COUNT-5: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg) #0 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
  %in.1 = load <32 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, ptr addrspace(1) %gep
  ret void
}

; Attribute group shared by the kernels above: sets the
; "amdgpu-flat-work-group-size" string attribute to "1,256".
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }