; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s

; Codegen checks for the bf16_1k and f64 MFMA intrinsics, compiled once for
; gfx90a and once for gfx940. Checks under the shared GCN prefix apply to both
; runs; the per-target prefixes carry the MFMA line itself, because the two
; runs expect different mnemonic spellings (and, for the f64 variants, a
; different spelling of the last modifier).

declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()

; 32x32x4 bf16_1k with a <32 x float> accumulator loaded from %arg.
; Expects the 32 accumulator dwords to be written into AGPRs, the A/B operands
; to be the register pairs starting at the materialized constants 1 and 2, the
; three immediates to print as cbsz/abid/blgp, and the result to be stored
; straight from AGPRs with no v_accvgpr_read copy in between.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k:
; GCN-DAG:     s_load_dwordx16
; GCN-DAG:     s_load_dwordx16
; GCN-DAG:     v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG:     v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GFX90A:      v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX940:      v_mfma_f32_32x32x4_2b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT:     v_accvgpr_read_b32
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; 16x16x4 bf16_1k with a <16 x float> accumulator: same shape of checks as
; above, with 16 AGPR writes and 4 dwordx4 stores.
; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k:
; GCN-DAG:     s_load_dwordx16
; GCN-DAG:     v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG:     v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GFX90A:      v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX940:      v_mfma_f32_16x16x4_4b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT:     v_accvgpr_read_b32
; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; 4x4x4 bf16_1k with a <4 x float> accumulator. The destination AGPR range is
; captured as RES and must be the value operand of the single dwordx4 store.
; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k:
; GCN-DAG:     s_load_dwordx4
; GCN-DAG:     v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG:     v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GFX90A:      v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX940:      v_mfma_f32_4x4x4_16b_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT:     v_accvgpr_read_b32
; GCN:         global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; 32x32x8 bf16_1k with a <16 x float> accumulator: 16 AGPR writes, 4 dwordx4
; stores directly from AGPRs.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k:
; GCN-DAG:     s_load_dwordx16
; GCN-DAG:     v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG:     v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GFX90A:      v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX940:      v_mfma_f32_32x32x8_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT:     v_accvgpr_read_b32
; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; 16x16x16 bf16_1k with a <4 x float> accumulator. As in the 4x4x4 case, the
; destination range is captured as RES and must feed the store unchanged.
; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k:
; GCN-DAG:     s_load_dwordx4
; GCN-DAG:     v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG:     v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GFX90A:      v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX940:      v_mfma_f32_16x16x16_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT:     v_accvgpr_read_b32
; GCN:         global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Two chained f64 4x4x4 MFMAs. The first uses a 0.0 accumulator and all-zero
; immediates, so it is expected to take the inline constant 0 as its last
; operand and print no modifiers. The second consumes the first result (M1)
; with immediates 1, 2, 3; the gfx940 run expects the third one to print as
; neg:[1,1,0] where the gfx90a run expects blgp:3.
; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64:
; GFX90A:      v_mfma_f64_4x4x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
; GFX90A:      v_mfma_f64_4x4x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
; GFX940:      v_mfma_f64_4x4x4_4b_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
; GFX940:      v_mfma_f64_4x4x4_4b_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
; GCN:         global_store_dwordx2
define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
bb:
  %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
  %mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
  store double %mai.2, ptr addrspace(1) %arg
  ret void
}

; f64 16x16x4 MFMA with a <4 x double> accumulator loaded from %arg and
; immediates 1, 2, 3; the result is written back with two dwordx4 stores.
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64:
; GCN:         s_load_dwordx8
; GFX90A:      v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX940:      v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 neg:[1,1,0]
; GCN:         global_store_dwordx4
; GCN:         global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
bb:
  %in.1 = load <4 x double>, ptr addrspace(1) %arg
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
  store <4 x double> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Two chained f64 16x16x4 MFMAs where the first accumulator is an all-zero
; vector: it is expected to be encoded as the inline constant 0 with no
; modifiers, and its result (M1) to be the accumulator of the second MFMA.
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm:
; GFX90A:      v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
; GFX90A:      v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
; GFX940:      v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
; GFX940:      v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
; GCN:         global_store_dwordx4
; GCN:         global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
bb:
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
  %mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
  store <4 x double> %mai.2, ptr addrspace(1) %arg
  ret void
}

; Non-splat constant accumulator <0.0, 0.0, 0.0, 1.0>: the MFMA is expected to
; read its accumulator from an AGPR range rather than an inline constant, and
; to print no modifiers since all three immediates are zero.
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_imm:
; GFX90A:      v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
; GFX940:      v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
; GCN:         global_store_dwordx4
; GCN:         global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
bb:
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
  store <4 x double> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Splat accumulator of the non-inlinable literal 123.0. The checks expect the
; value to be materialized into registers (an AGPR write of 0 and a VGPR move
; of 0x405ec000, the high dword of 123.0 encoded as f64) and the MFMA to read
; its accumulator from an AGPR range with no modifiers.
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_lit:
; GCN-DAG:     v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG:     v_mov_b32_e32 v{{[0-9]+}}, 0x405ec000
; GFX90A:      v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
; GFX940:      v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
; GCN:         global_store_dwordx4
; GCN:         global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
bb:
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
  store <4 x double> %mai.1, ptr addrspace(1) %arg
  ret void
}

attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }