; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s

; Checks which register form (a[...] accumulator registers or v[...] vector
; registers) is emitted for the destination and accumulator operands of
; v_mfma_f32_32x32x1 under different function attributes, inline asm
; constraints, calls, and calling conventions. The gfx940 run lines share the
; check prefix used for gfx90a, so both targets are held to the same
; expectations.

declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)

; Kernel with attribute group #0 ("amdgpu-no-agpr" plus waves-per-eu=2).
; The gfx908 run expects the a[...] form; the gfx90a/gfx940 runs expect the
; v[...] form.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vgpr:
; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Kernel with attribute group #2 ("amdgpu-no-agpr" but no waves-per-eu
; setting). Every run line expects the a[...] form.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_agpr:
; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #2 {
bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Kernel without an attribute group whose inline asm defines a value with the
; "={a0}" constraint. Every run line expects the a[...] form.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) {
bb:
  %acc = call i32 asm sideeffect "; def $0", "={a0}"()
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Kernel without an attribute group whose inline asm takes an input in the
; register range a[100:131]. Every run line expects the a[...] form.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) {
bb:
  call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> undef)
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Kernel with attribute group #0 whose inline asm only defines v0. The
; expectations match the _vgpr test above: a[...] on gfx908, v[...] on
; gfx90a/gfx940.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addrspace(1) %arg) #0 {
bb:
  %acc = call i32 asm sideeffect "; def $0", "={v0}"()
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Kernel with attribute group #1 (no "amdgpu-no-agpr") that calls the external
; function @foo before the MFMA. Every run line expects the a[...] form.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call:
; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #1 {
bb:
  call void @foo()
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; We could avoid the scan to find calls, since we see them during lowering
; before selection. However, in SelectionDAG, lowering and selection are done
; block by block, so this would only work in GlobalISel.

; Same as the _call test, except that the call to @foo sits in a different
; basic block (bb2) from the MFMA (bb1). Every run line expects the a[...]
; form.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call_multi_bb:
; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #1 {
bb1:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  br i1 %c0, label %bb2, label %bb3
  ; NOTE(review): this unlabeled branch follows a terminator, so it appears to
  ; form an implicit anonymous block with no predecessors. Kept as-is; confirm
  ; whether it is intentional.
  br label %bb2

bb2:
  call void @foo()
  br label %bb3

bb3:
  ret void
}

; Non-kernel function (default calling convention) with attribute group #0.
; Every run line expects the a[...] form, unlike the _vgpr kernel that uses
; the same attribute group.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry:
; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define void @test_mfma_f32_32x32x1f32_nonentry(ptr addrspace(1) %arg) #0 {
bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

declare void @foo()

attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "amdgpu-no-agpr" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" }