; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s

; Kernel built with attribute group #2 ("amdgpu-num-vgpr"="12"). Two chained
; 4x4x1 MFMA results stay live across a branch whose taken side feeds five
; extra values to an inline asm with "a" (AGPR) operands. The checks below
; expect the compiler to move an AGPR through a VGPR and back (same VGPR
; number captured in VSPILL) without touching scratch memory.
; GCN-LABEL: {{^}}max_12regs_13a_used:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; GCN-NOT: buffer_store_dword
; GCN-NOT: buffer_load_dword
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
  ; %mai.1 feeds %mai.2 as accumulator, and both are consumed again in %st.
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %use, label %st

use:
  ; Five additional "a" operands are required here while both MFMA results
  ; are still live.
  call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5)
  store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, ptr addrspace(1) %out
  br label %st

st:
  ; %gep1 and %gep2 have no users in this function; they are kept as in the
  ; original test input.
  %gep1 = getelementptr <4 x float>, ptr addrspace(1) %out, i64 16
  %gep2 = getelementptr <4 x float>, ptr addrspace(1) %out, i64 32
  call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
  ret void
}

; Kernel built with attribute group #1 ("amdgpu-num-vgpr"="10"). Four inline
; asm statements define "=a" values (two <4 x i32>, one i32, one <2 x i32>)
; that are consumed by two later inline asm statements. As above, the checks
; expect an AGPR -> VGPR -> AGPR round trip and no scratch usage.
; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; GCN-NOT: buffer_store_dword
; GCN-NOT: buffer_load_dword
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
  %a1 = call <4 x i32> asm sideeffect "", "=a"()
  %a2 = call <4 x i32> asm sideeffect "", "=a"()
  %a3 = call i32 asm sideeffect "", "=a"()
  %a4 = call <2 x i32> asm sideeffect "", "=a"()
  ; %a1, %a2 and %a3 are consumed first; %a4 must survive until the second asm.
  call void asm sideeffect "", "a,a,a"(<4 x i32> %a1, <4 x i32> %a2, i32 %a3)
  call void asm sideeffect "", "a"(<2 x i32> %a4)
  ret void
}

; Kernel built with attribute group #3 ("amdgpu-num-vgpr"="32"). A single
; "=a" value %v is defined before a 32x32x1 MFMA producing <32 x float> and is
; used after it. The checks expect an accvgpr read before the MFMA, an accvgpr
; write after it, and no buffer stores/loads or scratch usage.
; GCN-LABEL: {{^}}max_32regs_mfma32:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-NOT: buffer_store_dword
; GCN: v_accvgpr_read_b32
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: buffer_load_dword
; GCN: v_accvgpr_write_b32
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
bb:
  %v = call i32 asm sideeffect "", "=a"()
  br label %use

use:
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 2.0>, i32 0, i32 0, i32 0)
  ; %v is live across the MFMA above.
  call void asm sideeffect "", "a"(i32 %v)
  %elt1 = extractelement <32 x float> %mai.1, i32 0
  store float %elt1, ptr addrspace(1) %arg
  ret void
}

; Should spill agprs to memory for both gfx908 and gfx90a.
; GCN-LABEL: {{^}}max_6regs_used_8a:
; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1

; GFX908-DAG: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v5, a1 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v5, a2 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v5, a3 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill

; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill

; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]

; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off

; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
; GFX90A: global_store_dwordx4 v[0:1], v[2:5], off

; GCN: ScratchSize: 20
; Kernel built with attribute group #4 ("amdgpu-num-vgpr"="6"). It defines one
; "=v" float and one "=a" <4 x float> through inline asm, runs a 4x4x1 MFMA on
; data loaded from %arg, and only afterwards stores the "=a" value and uses the
; "=v" value. The directives above expect the scratch resource descriptor to be
; set up and a0-a3 to be stored to scratch before the MFMA and reloaded after
; it: through v5 on the first RUN configuration, directly from the AGPRs on
; the second.
define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Both asm-defined values are live across the MFMA below.
  %v0 = call float asm sideeffect "; def $0", "=v"()
  %a4 = call <4 x float> asm sideeffect "; def $0", "=a"()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
  %mai.in = load <4 x float>, ptr addrspace(1) %gep
  %mai.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.in, i32 0, i32 0, i32 0)
  store <4 x float> %mai.out, ptr addrspace(1) %gep
  ; The volatile store is the only consumer of %a4; its address is undef.
  store volatile <4 x float> %a4, ptr addrspace(1) undef
  call void asm sideeffect "; use $0", "v"(float %v0)
  ret void
}

; Intrinsics referenced by the kernels in this file. The 16x16x1 MFMA
; declaration has no caller in the visible portion of the file.
declare i32 @llvm.amdgcn.workitem.id.x()
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)

; The attribute groups differ only in the "amdgpu-num-vgpr" value
; (10, 12, 32 and 6); the remaining "amdgpu-no-*" attributes are identical.
attributes #1 = { nounwind "amdgpu-num-vgpr"="10" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #2 = { nounwind "amdgpu-num-vgpr"="12" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #3 = { nounwind "amdgpu-num-vgpr"="32" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #4 = { nounwind "amdgpu-num-vgpr"="6" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }