; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=ALL %s

; FIXME: Vectorization can increase the required SGPR count beyond the limit.

; ALL-LABEL: {{^}}max_10_sgprs:

; ALL: SGPRBlocks: 1
; ALL: NumSGPRsForWavesPerEU: 10
define amdgpu_kernel void @max_10_sgprs() #0 {
  %one = load volatile i32, ptr addrspace(4) undef
  %two = load volatile i32, ptr addrspace(4) undef
  %three = load volatile i32, ptr addrspace(4) undef
  %four = load volatile i32, ptr addrspace(4) undef
  %five = load volatile i32, ptr addrspace(4) undef
  %six = load volatile i32, ptr addrspace(4) undef
  %seven = load volatile i32, ptr addrspace(4) undef
  %eight = load volatile i32, ptr addrspace(4) undef
  %nine = load volatile i32, ptr addrspace(4) undef
  %ten = load volatile i32, ptr addrspace(4) undef
  %eleven = load volatile i32, ptr addrspace(4) undef
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine, i32 %ten)
  store volatile i32 %one, ptr addrspace(1) undef
  store volatile i32 %two, ptr addrspace(1) undef
  store volatile i32 %three, ptr addrspace(1) undef
  store volatile i32 %four, ptr addrspace(1) undef
  store volatile i32 %five, ptr addrspace(1) undef
  store volatile i32 %six, ptr addrspace(1) undef
  store volatile i32 %seven, ptr addrspace(1) undef
  store volatile i32 %eight, ptr addrspace(1) undef
  store volatile i32 %nine, ptr addrspace(1) undef
  store volatile i32 %ten, ptr addrspace(1) undef
  store volatile i32 %eleven, ptr addrspace(1) undef
  ret void
}

; private resource:    4
; scratch wave offset: 1
; workgroup ids:       3
; dispatch id:         2
; queue ptr:           2
; flat scratch init:   2
; ---------------------
; total:              14

; + reserved vcc = 16

; Because we can't handle re-using the last few input registers as the
; special vcc etc. registers (as well as decide not to use the unused
; features when the number of registers is frozen), this ends up using
; more than expected.

; XALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
; XTOSGPR: SGPRBlocks: 1
; XTOSGPR: NumSGPRsForWavesPerEU: 16

; This test case is disabled: when calculating the spill slot addresses, AMDGPU
; creates an extra vreg to save/restore m0, which at a point of maximum register
; pressure would trigger an endless loop; in practice the compiler aborts
; earlier with "Incomplete scavenging after 2nd pass".
;define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(ptr addrspace(1) %out1,
;                                                       ptr addrspace(1) %out2,
;                                                       ptr addrspace(1) %out3,
;                                                       ptr addrspace(1) %out4,
;                                                       i32 %one, i32 %two, i32 %three, i32 %four) #2 {
;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
;  %x.4 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
;  %x.5 = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
;  store volatile i32 0, ptr undef
;  br label %stores
;
;stores:
;  store volatile i32 %x.0, ptr addrspace(1) undef
;  store volatile i32 %x.0, ptr addrspace(1) undef
;  store volatile i32 %x.0, ptr addrspace(1) undef
;  store volatile i64 %x.3, ptr addrspace(1) undef
;  store volatile ptr addrspace(4) %x.4, ptr addrspace(1) undef
;  store volatile ptr addrspace(4) %x.5, ptr addrspace(1) undef
;
;  store i32 %one, ptr addrspace(1) %out1
;  store i32 %two, ptr addrspace(1) %out2
;  store i32 %three, ptr addrspace(1) %out3
;  store i32 %four, ptr addrspace(1) %out4
;  ret void
;}

; The following test is commented out for now; see http://llvm.org/PR31230.
; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
; ; Make sure copies for the input buffer are not clobbered. This requires
; ; swapping the order in which the registers are copied, relative to what
; ; normally happens.

; XALL: SGPRBlocks: 2
; XALL: NumSGPRsForWavesPerEU: 18
;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(ptr addrspace(1) %out1,
;                                                       ptr addrspace(1) %out2,
;                                                       ptr addrspace(1) %out3,
;                                                       ptr addrspace(1) %out4,
;                                                       i32 %one, i32 %two, i32 %three, i32 %four) #2 {
;  store volatile i32 0, ptr undef
;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
;  store volatile i32 %x.0, ptr addrspace(1) undef
;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
;  store volatile i32 %x.0, ptr addrspace(1) undef
;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
;  store volatile i32 %x.0, ptr addrspace(1) undef
;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
;  store volatile i64 %x.3, ptr addrspace(1) undef
;  %x.4 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
;  store volatile ptr addrspace(4) %x.4, ptr addrspace(1) undef
;
;  store i32 %one, ptr addrspace(1) %out1
;  store i32 %two, ptr addrspace(1) %out2
;  store i32 %three, ptr addrspace(1) %out3
;  store i32 %four, ptr addrspace(1) %out4
;  ret void
;}

declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workgroup.id.z() #1
declare i64 @llvm.amdgcn.dispatch.id() #1
declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1
declare ptr addrspace(4) @llvm.amdgcn.queue.ptr() #1

attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }