; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s -check-prefixes=GCN,CI

; 128-dword array in addrspace(3), used only by @local_memory.
@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4

; Each work-item writes its x id into the addrspace(3) array at index id,
; executes a barrier, then reads the element at index id + 1 (index 0 is used
; instead when id + 1 equals 16) and stores that value to %out[id].
define amdgpu_kernel void @local_memory(ptr addrspace(1) %out) #0 {
; GCN-LABEL: local_memory:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 16, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_barrier
; GCN-NEXT: ds_read_b32 v0, v0
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  ; Publish this work-item's id at its own index.
  %slot = getelementptr inbounds [128 x i32], ptr addrspace(3) @local_memory.local_mem, i32 0, i32 %tid
  store i32 %tid, ptr addrspace(3) %slot, align 4
  ; Index of the next element, replaced by 0 when id + 1 == 16.
  %next = add nsw i32 %tid, 1
  %wraps = icmp eq i32 %next, 16
  %neighbor = select i1 %wraps, i32 0, i32 %next
  call void @llvm.amdgcn.s.barrier()
  ; Read the selected element after the barrier and write it to %out[id].
  %neighbor.slot = getelementptr inbounds [128 x i32], ptr addrspace(3) @local_memory.local_mem, i32 0, i32 %neighbor
  %val = load i32, ptr addrspace(3) %neighbor.slot, align 4
  %out.slot = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
  store i32 %val, ptr addrspace(1) %out.slot, align 4
  ret void
}

; Two 4-dword addrspace(3) arrays, used only by @local_memory_two_objects.
@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4

; Kernel that touches two separate addrspace(3) arrays. Each work-item stores
; id into local_mem0[id] and (id << 1) into local_mem1[id], executes a barrier,
; then copies local_mem0[3 - id] to %out[id] and local_mem1[3 - id] to
; %out[id + 4].
; NOTE(review): the earlier comment on this kernel said it checks that the LDS
; size is emitted correctly; the assertions below only match the instruction
; sequence, so no size check is visible in this file.
define amdgpu_kernel void @local_memory_two_objects(ptr addrspace(1) %out) #0 {
; SI-LABEL: local_memory_two_objects:
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4
; SI-NEXT: v_sub_i32_e32 v0, vcc, 12, v1
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_barrier
; SI-NEXT: v_sub_i32_e32 v2, vcc, 28, v1
; SI-NEXT: ds_read_b32 v0, v0
; SI-NEXT: ds_read_b32 v3, v2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_waitcnt lgkmcnt(1)
; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:16
; SI-NEXT: s_endpgm
;
; CI-LABEL: local_memory_two_objects:
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; CI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v1
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_barrier
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:3 offset1:7
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dword v4, v[1:2], s[0:3], 0 addr64 offset:16
; CI-NEXT: s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; First array receives the raw id.
  %slot0 = getelementptr inbounds [4 x i32], ptr addrspace(3) @local_memory_two_objects.local_mem0, i32 0, i32 %tid
  store i32 %tid, ptr addrspace(3) %slot0, align 4
  ; Second array receives the id shifted left by one.
  %twice = shl nsw i32 %tid, 1
  %slot1 = getelementptr inbounds [4 x i32], ptr addrspace(3) @local_memory_two_objects.local_mem1, i32 0, i32 %tid
  store i32 %twice, ptr addrspace(3) %slot1, align 4
  ; Both reads after the barrier use the mirrored index 3 - id.
  %mirror = sub nsw i32 3, %tid
  call void @llvm.amdgcn.s.barrier()
  %mirror0 = getelementptr inbounds [4 x i32], ptr addrspace(3) @local_memory_two_objects.local_mem0, i32 0, i32 %mirror
  %val0 = load i32, ptr addrspace(3) %mirror0, align 4
  %out.lo = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
  store i32 %val0, ptr addrspace(1) %out.lo, align 4
  %mirror1 = getelementptr inbounds [4 x i32], ptr addrspace(3) @local_memory_two_objects.local_mem1, i32 0, i32 %mirror
  %val1 = load i32, ptr addrspace(3) %mirror1, align 4
  ; The second result is stored four elements further into %out.
  %hi.idx = add nsw i32 %tid, 4
  %out.hi = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %hi.idx
  store i32 %val1, ptr addrspace(1) %out.hi, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare void @llvm.amdgcn.s.barrier() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }