; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN,GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10

; Ten distinct LDS arrays used as destinations of the LDS DMA loads below.
@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
@lds.1 = internal addrspace(3) global [64 x float] poison, align 16
@lds.2 = internal addrspace(3) global [64 x float] poison, align 16
@lds.3 = internal addrspace(3) global [64 x float] poison, align 16
@lds.4 = internal addrspace(3) global [64 x float] poison, align 16
@lds.5 = internal addrspace(3) global [64 x float] poison, align 16
@lds.6 = internal addrspace(3) global [64 x float] poison, align 16
@lds.7 = internal addrspace(3) global [64 x float] poison, align 16
@lds.8 = internal addrspace(3) global [64 x float] poison, align 16
@lds.9 = internal addrspace(3) global [64 x float] poison, align 16

declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)

; Four buffer loads to LDS, two into @lds.0 followed by two into @lds.1, then
; one read from each array. The expected output waits with vmcnt(2) ahead of
; the first LDS read and with vmcnt(0) ahead of the second one.

; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
; GCN-COUNT-4: buffer_load_dword
; GCN: s_waitcnt vmcnt(2)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(0)
; GCN: ds_read_b32
define amdgpu_kernel void @buffer_load_lds_dword_2_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0, i32 0, i32 0)
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  ; The wave barrier sits between the two LDS reads.
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  ; Both lanes are overwritten, so the poison base never reaches the store.
  %tmp.0 = insertelement <2 x float> poison, float %val.0, i32 0
  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
  store <2 x float> %res, ptr addrspace(1) %out
  ret void
}

; On gfx9 if there is a pending FLAT operation, and this is a VMem or LGKM
; waitcnt and the target can report early completion, then we need to force a waitcnt 0.
; Four global loads to LDS, two into @lds.0 followed by two into @lds.1, then
; one read from each array. The gfx900 run expects a single vmcnt(0) ahead of
; both LDS reads; the gfx1030 run expects vmcnt(2) before the first read and
; vmcnt(0) before the second.

; GCN-LABEL: {{^}}global_load_lds_dword_2_arrays:
; GCN-COUNT-4: global_load_dword
; GFX9: s_waitcnt vmcnt(0)
; GFX9-COUNT-2: ds_read_b32
; GFX10: s_waitcnt vmcnt(2)
; GFX10: ds_read_b32
; GFX10: s_waitcnt vmcnt(0)
; GFX10: ds_read_b32
define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
main_body:
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0)
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  ; The wave barrier sits between the two LDS reads.
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  ; Both lanes are overwritten, so the poison base never reaches the store.
  %tmp.0 = insertelement <2 x float> poison, float %val.0, i32 0
  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
  store <2 x float> %res, ptr addrspace(1) %out
  ret void
}

; There are 8 pseudo registers defined to track LDS DMA dependencies.
; When exhausted we default to vmcnt(0).

; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
; GCN-COUNT-10: buffer_load_dword
; GCN: s_waitcnt vmcnt(8)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(7)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(6)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(5)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(4)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(3)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(2)
; GCN-NOT: s_waitcnt vmcnt
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(0)
; GCN: ds_read_b32
define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
main_body:
  ; One buffer load to LDS per array, @lds.0 through @lds.9.
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.2, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.3, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.4, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.5, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.6, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.7, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.8, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.9, i32 4, i32 0, i32 0, i32 0, i32 0)
  ; NOTE(review) - %gep.2 through %gep.9 all index with %i2, so the kernel
  ; arguments %i3 through %i9 are unused. Confirm whether that is intentional
  ; before changing it, since the expected output above was generated from
  ; this exact IR.
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %gep.2 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i2
  %gep.3 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
  %gep.4 = getelementptr float, ptr addrspace(3) @lds.4, i32 %i2
  %gep.5 = getelementptr float, ptr addrspace(3) @lds.5, i32 %i2
  %gep.6 = getelementptr float, ptr addrspace(3) @lds.6, i32 %i2
  %gep.7 = getelementptr float, ptr addrspace(3) @lds.7, i32 %i2
  %gep.8 = getelementptr float, ptr addrspace(3) @lds.8, i32 %i2
  %gep.9 = getelementptr float, ptr addrspace(3) @lds.9, i32 %i2
  ; One read per array, with a wave barrier between consecutive reads.
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.2 = load float, ptr addrspace(3) %gep.2, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.3 = load float, ptr addrspace(3) %gep.3, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.4 = load float, ptr addrspace(3) %gep.4, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.5 = load float, ptr addrspace(3) %gep.5, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.6 = load float, ptr addrspace(3) %gep.6, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.7 = load float, ptr addrspace(3) %gep.7, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.8 = load float, ptr addrspace(3) %gep.8, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.9 = load float, ptr addrspace(3) %gep.9, align 4
  ; Store the ten values to consecutive floats starting at %out.
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
  %out.gep.4 = getelementptr float, ptr addrspace(1) %out, i32 4
  %out.gep.5 = getelementptr float, ptr addrspace(1) %out, i32 5
  %out.gep.6 = getelementptr float, ptr addrspace(1) %out, i32 6
  %out.gep.7 = getelementptr float, ptr addrspace(1) %out, i32 7
  %out.gep.8 = getelementptr float, ptr addrspace(1) %out, i32 8
  %out.gep.9 = getelementptr float, ptr addrspace(1) %out, i32 9
  store float %val.0, ptr addrspace(1) %out
  store float %val.1, ptr addrspace(1) %out.gep.1
  store float %val.2, ptr addrspace(1) %out.gep.2
  store float %val.3, ptr addrspace(1) %out.gep.3
  store float %val.4, ptr addrspace(1) %out.gep.4
  store float %val.5, ptr addrspace(1) %out.gep.5
  store float %val.6, ptr addrspace(1) %out.gep.6
  store float %val.7, ptr addrspace(1) %out.gep.7
  store float %val.8, ptr addrspace(1) %out.gep.8
  store float %val.9, ptr addrspace(1) %out.gep.9
  ret void
}

declare void @llvm.amdgcn.wave.barrier()