; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll (revision 021def6c2278fd932d18b4d891c2e75c1d8e6f1d)
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN,GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10

; Checks the s_waitcnt vmcnt values placed between load-to-LDS intrinsic calls
; (LDS DMA) and the LDS reads that follow them, for gfx900 and gfx1030.

; LDS arrays (addrspace(3)) that the load-to-LDS intrinsic calls in this file
; write to and that the tests read back from. Each is 64 floats, 16-byte aligned.
@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
@lds.1 = internal addrspace(3) global [64 x float] poison, align 16
@lds.2 = internal addrspace(3) global [64 x float] poison, align 16
@lds.3 = internal addrspace(3) global [64 x float] poison, align 16
@lds.4 = internal addrspace(3) global [64 x float] poison, align 16
@lds.5 = internal addrspace(3) global [64 x float] poison, align 16
@lds.6 = internal addrspace(3) global [64 x float] poison, align 16
@lds.7 = internal addrspace(3) global [64 x float] poison, align 16
@lds.8 = internal addrspace(3) global [64 x float] poison, align 16
@lds.9 = internal addrspace(3) global [64 x float] poison, align 16

; Load-to-LDS intrinsics exercised by the tests: a raw-buffer variant and a
; global-memory variant. Both take the LDS destination pointer and a size.
declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)

; Four buffer loads to LDS, two targeting @lds.0 followed by two targeting
; @lds.1, then one read from @lds.0 and one read from @lds.1. The checks expect
; vmcnt(2) ahead of the first LDS read and vmcnt(0) ahead of the second.
; The wave barrier between the two reads presumably keeps them as separate
; ds_read_b32 instructions in program order -- TODO confirm.

; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
; GCN-COUNT-4: buffer_load_dword
; GCN: s_waitcnt vmcnt(2)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(0)
; GCN: ds_read_b32
define amdgpu_kernel void @buffer_load_lds_dword_2_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0, i32 0, i32 0)
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  ; Both lanes are overwritten below, so the initial vector value is irrelevant;
  ; poison is used instead of the deprecated undef.
  %tmp.0 = insertelement <2 x float> poison, float %val.0, i32 0
  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
  store <2 x float> %res, ptr addrspace(1) %out
  ret void
}

; Same shape as the buffer test, but using the global load-to-LDS intrinsic.
;
; On gfx9, if there is a pending FLAT operation, the wait being inserted is a
; VMEM or LGKM wait, and the target can report early completion, then the wait
; has to be forced to 0. The gfx9 checks therefore expect a single vmcnt(0)
; before both LDS reads, while the gfx10 checks expect vmcnt(2) before the
; first read and vmcnt(0) before the second.

; GCN-LABEL: {{^}}global_load_lds_dword_2_arrays:
; GCN-COUNT-4: global_load_dword
; GFX9: s_waitcnt vmcnt(0)
; GFX9-COUNT-2: ds_read_b32
; GFX10: s_waitcnt vmcnt(2)
; GFX10: ds_read_b32
; GFX10: s_waitcnt vmcnt(0)
; GFX10: ds_read_b32
define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
main_body:
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0)
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  ; Both lanes are overwritten below, so the initial vector value is irrelevant;
  ; poison is used instead of the deprecated undef.
  %tmp.0 = insertelement <2 x float> poison, float %val.0, i32 0
  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
  store <2 x float> %res, ptr addrspace(1) %out
  ret void
}

; There are 8 pseudo registers defined to track LDS DMA dependencies.
; When exhausted we default to vmcnt(0).
;
; Ten buffer loads to LDS, one per array @lds.0 .. @lds.9, followed by one read
; from each array with a wave barrier between consecutive reads. The checks
; expect the vmcnt value to step down from 8 to 2, no further vmcnt wait before
; the LDS read that follows vmcnt(2), and then a final vmcnt(0).
;
; NOTE(review): %gep.2 through %gep.9 all index with %i2, so kernel arguments
; %i3 .. %i9 are unused. Left unchanged here because switching the indices
; would change the generated code -- confirm whether this is intentional.

; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
; GCN-COUNT-10: buffer_load_dword
; GCN: s_waitcnt vmcnt(8)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(7)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(6)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(5)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(4)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(3)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(2)
; GCN-NOT: s_waitcnt vmcnt
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(0)
; GCN: ds_read_b32
define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
main_body:
  ; One LDS DMA load per array.
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.2, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.3, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.4, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.5, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.6, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.7, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.8, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.9, i32 4, i32 0, i32 0, i32 0, i32 0)
  ; Addresses of the element read back from each array.
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %gep.2 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i2
  %gep.3 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
  %gep.4 = getelementptr float, ptr addrspace(3) @lds.4, i32 %i2
  %gep.5 = getelementptr float, ptr addrspace(3) @lds.5, i32 %i2
  %gep.6 = getelementptr float, ptr addrspace(3) @lds.6, i32 %i2
  %gep.7 = getelementptr float, ptr addrspace(3) @lds.7, i32 %i2
  %gep.8 = getelementptr float, ptr addrspace(3) @lds.8, i32 %i2
  %gep.9 = getelementptr float, ptr addrspace(3) @lds.9, i32 %i2
  ; One LDS read per array, in array order, separated by wave barriers.
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.2 = load float, ptr addrspace(3) %gep.2, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.3 = load float, ptr addrspace(3) %gep.3, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.4 = load float, ptr addrspace(3) %gep.4, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.5 = load float, ptr addrspace(3) %gep.5, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.6 = load float, ptr addrspace(3) %gep.6, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.7 = load float, ptr addrspace(3) %gep.7, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.8 = load float, ptr addrspace(3) %gep.8, align 4
  call void @llvm.amdgcn.wave.barrier()
  %val.9 = load float, ptr addrspace(3) %gep.9, align 4
  ; Store every value read to consecutive floats of %out.
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
  %out.gep.4 = getelementptr float, ptr addrspace(1) %out, i32 4
  %out.gep.5 = getelementptr float, ptr addrspace(1) %out, i32 5
  %out.gep.6 = getelementptr float, ptr addrspace(1) %out, i32 6
  %out.gep.7 = getelementptr float, ptr addrspace(1) %out, i32 7
  %out.gep.8 = getelementptr float, ptr addrspace(1) %out, i32 8
  %out.gep.9 = getelementptr float, ptr addrspace(1) %out, i32 9
  store float %val.0, ptr addrspace(1) %out
  store float %val.1, ptr addrspace(1) %out.gep.1
  store float %val.2, ptr addrspace(1) %out.gep.2
  store float %val.3, ptr addrspace(1) %out.gep.3
  store float %val.4, ptr addrspace(1) %out.gep.4
  store float %val.5, ptr addrspace(1) %out.gep.5
  store float %val.6, ptr addrspace(1) %out.gep.6
  store float %val.7, ptr addrspace(1) %out.gep.7
  store float %val.8, ptr addrspace(1) %out.gep.8
  store float %val.9, ptr addrspace(1) %out.gep.9
  ret void
}

; Barrier intrinsic called between consecutive LDS reads in the tests.
declare void @llvm.amdgcn.wave.barrier()