xref: /llvm-project/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll (revision cc3aab580b680e8566e9f7a1ff9feff895ecfc49)
1*cc3aab58SAcim Maravic; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2*cc3aab58SAcim Maravic; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
3*cc3aab58SAcim Maravic; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s
4*cc3aab58SAcim Maravic
5*cc3aab58SAcim Maravic
; Baseline case: an i32 is loaded through the buffer fat pointer %in with
; !amdgpu.last.use metadata attached and then stored through %out.
; The autogenerated gfx1200 checks below expect the buffer_load_b32 to carry
; th:TH_LOAD_LU, while the buffer_store_b32 carries no th modifier.
6*cc3aab58SAcim Maravicdefine amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addrspace(7) %out) {
7*cc3aab58SAcim Maravic; GFX12-LABEL: buffer_last_use_load_0:
8*cc3aab58SAcim Maravic; GFX12:       ; %bb.0: ; %entry
9*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x2
10*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
11*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x20
12*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x10
13*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_kmcnt 0x0
14*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
15*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
16*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
17*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
18*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
19*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x1
20*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b64 v[5:6], off, off offset:40
21*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b32 v4, off, off offset:36
22*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b32 s1, s[4:5], 0x30
23*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_store_b128 off, v[7:10], off
24*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x1
25*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b64 v[1:2], off, off offset:8
26*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4
27*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v7, s6
28*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v9, s0
29*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_kmcnt 0x0
30*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v3, s1
31*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 s1, exec_lo
32*cc3aab58SAcim Maravic; GFX12-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
33*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x2
34*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s4, v4
35*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s5, v5
36*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s6, v6
37*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s7, v7
38*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
39*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
40*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
41*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
42*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
43*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
44*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
45*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
46*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x0
47*cc3aab58SAcim Maravic; GFX12-NEXT:    buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU
48*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
49*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr9
50*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
51*cc3aab58SAcim Maravic; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
52*cc3aab58SAcim Maravic; GFX12-NEXT:    s_cbranch_execnz .LBB0_1
53*cc3aab58SAcim Maravic; GFX12-NEXT:  ; %bb.2:
54*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 exec_lo, s1
55*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v4, s8
56*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 s0, exec_lo
57*cc3aab58SAcim Maravic; GFX12-NEXT:  .LBB0_3: ; =>This Inner Loop Header: Depth=1
58*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x1
59*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
60*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
61*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
62*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
63*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
64*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
65*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
66*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
67*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
68*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
69*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
70*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
71*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x0
72*cc3aab58SAcim Maravic; GFX12-NEXT:    buffer_store_b32 v8, v4, s[4:7], null offen
73*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
74*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr8
75*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr4
76*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
77*cc3aab58SAcim Maravic; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
78*cc3aab58SAcim Maravic; GFX12-NEXT:    s_cbranch_execnz .LBB0_3
79*cc3aab58SAcim Maravic; GFX12-NEXT:  ; %bb.4:
80*cc3aab58SAcim Maravic; GFX12-NEXT:    s_endpgm
81*cc3aab58SAcim Maravicentry:
82*cc3aab58SAcim Maravic  %val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{}
83*cc3aab58SAcim Maravic  store i32 %val, ptr addrspace(7) %out
84*cc3aab58SAcim Maravic  ret void
85*cc3aab58SAcim Maravic}
86*cc3aab58SAcim Maravic
; Indexed variant: the load address is a getelementptr into %in using the
; result of llvm.amdgcn.workitem.id.x as an i32 element index, and the load
; carries !amdgpu.last.use metadata. The loaded value is stored through %out.
; The autogenerated gfx1200 checks below expect the buffer_load_b32 to carry
; th:TH_LOAD_LU, as in buffer_last_use_load_0.
87*cc3aab58SAcim Maravicdefine amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addrspace(7) %out) {
88*cc3aab58SAcim Maravic; GFX12-LABEL: buffer_last_use_load_1:
89*cc3aab58SAcim Maravic; GFX12:       ; %bb.0: ; %entry
90*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x2
91*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
92*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x20
93*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x10
94*cc3aab58SAcim Maravic; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
95*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_kmcnt 0x0
96*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
97*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
98*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
99*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
100*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_store_b128 off, v[1:4], off offset:32
101*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x1
102*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b64 v[6:7], off, off offset:40
103*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b32 v5, off, off offset:36
104*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b32 s1, s[4:5], 0x30
105*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_store_b128 off, v[8:11], off
106*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x1
107*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b64 v[2:3], off, off offset:8
108*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b32 v1, off, off offset:4
109*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v8, s6
110*cc3aab58SAcim Maravic; GFX12-NEXT:    v_lshl_add_u32 v9, v0, 2, s0
111*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_kmcnt 0x0
112*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v4, s1
113*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 s1, exec_lo
114*cc3aab58SAcim Maravic; GFX12-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
115*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x2
116*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s4, v5
117*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s5, v6
118*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
119*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
120*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
121*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6]
122*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
123*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
124*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
125*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
126*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
127*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
128*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x0
129*cc3aab58SAcim Maravic; GFX12-NEXT:    buffer_load_b32 v0, v9, s[4:7], null offen th:TH_LOAD_LU
130*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8
131*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr9
132*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
133*cc3aab58SAcim Maravic; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
134*cc3aab58SAcim Maravic; GFX12-NEXT:    s_cbranch_execnz .LBB1_1
135*cc3aab58SAcim Maravic; GFX12-NEXT:  ; %bb.2:
136*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 exec_lo, s1
137*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v5, s8
138*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 s0, exec_lo
139*cc3aab58SAcim Maravic; GFX12-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
140*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x1
141*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s4, v1
142*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s5, v2
143*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s6, v3
144*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s7, v4
145*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
146*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
147*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
148*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
149*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
150*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
151*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
152*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
153*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x0
154*cc3aab58SAcim Maravic; GFX12-NEXT:    buffer_store_b32 v0, v5, s[4:7], null offen
155*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
156*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr0
157*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr5
158*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
159*cc3aab58SAcim Maravic; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
160*cc3aab58SAcim Maravic; GFX12-NEXT:    s_cbranch_execnz .LBB1_3
161*cc3aab58SAcim Maravic; GFX12-NEXT:  ; %bb.4:
162*cc3aab58SAcim Maravic; GFX12-NEXT:    s_endpgm
163*cc3aab58SAcim Maravicentry:
164*cc3aab58SAcim Maravic  %tid = call i32 @llvm.amdgcn.workitem.id.x()
165*cc3aab58SAcim Maravic  %val.gep = getelementptr inbounds i32, ptr addrspace(7) %in, i32 %tid
166*cc3aab58SAcim Maravic  %val = load i32, ptr addrspace(7) %val.gep, align 4, !amdgpu.last.use !{}
167*cc3aab58SAcim Maravic  store i32 %val, ptr addrspace(7) %out
168*cc3aab58SAcim Maravic  ret void
169*cc3aab58SAcim Maravic}
170*cc3aab58SAcim Maravic
; Volatile plus last-use: the load from %in is marked volatile and also
; carries !amdgpu.last.use metadata. The loaded value is stored through %out.
; The autogenerated gfx1200 checks below expect the buffer_load_b32 to carry
; th:TH_LOAD_BYPASS scope:SCOPE_SYS rather than th:TH_LOAD_LU, i.e. the
; last-use hint does not appear on the volatile access.
171*cc3aab58SAcim Maravicdefine amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %in, ptr addrspace(7) %out) {
172*cc3aab58SAcim Maravic; GFX12-LABEL: buffer_last_use_and_volatile_load:
173*cc3aab58SAcim Maravic; GFX12:       ; %bb.0: ; %entry
174*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x2
175*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
176*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x20
177*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x10
178*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_kmcnt 0x0
179*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
180*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
181*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
182*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
183*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
184*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x1
185*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b64 v[5:6], off, off offset:40
186*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b32 v4, off, off offset:36
187*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b32 s1, s[4:5], 0x30
188*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_store_b128 off, v[7:10], off
189*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x1
190*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b64 v[1:2], off, off offset:8
191*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4
192*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v7, s6
193*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v9, s0
194*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_kmcnt 0x0
195*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v3, s1
196*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 s1, exec_lo
197*cc3aab58SAcim Maravic; GFX12-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
198*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x2
199*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s4, v4
200*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s5, v5
201*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s6, v6
202*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s7, v7
203*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
204*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
205*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
206*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
207*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
208*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
209*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
210*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
211*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x0
212*cc3aab58SAcim Maravic; GFX12-NEXT:    buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
213*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
214*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr9
215*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
216*cc3aab58SAcim Maravic; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
217*cc3aab58SAcim Maravic; GFX12-NEXT:    s_cbranch_execnz .LBB2_1
218*cc3aab58SAcim Maravic; GFX12-NEXT:  ; %bb.2:
219*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 exec_lo, s1
220*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v4, s8
221*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 s0, exec_lo
222*cc3aab58SAcim Maravic; GFX12-NEXT:  .LBB2_3: ; =>This Inner Loop Header: Depth=1
223*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x1
224*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
225*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
226*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
227*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
228*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
229*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
230*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
231*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
232*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
233*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
234*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
235*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
236*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x0
237*cc3aab58SAcim Maravic; GFX12-NEXT:    buffer_store_b32 v8, v4, s[4:7], null offen
238*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
239*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr8
240*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr4
241*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
242*cc3aab58SAcim Maravic; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
243*cc3aab58SAcim Maravic; GFX12-NEXT:    s_cbranch_execnz .LBB2_3
244*cc3aab58SAcim Maravic; GFX12-NEXT:  ; %bb.4:
245*cc3aab58SAcim Maravic; GFX12-NEXT:    s_endpgm
246*cc3aab58SAcim Maravicentry:
247*cc3aab58SAcim Maravic  %val = load volatile i32, ptr addrspace(7) %in, !amdgpu.last.use !{}
248*cc3aab58SAcim Maravic  store i32 %val, ptr addrspace(7) %out
249*cc3aab58SAcim Maravic  ret void
250*cc3aab58SAcim Maravic}
251*cc3aab58SAcim Maravic
; Nontemporal plus last-use: the load from %in carries both !amdgpu.last.use
; and !nontemporal !0 metadata. The loaded value is stored through %out.
; The autogenerated gfx1200 checks below expect the buffer_load_b32 to carry
; th:TH_LOAD_LU, the same modifier as the plain last-use case in
; buffer_last_use_load_0.
252*cc3aab58SAcim Maravicdefine amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) %in, ptr addrspace(7) %out) {
253*cc3aab58SAcim Maravic; GFX12-LABEL: buffer_last_use_and_nontemporal_load:
254*cc3aab58SAcim Maravic; GFX12:       ; %bb.0: ; %entry
255*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x2
256*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
257*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x20
258*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x10
259*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_kmcnt 0x0
260*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
261*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
262*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
263*cc3aab58SAcim Maravic; GFX12-NEXT:    v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
264*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
265*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x1
266*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b64 v[5:6], off, off offset:40
267*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b32 v4, off, off offset:36
268*cc3aab58SAcim Maravic; GFX12-NEXT:    s_load_b32 s1, s[4:5], 0x30
269*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_store_b128 off, v[7:10], off
270*cc3aab58SAcim Maravic; GFX12-NEXT:    s_clause 0x1
271*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b64 v[1:2], off, off offset:8
272*cc3aab58SAcim Maravic; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4
273*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v7, s6
274*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v9, s0
275*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_kmcnt 0x0
276*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v3, s1
277*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 s1, exec_lo
278*cc3aab58SAcim Maravic; GFX12-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
279*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x2
280*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s4, v4
281*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s5, v5
282*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s6, v6
283*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s7, v7
284*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
285*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
286*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
287*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
288*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
289*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
290*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
291*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
292*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x0
293*cc3aab58SAcim Maravic; GFX12-NEXT:    buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU
294*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
295*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr9
296*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
297*cc3aab58SAcim Maravic; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
298*cc3aab58SAcim Maravic; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
299*cc3aab58SAcim Maravic; GFX12-NEXT:  ; %bb.2:
300*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 exec_lo, s1
301*cc3aab58SAcim Maravic; GFX12-NEXT:    v_mov_b32_e32 v4, s8
302*cc3aab58SAcim Maravic; GFX12-NEXT:    s_mov_b32 s0, exec_lo
303*cc3aab58SAcim Maravic; GFX12-NEXT:  .LBB3_3: ; =>This Inner Loop Header: Depth=1
304*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x1
305*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
306*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
307*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
308*cc3aab58SAcim Maravic; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
309*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
310*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
311*cc3aab58SAcim Maravic; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
312*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
313*cc3aab58SAcim Maravic; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
314*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
315*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
316*cc3aab58SAcim Maravic; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
317*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_loadcnt 0x0
318*cc3aab58SAcim Maravic; GFX12-NEXT:    buffer_store_b32 v8, v4, s[4:7], null offen
319*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
320*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr8
321*cc3aab58SAcim Maravic; GFX12-NEXT:    ; implicit-def: $vgpr4
322*cc3aab58SAcim Maravic; GFX12-NEXT:    s_wait_alu 0xfffe
323*cc3aab58SAcim Maravic; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
324*cc3aab58SAcim Maravic; GFX12-NEXT:    s_cbranch_execnz .LBB3_3
325*cc3aab58SAcim Maravic; GFX12-NEXT:  ; %bb.4:
326*cc3aab58SAcim Maravic; GFX12-NEXT:    s_endpgm
327*cc3aab58SAcim Maravicentry:
328*cc3aab58SAcim Maravic  %val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{}, !nontemporal !0
329*cc3aab58SAcim Maravic  store i32 %val, ptr addrspace(7) %out
330*cc3aab58SAcim Maravic  ret void
331*cc3aab58SAcim Maravic}
332*cc3aab58SAcim Maravic
; Metadata node referenced by the !nontemporal attachment on the load in
; buffer_last_use_and_nontemporal_load.
333*cc3aab58SAcim Maravic!0 = !{i32 1}
; Intrinsic called in buffer_last_use_load_1 to obtain the i32 index used
; by its getelementptr.
334*cc3aab58SAcim Maravicdeclare i32 @llvm.amdgcn.workitem.id.x()
335