xref: /llvm-project/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd- -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
9; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
10; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
11
; private_volatile_load_0: kernel that performs a volatile i32 load from the
; addrspace(5) pointer argument %in (private memory, per the test name) and
; writes the loaded value with an ordinary store to the addrspace(1) pointer
; argument %out.
; The check lines embedded in the definition are autogenerated (see the NOTE at
; the top of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
12define amdgpu_kernel void @private_volatile_load_0(
13; GFX6-LABEL: private_volatile_load_0:
14; GFX6:       ; %bb.0: ; %entry
15; GFX6-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
16; GFX6-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
17; GFX6-NEXT:    s_mov_b32 s14, -1
18; GFX6-NEXT:    s_mov_b32 s15, 0xe8f000
19; GFX6-NEXT:    s_add_u32 s12, s12, s11
20; GFX6-NEXT:    s_addc_u32 s13, s13, 0
21; GFX6-NEXT:    s_mov_b64 s[0:1], s[4:5]
22; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
23; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
24; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
25; GFX6-NEXT:    s_mov_b32 s7, s1
26; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
27; GFX6-NEXT:    s_mov_b32 s5, 0xf000
28; GFX6-NEXT:    s_mov_b32 s6, -1
29; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
30; GFX6-NEXT:    s_mov_b32 s1, s7
31; GFX6-NEXT:    s_mov_b32 s2, s6
32; GFX6-NEXT:    s_mov_b32 s3, s5
33; GFX6-NEXT:    v_mov_b32_e32 v0, s4
34; GFX6-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
35; GFX6-NEXT:    s_waitcnt vmcnt(0)
36; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
37; GFX6-NEXT:    s_endpgm
38;
39; GFX7-LABEL: private_volatile_load_0:
40; GFX7:       ; %bb.0: ; %entry
41; GFX7-NEXT:    s_add_u32 s0, s0, s15
42; GFX7-NEXT:    s_addc_u32 s1, s1, 0
43; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
44; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
45; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX7-NEXT:    v_mov_b32_e32 v0, s6
47; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc
48; GFX7-NEXT:    s_waitcnt vmcnt(0)
49; GFX7-NEXT:    v_mov_b32_e32 v0, s4
50; GFX7-NEXT:    v_mov_b32_e32 v1, s5
51; GFX7-NEXT:    flat_store_dword v[0:1], v2
52; GFX7-NEXT:    s_endpgm
53;
54; GFX10-WGP-LABEL: private_volatile_load_0:
55; GFX10-WGP:       ; %bb.0: ; %entry
56; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
57; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
58; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
59; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
60; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
61; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
63; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
64; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
65; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
66; GFX10-WGP-NEXT:    s_endpgm
67;
68; GFX10-CU-LABEL: private_volatile_load_0:
69; GFX10-CU:       ; %bb.0: ; %entry
70; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
71; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
72; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
73; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
74; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
75; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
77; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
78; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
79; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
80; GFX10-CU-NEXT:    s_endpgm
81;
82; SKIP-CACHE-INV-LABEL: private_volatile_load_0:
83; SKIP-CACHE-INV:       ; %bb.0: ; %entry
84; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
85; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
86; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
87; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
88; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
89; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
90; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
91; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
92; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
93; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
94; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
95; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
96; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
97; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
98; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
99; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
100; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
101; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
102; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
103; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
104; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
105; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
106; SKIP-CACHE-INV-NEXT:    s_endpgm
107;
108; GFX11-WGP-LABEL: private_volatile_load_0:
109; GFX11-WGP:       ; %bb.0: ; %entry
110; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
111; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
112; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
113; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
114; GFX11-WGP-NEXT:    scratch_load_b32 v1, off, s2 glc dlc
115; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
116; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
117; GFX11-WGP-NEXT:    s_endpgm
118;
119; GFX11-CU-LABEL: private_volatile_load_0:
120; GFX11-CU:       ; %bb.0: ; %entry
121; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
122; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
123; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
124; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX11-CU-NEXT:    scratch_load_b32 v1, off, s2 glc dlc
126; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
127; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
128; GFX11-CU-NEXT:    s_endpgm
129;
130; GFX12-WGP-LABEL: private_volatile_load_0:
131; GFX12-WGP:       ; %bb.0: ; %entry
132; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
133; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
134; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
135; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
136; GFX12-WGP-NEXT:    scratch_load_b32 v1, off, s2 scope:SCOPE_SYS
137; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
138; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
139; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
140; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
141; GFX12-WGP-NEXT:    s_endpgm
142;
143; GFX12-CU-LABEL: private_volatile_load_0:
144; GFX12-CU:       ; %bb.0: ; %entry
145; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
146; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
147; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
148; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
149; GFX12-CU-NEXT:    scratch_load_b32 v1, off, s2 scope:SCOPE_SYS
150; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
151; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
152; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
153; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
154; GFX12-CU-NEXT:    s_endpgm
155    ptr addrspace(5) %in, ptr addrspace(1) %out) {
156entry:
  ; The volatile load is the access under test; the store below only gives
  ; the loaded value a use.
157  %val = load volatile i32, ptr addrspace(5) %in, align 4
158  store i32 %val, ptr addrspace(1) %out
159  ret void
160}
161
; private_volatile_load_1: same shape as private_volatile_load_0, except that
; the volatile i32 load reads from %in indexed by the value returned by
; @llvm.amdgcn.workitem.id.x (via getelementptr inbounds i32), so the address
; is computed per work-item instead of coming straight from the kernel
; argument. The loaded value is written with an ordinary store to %out.
; The check lines embedded in the definition are autogenerated; regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.
162define amdgpu_kernel void @private_volatile_load_1(
163; GFX6-LABEL: private_volatile_load_1:
164; GFX6:       ; %bb.0: ; %entry
165; GFX6-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
166; GFX6-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
167; GFX6-NEXT:    s_mov_b32 s14, -1
168; GFX6-NEXT:    s_mov_b32 s15, 0xe8f000
169; GFX6-NEXT:    s_add_u32 s12, s12, s11
170; GFX6-NEXT:    s_addc_u32 s13, s13, 0
171; GFX6-NEXT:    s_mov_b64 s[0:1], s[4:5]
172; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
173; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
174; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX6-NEXT:    s_mov_b32 s7, s1
176; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
177; GFX6-NEXT:    s_mov_b32 s5, 0xf000
178; GFX6-NEXT:    s_mov_b32 s6, -1
179; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
180; GFX6-NEXT:    s_mov_b32 s1, s7
181; GFX6-NEXT:    s_mov_b32 s2, s6
182; GFX6-NEXT:    s_mov_b32 s3, s5
183; GFX6-NEXT:    s_mov_b32 s5, 2
184; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
185; GFX6-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
186; GFX6-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
187; GFX6-NEXT:    s_waitcnt vmcnt(0)
188; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
189; GFX6-NEXT:    s_endpgm
190;
191; GFX7-LABEL: private_volatile_load_1:
192; GFX7:       ; %bb.0: ; %entry
193; GFX7-NEXT:    s_add_u32 s0, s0, s15
194; GFX7-NEXT:    s_addc_u32 s1, s1, 0
195; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
196; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
197; GFX7-NEXT:    s_mov_b32 s7, 2
198; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
199; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
200; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
201; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc
202; GFX7-NEXT:    s_waitcnt vmcnt(0)
203; GFX7-NEXT:    v_mov_b32_e32 v0, s4
204; GFX7-NEXT:    v_mov_b32_e32 v1, s5
205; GFX7-NEXT:    flat_store_dword v[0:1], v2
206; GFX7-NEXT:    s_endpgm
207;
208; GFX10-WGP-LABEL: private_volatile_load_1:
209; GFX10-WGP:       ; %bb.0: ; %entry
210; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
211; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
212; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
213; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x0
214; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
215; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
216; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
217; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
219; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
220; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
221; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
222; GFX10-WGP-NEXT:    s_endpgm
223;
224; GFX10-CU-LABEL: private_volatile_load_1:
225; GFX10-CU:       ; %bb.0: ; %entry
226; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
227; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
228; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
229; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x0
230; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
231; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
232; GFX10-CU-NEXT:    s_mov_b32 s6, 2
233; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
234; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
235; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
236; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
237; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
238; GFX10-CU-NEXT:    s_endpgm
239;
240; SKIP-CACHE-INV-LABEL: private_volatile_load_1:
241; SKIP-CACHE-INV:       ; %bb.0: ; %entry
242; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
243; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
244; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
245; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
246; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
247; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
248; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
249; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
250; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
251; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
252; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
253; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
254; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
255; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
256; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
257; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
258; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
259; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
260; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 2
261; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
262; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
263; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
264; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
265; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
266; SKIP-CACHE-INV-NEXT:    s_endpgm
267;
268; GFX11-WGP-LABEL: private_volatile_load_1:
269; GFX11-WGP:       ; %bb.0: ; %entry
270; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
271; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x0
272; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
273; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
274; GFX11-WGP-NEXT:    s_mov_b32 s2, 0x3ff
275; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
276; GFX11-WGP-NEXT:    s_mov_b32 s2, 2
277; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
278; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
279; GFX11-WGP-NEXT:    scratch_load_b32 v1, v1, off glc dlc
280; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
281; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
282; GFX11-WGP-NEXT:    s_endpgm
283;
284; GFX11-CU-LABEL: private_volatile_load_1:
285; GFX11-CU:       ; %bb.0: ; %entry
286; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
287; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x0
288; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
289; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
290; GFX11-CU-NEXT:    s_mov_b32 s2, 0x3ff
291; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s2
292; GFX11-CU-NEXT:    s_mov_b32 s2, 2
293; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
295; GFX11-CU-NEXT:    scratch_load_b32 v1, v1, off glc dlc
296; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
297; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
298; GFX11-CU-NEXT:    s_endpgm
299;
300; GFX12-WGP-LABEL: private_volatile_load_1:
301; GFX12-WGP:       ; %bb.0: ; %entry
302; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
303; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
304; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
305; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
306; GFX12-WGP-NEXT:    s_mov_b32 s3, 0x3ff
307; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
308; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s3
309; GFX12-WGP-NEXT:    s_mov_b32 s3, 2
310; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
311; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
312; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
313; GFX12-WGP-NEXT:    scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS
314; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
315; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
316; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
317; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
318; GFX12-WGP-NEXT:    s_endpgm
319;
320; GFX12-CU-LABEL: private_volatile_load_1:
321; GFX12-CU:       ; %bb.0: ; %entry
322; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
323; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
324; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
325; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
326; GFX12-CU-NEXT:    s_mov_b32 s3, 0x3ff
327; GFX12-CU-NEXT:    s_wait_alu 0xfffe
328; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s3
329; GFX12-CU-NEXT:    s_mov_b32 s3, 2
330; GFX12-CU-NEXT:    s_wait_alu 0xfffe
331; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
332; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
333; GFX12-CU-NEXT:    scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS
334; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
335; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
336; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
337; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
338; GFX12-CU-NEXT:    s_endpgm
339    ptr addrspace(5) %in, ptr addrspace(1) %out) {
340entry:
  ; %tid selects the i32 element of %in that this work-item loads.
341  %tid = call i32 @llvm.amdgcn.workitem.id.x()
342  %val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
  ; The volatile load is the access under test; the store below only gives
  ; the loaded value a use.
343  %val = load volatile i32, ptr addrspace(5) %val.gep, align 4
344  store i32 %val, ptr addrspace(1) %out
345  ret void
346}
347
; private_volatile_store_0: kernel that reads an i32 with an ordinary load from
; the addrspace(1) pointer argument %in and writes it with a volatile store to
; the addrspace(5) pointer argument %out (private memory, per the test name).
; The check lines embedded in the definition are autogenerated; regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.
348define amdgpu_kernel void @private_volatile_store_0(
349; GFX6-LABEL: private_volatile_store_0:
350; GFX6:       ; %bb.0: ; %entry
351; GFX6-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
352; GFX6-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
353; GFX6-NEXT:    s_mov_b32 s14, -1
354; GFX6-NEXT:    s_mov_b32 s15, 0xe8f000
355; GFX6-NEXT:    s_add_u32 s12, s12, s11
356; GFX6-NEXT:    s_addc_u32 s13, s13, 0
357; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
358; GFX6-NEXT:    s_load_dword s0, s[4:5], 0xb
359; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
361; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX6-NEXT:    v_mov_b32_e32 v0, s1
363; GFX6-NEXT:    v_mov_b32_e32 v1, s0
364; GFX6-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
365; GFX6-NEXT:    s_waitcnt vmcnt(0)
366; GFX6-NEXT:    s_endpgm
367;
368; GFX7-LABEL: private_volatile_store_0:
369; GFX7:       ; %bb.0: ; %entry
370; GFX7-NEXT:    s_add_u32 s0, s0, s15
371; GFX7-NEXT:    s_addc_u32 s1, s1, 0
372; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
373; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
374; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
376; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
377; GFX7-NEXT:    v_mov_b32_e32 v0, s5
378; GFX7-NEXT:    v_mov_b32_e32 v1, s4
379; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
380; GFX7-NEXT:    s_waitcnt vmcnt(0)
381; GFX7-NEXT:    s_endpgm
382;
383; GFX10-WGP-LABEL: private_volatile_store_0:
384; GFX10-WGP:       ; %bb.0: ; %entry
385; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
386; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
387; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
388; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
389; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
390; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
391; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
393; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
394; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
395; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
396; GFX10-WGP-NEXT:    s_endpgm
397;
398; GFX10-CU-LABEL: private_volatile_store_0:
399; GFX10-CU:       ; %bb.0: ; %entry
400; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
401; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
402; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
403; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
404; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
405; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
406; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
407; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
408; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
409; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
410; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
411; GFX10-CU-NEXT:    s_endpgm
412;
413; SKIP-CACHE-INV-LABEL: private_volatile_store_0:
414; SKIP-CACHE-INV:       ; %bb.0: ; %entry
415; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
416; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
417; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
418; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
419; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
420; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
421; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
422; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
423; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
424; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
425; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
426; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
427; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
428; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
429; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
430; SKIP-CACHE-INV-NEXT:    s_endpgm
431;
432; GFX11-WGP-LABEL: private_volatile_store_0:
433; GFX11-WGP:       ; %bb.0: ; %entry
434; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
435; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
436; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
437; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
438; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
439; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
440; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0 dlc
441; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
442; GFX11-WGP-NEXT:    s_endpgm
443;
444; GFX11-CU-LABEL: private_volatile_store_0:
445; GFX11-CU:       ; %bb.0: ; %entry
446; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
447; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
448; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
450; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
452; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0 dlc
453; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
454; GFX11-CU-NEXT:    s_endpgm
455;
456; GFX12-WGP-LABEL: private_volatile_store_0:
457; GFX12-WGP:       ; %bb.0: ; %entry
458; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
459; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
460; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
461; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
462; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
463; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
464; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
465; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
466; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
467; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
468; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
469; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
470; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
471; GFX12-WGP-NEXT:    s_endpgm
472;
473; GFX12-CU-LABEL: private_volatile_store_0:
474; GFX12-CU:       ; %bb.0: ; %entry
475; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
476; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
477; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
478; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
479; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
480; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
481; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
482; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
483; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
484; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
485; GFX12-CU-NEXT:    s_wait_storecnt 0x0
486; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
487; GFX12-CU-NEXT:    s_wait_storecnt 0x0
488; GFX12-CU-NEXT:    s_endpgm
489    ptr addrspace(1) %in, ptr addrspace(5) %out) {
490entry:
  ; The ordinary load only produces a value to store; the volatile store is
  ; the access under test.
491  %val = load i32, ptr addrspace(1) %in, align 4
492  store volatile i32 %val, ptr addrspace(5) %out
493  ret void
494}
495
; private_volatile_store_1: same shape as private_volatile_store_0, except that
; the volatile i32 store writes to %out indexed by the value returned by
; @llvm.amdgcn.workitem.id.x (via getelementptr inbounds i32), so the address
; is computed per work-item instead of coming straight from the kernel
; argument. The stored value comes from an ordinary load of %in.
; The check lines embedded in the definition are autogenerated; regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.
496define amdgpu_kernel void @private_volatile_store_1(
497; GFX6-LABEL: private_volatile_store_1:
498; GFX6:       ; %bb.0: ; %entry
499; GFX6-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
500; GFX6-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
501; GFX6-NEXT:    s_mov_b32 s14, -1
502; GFX6-NEXT:    s_mov_b32 s15, 0xe8f000
503; GFX6-NEXT:    s_add_u32 s12, s12, s11
504; GFX6-NEXT:    s_addc_u32 s13, s13, 0
505; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
506; GFX6-NEXT:    s_load_dword s1, s[4:5], 0xb
507; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX6-NEXT:    s_load_dword s0, s[2:3], 0x0
509; GFX6-NEXT:    s_mov_b32 s2, 2
510; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
511; GFX6-NEXT:    v_add_i32_e64 v1, s[2:3], s1, v0
512; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
513; GFX6-NEXT:    v_mov_b32_e32 v0, s0
514; GFX6-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
515; GFX6-NEXT:    s_waitcnt vmcnt(0)
516; GFX6-NEXT:    s_endpgm
517;
518; GFX7-LABEL: private_volatile_store_1:
519; GFX7:       ; %bb.0: ; %entry
520; GFX7-NEXT:    s_add_u32 s0, s0, s15
521; GFX7-NEXT:    s_addc_u32 s1, s1, 0
522; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
523; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
524; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
526; GFX7-NEXT:    s_mov_b32 s6, 2
527; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
528; GFX7-NEXT:    v_add_i32_e64 v1, s[6:7], s5, v0
529; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
530; GFX7-NEXT:    v_mov_b32_e32 v0, s4
531; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
532; GFX7-NEXT:    s_waitcnt vmcnt(0)
533; GFX7-NEXT:    s_endpgm
534;
535; GFX10-WGP-LABEL: private_volatile_store_1:
536; GFX10-WGP:       ; %bb.0: ; %entry
537; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
538; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
539; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
540; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
541; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x0
543; GFX10-WGP-NEXT:    s_mov_b32 s5, 2
544; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v0, s5, s6
545; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
546; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
547; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
548; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
549; GFX10-WGP-NEXT:    s_endpgm
550;
551; GFX10-CU-LABEL: private_volatile_store_1:
552; GFX10-CU:       ; %bb.0: ; %entry
553; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
554; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
555; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
556; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
557; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
558; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x0
559; GFX10-CU-NEXT:    s_mov_b32 s5, 2
560; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v0, s5, s6
561; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
562; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
563; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
564; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
565; GFX10-CU-NEXT:    s_endpgm
566;
567; SKIP-CACHE-INV-LABEL: private_volatile_store_1:
568; SKIP-CACHE-INV:       ; %bb.0: ; %entry
569; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
570; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
571; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
572; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
573; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
574; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
575; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
576; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
577; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
578; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
579; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 2
580; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
581; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v1, s[2:3], s1, v0
582; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
583; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
584; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
585; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
586; SKIP-CACHE-INV-NEXT:    s_endpgm
587;
588; GFX11-WGP-LABEL: private_volatile_store_1:
589; GFX11-WGP:       ; %bb.0: ; %entry
590; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
591; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
592; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
593; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
594; GFX11-WGP-NEXT:    s_mov_b32 s1, 0x3ff
595; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
596; GFX11-WGP-NEXT:    s_mov_b32 s1, 2
597; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v0, s1, s2
598; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
599; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
600; GFX11-WGP-NEXT:    scratch_store_b32 v1, v0, off dlc
601; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
602; GFX11-WGP-NEXT:    s_endpgm
603;
604; GFX11-CU-LABEL: private_volatile_store_1:
605; GFX11-CU:       ; %bb.0: ; %entry
606; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
607; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
608; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
609; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
610; GFX11-CU-NEXT:    s_mov_b32 s1, 0x3ff
611; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s1
612; GFX11-CU-NEXT:    s_mov_b32 s1, 2
613; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v0, s1, s2
614; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
615; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
616; GFX11-CU-NEXT:    scratch_store_b32 v1, v0, off dlc
617; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
618; GFX11-CU-NEXT:    s_endpgm
619;
620; GFX12-WGP-LABEL: private_volatile_store_1:
621; GFX12-WGP:       ; %bb.0: ; %entry
622; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
623; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
624; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
625; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
626; GFX12-WGP-NEXT:    s_mov_b32 s2, 0x3ff
627; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
628; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s2
629; GFX12-WGP-NEXT:    s_mov_b32 s2, 2
630; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
631; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
632; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
633; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
634; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
635; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
636; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
637; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
638; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
639; GFX12-WGP-NEXT:    scratch_store_b32 v1, v0, s0 scope:SCOPE_SYS
640; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
641; GFX12-WGP-NEXT:    s_endpgm
642;
643; GFX12-CU-LABEL: private_volatile_store_1:
644; GFX12-CU:       ; %bb.0: ; %entry
645; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
646; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
647; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
648; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
649; GFX12-CU-NEXT:    s_mov_b32 s2, 0x3ff
650; GFX12-CU-NEXT:    s_wait_alu 0xfffe
651; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s2
652; GFX12-CU-NEXT:    s_mov_b32 s2, 2
653; GFX12-CU-NEXT:    s_wait_alu 0xfffe
654; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
655; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
656; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
657; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
658; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
659; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
660; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
661; GFX12-CU-NEXT:    s_wait_storecnt 0x0
662; GFX12-CU-NEXT:    scratch_store_b32 v1, v0, s0 scope:SCOPE_SYS
663; GFX12-CU-NEXT:    s_wait_storecnt 0x0
664; GFX12-CU-NEXT:    s_endpgm
665    ptr addrspace(1) %in, ptr addrspace(5) %out) {
666entry:
  ; %tid selects the i32 element of %out that this work-item writes.
667  %tid = call i32 @llvm.amdgcn.workitem.id.x()
668  %val = load i32, ptr addrspace(1) %in, align 4
669  %out.gep = getelementptr inbounds i32, ptr addrspace(5) %out, i32 %tid
  ; The volatile store is the access under test.
670  store volatile i32 %val, ptr addrspace(5) %out.gep
671  ret void
672}
673
; Intrinsic called by private_volatile_load_1 and private_volatile_store_1 to
; obtain the i32 index (%tid) used in their getelementptr address computation.
674declare i32 @llvm.amdgcn.workitem.id.x()
675