xref: /llvm-project/llvm/test/CodeGen/AMDGPU/flat_atomics.ll (revision 5a3299a684d7d8c40f48d732e5b80a8bd29aa882)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN1 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN2 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s
5
; Non-returning i32 atomic add (result unused) at a constant offset of
; 4 x i32 = 16 bytes from %out. The gfx900 run (GCN3 prefix) encodes the
; constant in the instruction's offset field; the bonaire and tonga runs
; (GCN1 and GCN2 prefixes) add it to the address with s_add_u32/s_addc_u32.
6define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) {
7; GCN1-LABEL: atomic_add_i32_offset:
8; GCN1:       ; %bb.0: ; %entry
9; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
10; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
11; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
12; GCN1-NEXT:    s_add_u32 s0, s0, 16
13; GCN1-NEXT:    s_addc_u32 s1, s1, 0
14; GCN1-NEXT:    v_mov_b32_e32 v0, s0
15; GCN1-NEXT:    v_mov_b32_e32 v1, s1
16; GCN1-NEXT:    v_mov_b32_e32 v2, s2
17; GCN1-NEXT:    flat_atomic_add v[0:1], v2
18; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
19; GCN1-NEXT:    buffer_wbinvl1_vol
20; GCN1-NEXT:    s_endpgm
21;
22; GCN2-LABEL: atomic_add_i32_offset:
23; GCN2:       ; %bb.0: ; %entry
24; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
25; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
26; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
27; GCN2-NEXT:    s_add_u32 s0, s0, 16
28; GCN2-NEXT:    s_addc_u32 s1, s1, 0
29; GCN2-NEXT:    v_mov_b32_e32 v0, s0
30; GCN2-NEXT:    v_mov_b32_e32 v1, s1
31; GCN2-NEXT:    v_mov_b32_e32 v2, s2
32; GCN2-NEXT:    flat_atomic_add v[0:1], v2
33; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
34; GCN2-NEXT:    buffer_wbinvl1_vol
35; GCN2-NEXT:    s_endpgm
36;
37; GCN3-LABEL: atomic_add_i32_offset:
38; GCN3:       ; %bb.0: ; %entry
39; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
40; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
41; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
42; GCN3-NEXT:    v_mov_b32_e32 v0, s0
43; GCN3-NEXT:    v_mov_b32_e32 v1, s1
44; GCN3-NEXT:    v_mov_b32_e32 v2, s2
45; GCN3-NEXT:    flat_atomic_add v[0:1], v2 offset:16
46; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
47; GCN3-NEXT:    buffer_wbinvl1_vol
48; GCN3-NEXT:    s_endpgm
49entry:
50  %gep = getelementptr i32, ptr %out, i32 4
; Marked volatile like the other atomicrmw tests in this file.
51  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
52  ret void
53}
54
; Non-returning i32 atomic add at a constant offset of 1023 x i32 = 4092
; bytes (0xffc). The gfx900 run still encodes this in the instruction
; (offset:4092); the bonaire and tonga runs add 0xffc to the address with
; s_add_u32/s_addc_u32.
55define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) {
56; GCN1-LABEL: atomic_add_i32_max_offset:
57; GCN1:       ; %bb.0: ; %entry
58; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
59; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
60; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
61; GCN1-NEXT:    s_add_u32 s0, s0, 0xffc
62; GCN1-NEXT:    s_addc_u32 s1, s1, 0
63; GCN1-NEXT:    v_mov_b32_e32 v0, s0
64; GCN1-NEXT:    v_mov_b32_e32 v1, s1
65; GCN1-NEXT:    v_mov_b32_e32 v2, s2
66; GCN1-NEXT:    flat_atomic_add v[0:1], v2
67; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
68; GCN1-NEXT:    buffer_wbinvl1_vol
69; GCN1-NEXT:    s_endpgm
70;
71; GCN2-LABEL: atomic_add_i32_max_offset:
72; GCN2:       ; %bb.0: ; %entry
73; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
74; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
75; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
76; GCN2-NEXT:    s_add_u32 s0, s0, 0xffc
77; GCN2-NEXT:    s_addc_u32 s1, s1, 0
78; GCN2-NEXT:    v_mov_b32_e32 v0, s0
79; GCN2-NEXT:    v_mov_b32_e32 v1, s1
80; GCN2-NEXT:    v_mov_b32_e32 v2, s2
81; GCN2-NEXT:    flat_atomic_add v[0:1], v2
82; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
83; GCN2-NEXT:    buffer_wbinvl1_vol
84; GCN2-NEXT:    s_endpgm
85;
86; GCN3-LABEL: atomic_add_i32_max_offset:
87; GCN3:       ; %bb.0: ; %entry
88; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
89; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
90; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
91; GCN3-NEXT:    v_mov_b32_e32 v0, s0
92; GCN3-NEXT:    v_mov_b32_e32 v1, s1
93; GCN3-NEXT:    v_mov_b32_e32 v2, s2
94; GCN3-NEXT:    flat_atomic_add v[0:1], v2 offset:4092
95; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
96; GCN3-NEXT:    buffer_wbinvl1_vol
97; GCN3-NEXT:    s_endpgm
98entry:
; Element index 1023 of i32 -> byte offset 4092.
99  %gep = getelementptr i32, ptr %out, i32 1023
100  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
101  ret void
102}
103
; Non-returning i32 atomic add at a constant offset of 1024 x i32 = 4096
; bytes (0x1000). No run encodes this offset in the atomic instruction:
; the gfx900 run adds 0x1000 with v_add_co_u32/v_addc_co_u32, and the
; bonaire and tonga runs add it with s_add_u32/s_addc_u32.
104define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) {
105; GCN1-LABEL: atomic_add_i32_max_offset_p1:
106; GCN1:       ; %bb.0: ; %entry
107; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
108; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
109; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
110; GCN1-NEXT:    s_add_u32 s0, s0, 0x1000
111; GCN1-NEXT:    s_addc_u32 s1, s1, 0
112; GCN1-NEXT:    v_mov_b32_e32 v0, s0
113; GCN1-NEXT:    v_mov_b32_e32 v1, s1
114; GCN1-NEXT:    v_mov_b32_e32 v2, s2
115; GCN1-NEXT:    flat_atomic_add v[0:1], v2
116; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
117; GCN1-NEXT:    buffer_wbinvl1_vol
118; GCN1-NEXT:    s_endpgm
119;
120; GCN2-LABEL: atomic_add_i32_max_offset_p1:
121; GCN2:       ; %bb.0: ; %entry
122; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
123; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
124; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
125; GCN2-NEXT:    s_add_u32 s0, s0, 0x1000
126; GCN2-NEXT:    s_addc_u32 s1, s1, 0
127; GCN2-NEXT:    v_mov_b32_e32 v0, s0
128; GCN2-NEXT:    v_mov_b32_e32 v1, s1
129; GCN2-NEXT:    v_mov_b32_e32 v2, s2
130; GCN2-NEXT:    flat_atomic_add v[0:1], v2
131; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
132; GCN2-NEXT:    buffer_wbinvl1_vol
133; GCN2-NEXT:    s_endpgm
134;
135; GCN3-LABEL: atomic_add_i32_max_offset_p1:
136; GCN3:       ; %bb.0: ; %entry
137; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
138; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
139; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
140; GCN3-NEXT:    v_mov_b32_e32 v0, s0
141; GCN3-NEXT:    v_mov_b32_e32 v1, s1
142; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
143; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
144; GCN3-NEXT:    v_mov_b32_e32 v2, s2
145; GCN3-NEXT:    flat_atomic_add v[0:1], v2
146; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
147; GCN3-NEXT:    buffer_wbinvl1_vol
148; GCN3-NEXT:    s_endpgm
149entry:
; Element index 1024 of i32 -> byte offset 4096.
150  %gep = getelementptr i32, ptr %out, i32 1024
151  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
152  ret void
153}
154
; Returning i32 atomic add at a constant offset of 4 x i32 = 16 bytes: the
; value produced by the atomicrmw is stored to %out2. All three runs use
; the glc form of flat_atomic_add with a destination VGPR, then emit
; flat_store_dword. Only the gfx900 run encodes the offset in the
; instruction (offset:16).
155define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
156; GCN1-LABEL: atomic_add_i32_ret_offset:
157; GCN1:       ; %bb.0: ; %entry
158; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
159; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
160; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
161; GCN1-NEXT:    s_add_u32 s0, s0, 16
162; GCN1-NEXT:    s_addc_u32 s1, s1, 0
163; GCN1-NEXT:    v_mov_b32_e32 v0, s0
164; GCN1-NEXT:    v_mov_b32_e32 v1, s1
165; GCN1-NEXT:    v_mov_b32_e32 v2, s4
166; GCN1-NEXT:    flat_atomic_add v2, v[0:1], v2 glc
167; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
168; GCN1-NEXT:    buffer_wbinvl1_vol
169; GCN1-NEXT:    v_mov_b32_e32 v0, s2
170; GCN1-NEXT:    v_mov_b32_e32 v1, s3
171; GCN1-NEXT:    flat_store_dword v[0:1], v2
172; GCN1-NEXT:    s_endpgm
173;
174; GCN2-LABEL: atomic_add_i32_ret_offset:
175; GCN2:       ; %bb.0: ; %entry
176; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
177; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
178; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
179; GCN2-NEXT:    s_add_u32 s0, s0, 16
180; GCN2-NEXT:    s_addc_u32 s1, s1, 0
181; GCN2-NEXT:    v_mov_b32_e32 v0, s0
182; GCN2-NEXT:    v_mov_b32_e32 v1, s1
183; GCN2-NEXT:    v_mov_b32_e32 v2, s4
184; GCN2-NEXT:    flat_atomic_add v2, v[0:1], v2 glc
185; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
186; GCN2-NEXT:    buffer_wbinvl1_vol
187; GCN2-NEXT:    v_mov_b32_e32 v0, s2
188; GCN2-NEXT:    v_mov_b32_e32 v1, s3
189; GCN2-NEXT:    flat_store_dword v[0:1], v2
190; GCN2-NEXT:    s_endpgm
191;
192; GCN3-LABEL: atomic_add_i32_ret_offset:
193; GCN3:       ; %bb.0: ; %entry
194; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
195; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
196; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
197; GCN3-NEXT:    v_mov_b32_e32 v0, s0
198; GCN3-NEXT:    v_mov_b32_e32 v1, s1
199; GCN3-NEXT:    v_mov_b32_e32 v2, s6
200; GCN3-NEXT:    flat_atomic_add v2, v[0:1], v2 offset:16 glc
201; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
202; GCN3-NEXT:    buffer_wbinvl1_vol
203; GCN3-NEXT:    v_mov_b32_e32 v0, s2
204; GCN3-NEXT:    v_mov_b32_e32 v1, s3
205; GCN3-NEXT:    flat_store_dword v[0:1], v2
206; GCN3-NEXT:    s_endpgm
207entry:
208  %gep = getelementptr i32, ptr %out, i32 4
209  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
; Using %val is what makes this the returning variant.
210  store i32 %val, ptr %out2
211  ret void
212}
213
; Non-returning i32 atomic add at %out + %index x i32 + 16 bytes, with a
; runtime i64 index. All runs scale the index with s_lshl_b64 by 2 and add
; it to the base in SGPRs. The gfx900 run encodes the constant 16 in the
; instruction (offset:16); the bonaire and tonga runs add it with a
; second s_add_u32/s_addc_u32 pair.
214define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
215; GCN1-LABEL: atomic_add_i32_addr64_offset:
216; GCN1:       ; %bb.0: ; %entry
217; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
218; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
219; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
220; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
221; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
222; GCN1-NEXT:    s_add_u32 s0, s2, s0
223; GCN1-NEXT:    s_addc_u32 s1, s3, s1
224; GCN1-NEXT:    s_add_u32 s0, s0, 16
225; GCN1-NEXT:    s_addc_u32 s1, s1, 0
226; GCN1-NEXT:    v_mov_b32_e32 v0, s0
227; GCN1-NEXT:    v_mov_b32_e32 v1, s1
228; GCN1-NEXT:    v_mov_b32_e32 v2, s4
229; GCN1-NEXT:    flat_atomic_add v[0:1], v2
230; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
231; GCN1-NEXT:    buffer_wbinvl1_vol
232; GCN1-NEXT:    s_endpgm
233;
234; GCN2-LABEL: atomic_add_i32_addr64_offset:
235; GCN2:       ; %bb.0: ; %entry
236; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
237; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
238; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
239; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
240; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
241; GCN2-NEXT:    s_add_u32 s0, s2, s0
242; GCN2-NEXT:    s_addc_u32 s1, s3, s1
243; GCN2-NEXT:    s_add_u32 s0, s0, 16
244; GCN2-NEXT:    s_addc_u32 s1, s1, 0
245; GCN2-NEXT:    v_mov_b32_e32 v0, s0
246; GCN2-NEXT:    v_mov_b32_e32 v1, s1
247; GCN2-NEXT:    v_mov_b32_e32 v2, s4
248; GCN2-NEXT:    flat_atomic_add v[0:1], v2
249; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
250; GCN2-NEXT:    buffer_wbinvl1_vol
251; GCN2-NEXT:    s_endpgm
252;
253; GCN3-LABEL: atomic_add_i32_addr64_offset:
254; GCN3:       ; %bb.0: ; %entry
255; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
256; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
257; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
258; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
259; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
260; GCN3-NEXT:    s_add_u32 s0, s2, s0
261; GCN3-NEXT:    s_addc_u32 s1, s3, s1
262; GCN3-NEXT:    v_mov_b32_e32 v0, s0
263; GCN3-NEXT:    v_mov_b32_e32 v1, s1
264; GCN3-NEXT:    v_mov_b32_e32 v2, s6
265; GCN3-NEXT:    flat_atomic_add v[0:1], v2 offset:16
266; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
267; GCN3-NEXT:    buffer_wbinvl1_vol
268; GCN3-NEXT:    s_endpgm
269entry:
; Variable part of the address first, then the constant 4-element offset.
270  %ptr = getelementptr i32, ptr %out, i64 %index
271  %gep = getelementptr i32, ptr %ptr, i32 4
272  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
273  ret void
274}
275
; Returning i32 atomic add at %out + %index x i32 + 16 bytes; the value
; produced by the atomicrmw is stored to %out2. All runs scale the index
; with s_lshl_b64 by 2 and use the glc form of flat_atomic_add. Only the
; gfx900 run encodes the constant 16 in the instruction (offset:16).
276define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
277; GCN1-LABEL: atomic_add_i32_ret_addr64_offset:
278; GCN1:       ; %bb.0: ; %entry
279; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
280; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
281; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
282; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
283; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
284; GCN1-NEXT:    s_add_u32 s0, s0, s4
285; GCN1-NEXT:    s_addc_u32 s1, s1, s5
286; GCN1-NEXT:    s_add_u32 s0, s0, 16
287; GCN1-NEXT:    s_addc_u32 s1, s1, 0
288; GCN1-NEXT:    v_mov_b32_e32 v0, s0
289; GCN1-NEXT:    v_mov_b32_e32 v1, s1
290; GCN1-NEXT:    v_mov_b32_e32 v2, s8
291; GCN1-NEXT:    flat_atomic_add v2, v[0:1], v2 glc
292; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
293; GCN1-NEXT:    buffer_wbinvl1_vol
294; GCN1-NEXT:    v_mov_b32_e32 v0, s2
295; GCN1-NEXT:    v_mov_b32_e32 v1, s3
296; GCN1-NEXT:    flat_store_dword v[0:1], v2
297; GCN1-NEXT:    s_endpgm
298;
299; GCN2-LABEL: atomic_add_i32_ret_addr64_offset:
300; GCN2:       ; %bb.0: ; %entry
301; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
302; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
303; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
304; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
305; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
306; GCN2-NEXT:    s_add_u32 s0, s0, s4
307; GCN2-NEXT:    s_addc_u32 s1, s1, s5
308; GCN2-NEXT:    s_add_u32 s0, s0, 16
309; GCN2-NEXT:    s_addc_u32 s1, s1, 0
310; GCN2-NEXT:    v_mov_b32_e32 v0, s0
311; GCN2-NEXT:    v_mov_b32_e32 v1, s1
312; GCN2-NEXT:    v_mov_b32_e32 v2, s8
313; GCN2-NEXT:    flat_atomic_add v2, v[0:1], v2 glc
314; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
315; GCN2-NEXT:    buffer_wbinvl1_vol
316; GCN2-NEXT:    v_mov_b32_e32 v0, s2
317; GCN2-NEXT:    v_mov_b32_e32 v1, s3
318; GCN2-NEXT:    flat_store_dword v[0:1], v2
319; GCN2-NEXT:    s_endpgm
320;
321; GCN3-LABEL: atomic_add_i32_ret_addr64_offset:
322; GCN3:       ; %bb.0: ; %entry
323; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
324; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
325; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
326; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
327; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
328; GCN3-NEXT:    s_add_u32 s0, s0, s4
329; GCN3-NEXT:    s_addc_u32 s1, s1, s5
330; GCN3-NEXT:    v_mov_b32_e32 v0, s0
331; GCN3-NEXT:    v_mov_b32_e32 v1, s1
332; GCN3-NEXT:    v_mov_b32_e32 v2, s8
333; GCN3-NEXT:    flat_atomic_add v2, v[0:1], v2 offset:16 glc
334; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
335; GCN3-NEXT:    buffer_wbinvl1_vol
336; GCN3-NEXT:    v_mov_b32_e32 v0, s2
337; GCN3-NEXT:    v_mov_b32_e32 v1, s3
338; GCN3-NEXT:    flat_store_dword v[0:1], v2
339; GCN3-NEXT:    s_endpgm
340entry:
; Variable part of the address first, then the constant 4-element offset.
341  %ptr = getelementptr i32, ptr %out, i64 %index
342  %gep = getelementptr i32, ptr %ptr, i32 4
343  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
; Using %val is what makes this the returning variant.
344  store i32 %val, ptr %out2
345  ret void
346}
347
; Baseline case: non-returning i32 atomic add directly on %out, with no
; offset and no index. All three runs emit the same instruction sequence
; apart from the kernel-argument load offsets (0x9/0xb on bonaire,
; 0x24/0x2c on tonga and gfx900).
348define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) {
349; GCN1-LABEL: atomic_add_i32:
350; GCN1:       ; %bb.0: ; %entry
351; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
352; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
353; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
354; GCN1-NEXT:    v_mov_b32_e32 v0, s0
355; GCN1-NEXT:    v_mov_b32_e32 v1, s1
356; GCN1-NEXT:    v_mov_b32_e32 v2, s2
357; GCN1-NEXT:    flat_atomic_add v[0:1], v2
358; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
359; GCN1-NEXT:    buffer_wbinvl1_vol
360; GCN1-NEXT:    s_endpgm
361;
362; GCN2-LABEL: atomic_add_i32:
363; GCN2:       ; %bb.0: ; %entry
364; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
365; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
366; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
367; GCN2-NEXT:    v_mov_b32_e32 v0, s0
368; GCN2-NEXT:    v_mov_b32_e32 v1, s1
369; GCN2-NEXT:    v_mov_b32_e32 v2, s2
370; GCN2-NEXT:    flat_atomic_add v[0:1], v2
371; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
372; GCN2-NEXT:    buffer_wbinvl1_vol
373; GCN2-NEXT:    s_endpgm
374;
375; GCN3-LABEL: atomic_add_i32:
376; GCN3:       ; %bb.0: ; %entry
377; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
378; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
379; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
380; GCN3-NEXT:    v_mov_b32_e32 v0, s0
381; GCN3-NEXT:    v_mov_b32_e32 v1, s1
382; GCN3-NEXT:    v_mov_b32_e32 v2, s2
383; GCN3-NEXT:    flat_atomic_add v[0:1], v2
384; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
385; GCN3-NEXT:    buffer_wbinvl1_vol
386; GCN3-NEXT:    s_endpgm
387entry:
388  %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
389  ret void
390}
391
; Returning i32 atomic add directly on %out (no offset, no index); the
; value produced by the atomicrmw is stored to %out2. All three runs use
; the glc form of flat_atomic_add with a destination VGPR, followed by
; flat_store_dword.
392define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) {
393; GCN1-LABEL: atomic_add_i32_ret:
394; GCN1:       ; %bb.0: ; %entry
395; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
396; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
397; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
398; GCN1-NEXT:    v_mov_b32_e32 v0, s0
399; GCN1-NEXT:    v_mov_b32_e32 v1, s1
400; GCN1-NEXT:    v_mov_b32_e32 v2, s4
401; GCN1-NEXT:    flat_atomic_add v2, v[0:1], v2 glc
402; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
403; GCN1-NEXT:    buffer_wbinvl1_vol
404; GCN1-NEXT:    v_mov_b32_e32 v0, s2
405; GCN1-NEXT:    v_mov_b32_e32 v1, s3
406; GCN1-NEXT:    flat_store_dword v[0:1], v2
407; GCN1-NEXT:    s_endpgm
408;
409; GCN2-LABEL: atomic_add_i32_ret:
410; GCN2:       ; %bb.0: ; %entry
411; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
412; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
413; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
414; GCN2-NEXT:    v_mov_b32_e32 v0, s0
415; GCN2-NEXT:    v_mov_b32_e32 v1, s1
416; GCN2-NEXT:    v_mov_b32_e32 v2, s4
417; GCN2-NEXT:    flat_atomic_add v2, v[0:1], v2 glc
418; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
419; GCN2-NEXT:    buffer_wbinvl1_vol
420; GCN2-NEXT:    v_mov_b32_e32 v0, s2
421; GCN2-NEXT:    v_mov_b32_e32 v1, s3
422; GCN2-NEXT:    flat_store_dword v[0:1], v2
423; GCN2-NEXT:    s_endpgm
424;
425; GCN3-LABEL: atomic_add_i32_ret:
426; GCN3:       ; %bb.0: ; %entry
427; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
428; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
429; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
430; GCN3-NEXT:    v_mov_b32_e32 v0, s0
431; GCN3-NEXT:    v_mov_b32_e32 v1, s1
432; GCN3-NEXT:    v_mov_b32_e32 v2, s6
433; GCN3-NEXT:    flat_atomic_add v2, v[0:1], v2 glc
434; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
435; GCN3-NEXT:    buffer_wbinvl1_vol
436; GCN3-NEXT:    v_mov_b32_e32 v0, s2
437; GCN3-NEXT:    v_mov_b32_e32 v1, s3
438; GCN3-NEXT:    flat_store_dword v[0:1], v2
439; GCN3-NEXT:    s_endpgm
440entry:
441  %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
; Using %val is what makes this the returning variant.
442  store i32 %val, ptr %out2
443  ret void
444}
445
; Non-returning i32 atomic add at %out + %index x i32, with a runtime i64
; index and no constant offset. All three runs scale the index with
; s_lshl_b64 by 2, add it to the base with s_add_u32/s_addc_u32, and emit
; flat_atomic_add without an offset field.
446define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) {
447; GCN1-LABEL: atomic_add_i32_addr64:
448; GCN1:       ; %bb.0: ; %entry
449; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
450; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
451; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
452; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
453; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
454; GCN1-NEXT:    s_add_u32 s0, s2, s0
455; GCN1-NEXT:    s_addc_u32 s1, s3, s1
456; GCN1-NEXT:    v_mov_b32_e32 v0, s0
457; GCN1-NEXT:    v_mov_b32_e32 v1, s1
458; GCN1-NEXT:    v_mov_b32_e32 v2, s4
459; GCN1-NEXT:    flat_atomic_add v[0:1], v2
460; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
461; GCN1-NEXT:    buffer_wbinvl1_vol
462; GCN1-NEXT:    s_endpgm
463;
464; GCN2-LABEL: atomic_add_i32_addr64:
465; GCN2:       ; %bb.0: ; %entry
466; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
467; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
468; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
469; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
470; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
471; GCN2-NEXT:    s_add_u32 s0, s2, s0
472; GCN2-NEXT:    s_addc_u32 s1, s3, s1
473; GCN2-NEXT:    v_mov_b32_e32 v0, s0
474; GCN2-NEXT:    v_mov_b32_e32 v1, s1
475; GCN2-NEXT:    v_mov_b32_e32 v2, s4
476; GCN2-NEXT:    flat_atomic_add v[0:1], v2
477; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
478; GCN2-NEXT:    buffer_wbinvl1_vol
479; GCN2-NEXT:    s_endpgm
480;
481; GCN3-LABEL: atomic_add_i32_addr64:
482; GCN3:       ; %bb.0: ; %entry
483; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
484; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
485; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
486; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
487; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
488; GCN3-NEXT:    s_add_u32 s0, s2, s0
489; GCN3-NEXT:    s_addc_u32 s1, s3, s1
490; GCN3-NEXT:    v_mov_b32_e32 v0, s0
491; GCN3-NEXT:    v_mov_b32_e32 v1, s1
492; GCN3-NEXT:    v_mov_b32_e32 v2, s6
493; GCN3-NEXT:    flat_atomic_add v[0:1], v2
494; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
495; GCN3-NEXT:    buffer_wbinvl1_vol
496; GCN3-NEXT:    s_endpgm
497entry:
498  %ptr = getelementptr i32, ptr %out, i64 %index
499  %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
500  ret void
501}
502
; Returning i32 atomic add at %out + %index x i32 (runtime i64 index, no
; constant offset); the value produced by the atomicrmw is stored to
; %out2. All three runs scale the index with s_lshl_b64 by 2 and use the
; glc form of flat_atomic_add without an offset field.
503define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
504; GCN1-LABEL: atomic_add_i32_ret_addr64:
505; GCN1:       ; %bb.0: ; %entry
506; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
507; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
508; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
509; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
510; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
511; GCN1-NEXT:    s_add_u32 s0, s0, s4
512; GCN1-NEXT:    s_addc_u32 s1, s1, s5
513; GCN1-NEXT:    v_mov_b32_e32 v0, s0
514; GCN1-NEXT:    v_mov_b32_e32 v1, s1
515; GCN1-NEXT:    v_mov_b32_e32 v2, s8
516; GCN1-NEXT:    flat_atomic_add v2, v[0:1], v2 glc
517; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
518; GCN1-NEXT:    buffer_wbinvl1_vol
519; GCN1-NEXT:    v_mov_b32_e32 v0, s2
520; GCN1-NEXT:    v_mov_b32_e32 v1, s3
521; GCN1-NEXT:    flat_store_dword v[0:1], v2
522; GCN1-NEXT:    s_endpgm
523;
524; GCN2-LABEL: atomic_add_i32_ret_addr64:
525; GCN2:       ; %bb.0: ; %entry
526; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
527; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
528; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
529; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
530; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
531; GCN2-NEXT:    s_add_u32 s0, s0, s4
532; GCN2-NEXT:    s_addc_u32 s1, s1, s5
533; GCN2-NEXT:    v_mov_b32_e32 v0, s0
534; GCN2-NEXT:    v_mov_b32_e32 v1, s1
535; GCN2-NEXT:    v_mov_b32_e32 v2, s8
536; GCN2-NEXT:    flat_atomic_add v2, v[0:1], v2 glc
537; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
538; GCN2-NEXT:    buffer_wbinvl1_vol
539; GCN2-NEXT:    v_mov_b32_e32 v0, s2
540; GCN2-NEXT:    v_mov_b32_e32 v1, s3
541; GCN2-NEXT:    flat_store_dword v[0:1], v2
542; GCN2-NEXT:    s_endpgm
543;
544; GCN3-LABEL: atomic_add_i32_ret_addr64:
545; GCN3:       ; %bb.0: ; %entry
546; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
547; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
548; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
549; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
550; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
551; GCN3-NEXT:    s_add_u32 s0, s0, s4
552; GCN3-NEXT:    s_addc_u32 s1, s1, s5
553; GCN3-NEXT:    v_mov_b32_e32 v0, s0
554; GCN3-NEXT:    v_mov_b32_e32 v1, s1
555; GCN3-NEXT:    v_mov_b32_e32 v2, s8
556; GCN3-NEXT:    flat_atomic_add v2, v[0:1], v2 glc
557; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
558; GCN3-NEXT:    buffer_wbinvl1_vol
559; GCN3-NEXT:    v_mov_b32_e32 v0, s2
560; GCN3-NEXT:    v_mov_b32_e32 v1, s3
561; GCN3-NEXT:    flat_store_dword v[0:1], v2
562; GCN3-NEXT:    s_endpgm
563entry:
564  %ptr = getelementptr i32, ptr %out, i64 %index
565  %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
; Using %val is what makes this the returning variant.
566  store i32 %val, ptr %out2
567  ret void
568}
569
; Non-returning i32 atomic and (result unused) at a constant offset of
; 4 x i32 = 16 bytes from %out. The gfx900 run encodes the constant in
; the instruction (offset:16); the bonaire and tonga runs add it to the
; address with s_add_u32/s_addc_u32 before flat_atomic_and.
570define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) {
571; GCN1-LABEL: atomic_and_i32_offset:
572; GCN1:       ; %bb.0: ; %entry
573; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
574; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
575; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
576; GCN1-NEXT:    s_add_u32 s0, s0, 16
577; GCN1-NEXT:    s_addc_u32 s1, s1, 0
578; GCN1-NEXT:    v_mov_b32_e32 v0, s0
579; GCN1-NEXT:    v_mov_b32_e32 v1, s1
580; GCN1-NEXT:    v_mov_b32_e32 v2, s2
581; GCN1-NEXT:    flat_atomic_and v[0:1], v2
582; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
583; GCN1-NEXT:    buffer_wbinvl1_vol
584; GCN1-NEXT:    s_endpgm
585;
586; GCN2-LABEL: atomic_and_i32_offset:
587; GCN2:       ; %bb.0: ; %entry
588; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
589; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
590; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
591; GCN2-NEXT:    s_add_u32 s0, s0, 16
592; GCN2-NEXT:    s_addc_u32 s1, s1, 0
593; GCN2-NEXT:    v_mov_b32_e32 v0, s0
594; GCN2-NEXT:    v_mov_b32_e32 v1, s1
595; GCN2-NEXT:    v_mov_b32_e32 v2, s2
596; GCN2-NEXT:    flat_atomic_and v[0:1], v2
597; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
598; GCN2-NEXT:    buffer_wbinvl1_vol
599; GCN2-NEXT:    s_endpgm
600;
601; GCN3-LABEL: atomic_and_i32_offset:
602; GCN3:       ; %bb.0: ; %entry
603; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
604; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
605; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
606; GCN3-NEXT:    v_mov_b32_e32 v0, s0
607; GCN3-NEXT:    v_mov_b32_e32 v1, s1
608; GCN3-NEXT:    v_mov_b32_e32 v2, s2
609; GCN3-NEXT:    flat_atomic_and v[0:1], v2 offset:16
610; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
611; GCN3-NEXT:    buffer_wbinvl1_vol
612; GCN3-NEXT:    s_endpgm
613entry:
614  %gep = getelementptr i32, ptr %out, i32 4
615  %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
616  ret void
617}
618
; Returning i32 atomic and at a constant offset of 4 x i32 = 16 bytes: the
; value produced by the atomicrmw is stored to %out2. All three runs use
; the glc form of flat_atomic_and with a destination VGPR, then emit
; flat_store_dword. Only the gfx900 run encodes the offset in the
; instruction (offset:16).
619define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
620; GCN1-LABEL: atomic_and_i32_ret_offset:
621; GCN1:       ; %bb.0: ; %entry
622; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
623; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
624; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
625; GCN1-NEXT:    s_add_u32 s0, s0, 16
626; GCN1-NEXT:    s_addc_u32 s1, s1, 0
627; GCN1-NEXT:    v_mov_b32_e32 v0, s0
628; GCN1-NEXT:    v_mov_b32_e32 v1, s1
629; GCN1-NEXT:    v_mov_b32_e32 v2, s4
630; GCN1-NEXT:    flat_atomic_and v2, v[0:1], v2 glc
631; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
632; GCN1-NEXT:    buffer_wbinvl1_vol
633; GCN1-NEXT:    v_mov_b32_e32 v0, s2
634; GCN1-NEXT:    v_mov_b32_e32 v1, s3
635; GCN1-NEXT:    flat_store_dword v[0:1], v2
636; GCN1-NEXT:    s_endpgm
637;
638; GCN2-LABEL: atomic_and_i32_ret_offset:
639; GCN2:       ; %bb.0: ; %entry
640; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
641; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
642; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
643; GCN2-NEXT:    s_add_u32 s0, s0, 16
644; GCN2-NEXT:    s_addc_u32 s1, s1, 0
645; GCN2-NEXT:    v_mov_b32_e32 v0, s0
646; GCN2-NEXT:    v_mov_b32_e32 v1, s1
647; GCN2-NEXT:    v_mov_b32_e32 v2, s4
648; GCN2-NEXT:    flat_atomic_and v2, v[0:1], v2 glc
649; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
650; GCN2-NEXT:    buffer_wbinvl1_vol
651; GCN2-NEXT:    v_mov_b32_e32 v0, s2
652; GCN2-NEXT:    v_mov_b32_e32 v1, s3
653; GCN2-NEXT:    flat_store_dword v[0:1], v2
654; GCN2-NEXT:    s_endpgm
655;
656; GCN3-LABEL: atomic_and_i32_ret_offset:
657; GCN3:       ; %bb.0: ; %entry
658; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
659; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
660; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
661; GCN3-NEXT:    v_mov_b32_e32 v0, s0
662; GCN3-NEXT:    v_mov_b32_e32 v1, s1
663; GCN3-NEXT:    v_mov_b32_e32 v2, s6
664; GCN3-NEXT:    flat_atomic_and v2, v[0:1], v2 offset:16 glc
665; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
666; GCN3-NEXT:    buffer_wbinvl1_vol
667; GCN3-NEXT:    v_mov_b32_e32 v0, s2
668; GCN3-NEXT:    v_mov_b32_e32 v1, s3
669; GCN3-NEXT:    flat_store_dword v[0:1], v2
670; GCN3-NEXT:    s_endpgm
671entry:
672  %gep = getelementptr i32, ptr %out, i32 4
673  %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
; Using %val is what makes this the returning variant.
674  store i32 %val, ptr %out2
675  ret void
676}
677
; Non-returning i32 atomic and at %out + %index x i32 + 16 bytes, with a
; runtime i64 index. All runs scale the index with s_lshl_b64 by 2 and add
; it to the base in SGPRs. The gfx900 run encodes the constant 16 in the
; instruction (offset:16); the bonaire and tonga runs add it with a
; second s_add_u32/s_addc_u32 pair.
678define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
679; GCN1-LABEL: atomic_and_i32_addr64_offset:
680; GCN1:       ; %bb.0: ; %entry
681; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
682; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
683; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
684; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
685; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
686; GCN1-NEXT:    s_add_u32 s0, s2, s0
687; GCN1-NEXT:    s_addc_u32 s1, s3, s1
688; GCN1-NEXT:    s_add_u32 s0, s0, 16
689; GCN1-NEXT:    s_addc_u32 s1, s1, 0
690; GCN1-NEXT:    v_mov_b32_e32 v0, s0
691; GCN1-NEXT:    v_mov_b32_e32 v1, s1
692; GCN1-NEXT:    v_mov_b32_e32 v2, s4
693; GCN1-NEXT:    flat_atomic_and v[0:1], v2
694; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
695; GCN1-NEXT:    buffer_wbinvl1_vol
696; GCN1-NEXT:    s_endpgm
697;
698; GCN2-LABEL: atomic_and_i32_addr64_offset:
699; GCN2:       ; %bb.0: ; %entry
700; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
701; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
702; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
703; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
704; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
705; GCN2-NEXT:    s_add_u32 s0, s2, s0
706; GCN2-NEXT:    s_addc_u32 s1, s3, s1
707; GCN2-NEXT:    s_add_u32 s0, s0, 16
708; GCN2-NEXT:    s_addc_u32 s1, s1, 0
709; GCN2-NEXT:    v_mov_b32_e32 v0, s0
710; GCN2-NEXT:    v_mov_b32_e32 v1, s1
711; GCN2-NEXT:    v_mov_b32_e32 v2, s4
712; GCN2-NEXT:    flat_atomic_and v[0:1], v2
713; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
714; GCN2-NEXT:    buffer_wbinvl1_vol
715; GCN2-NEXT:    s_endpgm
716;
717; GCN3-LABEL: atomic_and_i32_addr64_offset:
718; GCN3:       ; %bb.0: ; %entry
719; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
720; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
721; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
722; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
723; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
724; GCN3-NEXT:    s_add_u32 s0, s2, s0
725; GCN3-NEXT:    s_addc_u32 s1, s3, s1
726; GCN3-NEXT:    v_mov_b32_e32 v0, s0
727; GCN3-NEXT:    v_mov_b32_e32 v1, s1
728; GCN3-NEXT:    v_mov_b32_e32 v2, s6
729; GCN3-NEXT:    flat_atomic_and v[0:1], v2 offset:16
730; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
731; GCN3-NEXT:    buffer_wbinvl1_vol
732; GCN3-NEXT:    s_endpgm
733entry:
; Variable part of the address first, then the constant 4-element offset.
734  %ptr = getelementptr i32, ptr %out, i64 %index
735  %gep = getelementptr i32, ptr %ptr, i32 4
736  %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
737  ret void
738}
739
; Returning i32 atomic and at %out + %index x i32 + 16 bytes; the value
; produced by the atomicrmw is stored to %out2. All runs scale the index
; with s_lshl_b64 by 2 and use the glc form of flat_atomic_and. Only the
; gfx900 run encodes the constant 16 in the instruction (offset:16).
740define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
741; GCN1-LABEL: atomic_and_i32_ret_addr64_offset:
742; GCN1:       ; %bb.0: ; %entry
743; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
744; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
745; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
746; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
747; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
748; GCN1-NEXT:    s_add_u32 s0, s0, s4
749; GCN1-NEXT:    s_addc_u32 s1, s1, s5
750; GCN1-NEXT:    s_add_u32 s0, s0, 16
751; GCN1-NEXT:    s_addc_u32 s1, s1, 0
752; GCN1-NEXT:    v_mov_b32_e32 v0, s0
753; GCN1-NEXT:    v_mov_b32_e32 v1, s1
754; GCN1-NEXT:    v_mov_b32_e32 v2, s8
755; GCN1-NEXT:    flat_atomic_and v2, v[0:1], v2 glc
756; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
757; GCN1-NEXT:    buffer_wbinvl1_vol
758; GCN1-NEXT:    v_mov_b32_e32 v0, s2
759; GCN1-NEXT:    v_mov_b32_e32 v1, s3
760; GCN1-NEXT:    flat_store_dword v[0:1], v2
761; GCN1-NEXT:    s_endpgm
762;
763; GCN2-LABEL: atomic_and_i32_ret_addr64_offset:
764; GCN2:       ; %bb.0: ; %entry
765; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
766; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
767; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
768; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
769; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
770; GCN2-NEXT:    s_add_u32 s0, s0, s4
771; GCN2-NEXT:    s_addc_u32 s1, s1, s5
772; GCN2-NEXT:    s_add_u32 s0, s0, 16
773; GCN2-NEXT:    s_addc_u32 s1, s1, 0
774; GCN2-NEXT:    v_mov_b32_e32 v0, s0
775; GCN2-NEXT:    v_mov_b32_e32 v1, s1
776; GCN2-NEXT:    v_mov_b32_e32 v2, s8
777; GCN2-NEXT:    flat_atomic_and v2, v[0:1], v2 glc
778; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
779; GCN2-NEXT:    buffer_wbinvl1_vol
780; GCN2-NEXT:    v_mov_b32_e32 v0, s2
781; GCN2-NEXT:    v_mov_b32_e32 v1, s3
782; GCN2-NEXT:    flat_store_dword v[0:1], v2
783; GCN2-NEXT:    s_endpgm
784;
785; GCN3-LABEL: atomic_and_i32_ret_addr64_offset:
786; GCN3:       ; %bb.0: ; %entry
787; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
788; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
789; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
790; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
791; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
792; GCN3-NEXT:    s_add_u32 s0, s0, s4
793; GCN3-NEXT:    s_addc_u32 s1, s1, s5
794; GCN3-NEXT:    v_mov_b32_e32 v0, s0
795; GCN3-NEXT:    v_mov_b32_e32 v1, s1
796; GCN3-NEXT:    v_mov_b32_e32 v2, s8
797; GCN3-NEXT:    flat_atomic_and v2, v[0:1], v2 offset:16 glc
798; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
799; GCN3-NEXT:    buffer_wbinvl1_vol
800; GCN3-NEXT:    v_mov_b32_e32 v0, s2
801; GCN3-NEXT:    v_mov_b32_e32 v1, s3
802; GCN3-NEXT:    flat_store_dword v[0:1], v2
803; GCN3-NEXT:    s_endpgm
804entry:
; Variable part of the address first, then the constant 4-element offset.
805  %ptr = getelementptr i32, ptr %out, i64 %index
806  %gep = getelementptr i32, ptr %ptr, i32 4
807  %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
; Using %val is what makes this the returning variant.
808  store i32 %val, ptr %out2
809  ret void
810}
811
; atomic_and_i32: agent-scope, seq_cst `atomicrmw and` applied directly to the
; flat pointer %out (no constant offset, no index) with operand %in.
; The result of the atomic is never used. All three run configurations
; (bonaire, tonga, gfx900) are expected to emit `flat_atomic_and v[0:1], v2`
; without the glc modifier, followed by `s_waitcnt vmcnt(0) lgkmcnt(0)` and
; `buffer_wbinvl1_vol`.
812define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) {
813; GCN1-LABEL: atomic_and_i32:
814; GCN1:       ; %bb.0: ; %entry
815; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
816; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
817; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
818; GCN1-NEXT:    v_mov_b32_e32 v0, s0
819; GCN1-NEXT:    v_mov_b32_e32 v1, s1
820; GCN1-NEXT:    v_mov_b32_e32 v2, s2
821; GCN1-NEXT:    flat_atomic_and v[0:1], v2
822; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
823; GCN1-NEXT:    buffer_wbinvl1_vol
824; GCN1-NEXT:    s_endpgm
825;
826; GCN2-LABEL: atomic_and_i32:
827; GCN2:       ; %bb.0: ; %entry
828; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
829; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
830; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
831; GCN2-NEXT:    v_mov_b32_e32 v0, s0
832; GCN2-NEXT:    v_mov_b32_e32 v1, s1
833; GCN2-NEXT:    v_mov_b32_e32 v2, s2
834; GCN2-NEXT:    flat_atomic_and v[0:1], v2
835; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
836; GCN2-NEXT:    buffer_wbinvl1_vol
837; GCN2-NEXT:    s_endpgm
838;
839; GCN3-LABEL: atomic_and_i32:
840; GCN3:       ; %bb.0: ; %entry
841; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
842; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
843; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
844; GCN3-NEXT:    v_mov_b32_e32 v0, s0
845; GCN3-NEXT:    v_mov_b32_e32 v1, s1
846; GCN3-NEXT:    v_mov_b32_e32 v2, s2
847; GCN3-NEXT:    flat_atomic_and v[0:1], v2
848; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
849; GCN3-NEXT:    buffer_wbinvl1_vol
850; GCN3-NEXT:    s_endpgm
851entry:
  ; %val has no users in this function; the expected output above uses the
  ; encoding without glc.
852  %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
853  ret void
854}
855
; atomic_and_i32_ret: agent-scope, seq_cst `atomicrmw and` on the flat pointer
; %out (no constant offset, no index) whose result is stored through %out2.
; All three run configurations (bonaire, tonga, gfx900) are expected to emit
; the returning form `flat_atomic_and v2, v[0:1], v2 glc`, then
; `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol`, and finally a
; `flat_store_dword` of v2 to the address taken from s[2:3].
856define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) {
857; GCN1-LABEL: atomic_and_i32_ret:
858; GCN1:       ; %bb.0: ; %entry
859; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
860; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
861; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
862; GCN1-NEXT:    v_mov_b32_e32 v0, s0
863; GCN1-NEXT:    v_mov_b32_e32 v1, s1
864; GCN1-NEXT:    v_mov_b32_e32 v2, s4
865; GCN1-NEXT:    flat_atomic_and v2, v[0:1], v2 glc
866; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
867; GCN1-NEXT:    buffer_wbinvl1_vol
868; GCN1-NEXT:    v_mov_b32_e32 v0, s2
869; GCN1-NEXT:    v_mov_b32_e32 v1, s3
870; GCN1-NEXT:    flat_store_dword v[0:1], v2
871; GCN1-NEXT:    s_endpgm
872;
873; GCN2-LABEL: atomic_and_i32_ret:
874; GCN2:       ; %bb.0: ; %entry
875; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
876; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
877; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
878; GCN2-NEXT:    v_mov_b32_e32 v0, s0
879; GCN2-NEXT:    v_mov_b32_e32 v1, s1
880; GCN2-NEXT:    v_mov_b32_e32 v2, s4
881; GCN2-NEXT:    flat_atomic_and v2, v[0:1], v2 glc
882; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
883; GCN2-NEXT:    buffer_wbinvl1_vol
884; GCN2-NEXT:    v_mov_b32_e32 v0, s2
885; GCN2-NEXT:    v_mov_b32_e32 v1, s3
886; GCN2-NEXT:    flat_store_dword v[0:1], v2
887; GCN2-NEXT:    s_endpgm
888;
889; GCN3-LABEL: atomic_and_i32_ret:
890; GCN3:       ; %bb.0: ; %entry
891; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
892; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
893; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
894; GCN3-NEXT:    v_mov_b32_e32 v0, s0
895; GCN3-NEXT:    v_mov_b32_e32 v1, s1
896; GCN3-NEXT:    v_mov_b32_e32 v2, s6
897; GCN3-NEXT:    flat_atomic_and v2, v[0:1], v2 glc
898; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
899; GCN3-NEXT:    buffer_wbinvl1_vol
900; GCN3-NEXT:    v_mov_b32_e32 v0, s2
901; GCN3-NEXT:    v_mov_b32_e32 v1, s3
902; GCN3-NEXT:    flat_store_dword v[0:1], v2
903; GCN3-NEXT:    s_endpgm
904entry:
905  %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
  ; Storing %val gives the atomic result a user; the expected output above
  ; uses the glc encoding that writes the result to v2.
906  store i32 %val, ptr %out2
907  ret void
908}
909
; atomic_and_i32_addr64: agent-scope, seq_cst `atomicrmw and` on the address
; %out indexed by the 64-bit %index (i32 elements); the result is never used.
; All three run configurations (bonaire, tonga, gfx900) are expected to form
; the address on the scalar unit (`s_lshl_b64 ..., 2` followed by
; `s_add_u32` / `s_addc_u32`) and then emit `flat_atomic_and v[0:1], v2`
; without glc, `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol`.
910define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) {
911; GCN1-LABEL: atomic_and_i32_addr64:
912; GCN1:       ; %bb.0: ; %entry
913; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
914; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
915; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
916; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
917; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
918; GCN1-NEXT:    s_add_u32 s0, s2, s0
919; GCN1-NEXT:    s_addc_u32 s1, s3, s1
920; GCN1-NEXT:    v_mov_b32_e32 v0, s0
921; GCN1-NEXT:    v_mov_b32_e32 v1, s1
922; GCN1-NEXT:    v_mov_b32_e32 v2, s4
923; GCN1-NEXT:    flat_atomic_and v[0:1], v2
924; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
925; GCN1-NEXT:    buffer_wbinvl1_vol
926; GCN1-NEXT:    s_endpgm
927;
928; GCN2-LABEL: atomic_and_i32_addr64:
929; GCN2:       ; %bb.0: ; %entry
930; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
931; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
932; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
933; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
934; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
935; GCN2-NEXT:    s_add_u32 s0, s2, s0
936; GCN2-NEXT:    s_addc_u32 s1, s3, s1
937; GCN2-NEXT:    v_mov_b32_e32 v0, s0
938; GCN2-NEXT:    v_mov_b32_e32 v1, s1
939; GCN2-NEXT:    v_mov_b32_e32 v2, s4
940; GCN2-NEXT:    flat_atomic_and v[0:1], v2
941; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
942; GCN2-NEXT:    buffer_wbinvl1_vol
943; GCN2-NEXT:    s_endpgm
944;
945; GCN3-LABEL: atomic_and_i32_addr64:
946; GCN3:       ; %bb.0: ; %entry
947; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
948; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
949; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
950; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
951; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
952; GCN3-NEXT:    s_add_u32 s0, s2, s0
953; GCN3-NEXT:    s_addc_u32 s1, s3, s1
954; GCN3-NEXT:    v_mov_b32_e32 v0, s0
955; GCN3-NEXT:    v_mov_b32_e32 v1, s1
956; GCN3-NEXT:    v_mov_b32_e32 v2, s6
957; GCN3-NEXT:    flat_atomic_and v[0:1], v2
958; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
959; GCN3-NEXT:    buffer_wbinvl1_vol
960; GCN3-NEXT:    s_endpgm
961entry:
  ; Variable index in i32 elements; corresponds to the shift-left-by-2 and
  ; 64-bit add in the expected output above.
962  %ptr = getelementptr i32, ptr %out, i64 %index
963  %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
964  ret void
965}
966
; atomic_and_i32_ret_addr64: agent-scope, seq_cst `atomicrmw and` on the
; address %out indexed by the 64-bit %index (i32 elements); the result is
; stored through %out2.
; All three run configurations (bonaire, tonga, gfx900) are expected to form
; the address with `s_lshl_b64 ..., 2` plus `s_add_u32` / `s_addc_u32`, emit
; the returning form `flat_atomic_and v2, v[0:1], v2 glc`, then
; `s_waitcnt vmcnt(0) lgkmcnt(0)`, `buffer_wbinvl1_vol`, and a
; `flat_store_dword` of v2 to the address taken from s[2:3].
967define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
968; GCN1-LABEL: atomic_and_i32_ret_addr64:
969; GCN1:       ; %bb.0: ; %entry
970; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
971; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
972; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
973; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
974; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
975; GCN1-NEXT:    s_add_u32 s0, s0, s4
976; GCN1-NEXT:    s_addc_u32 s1, s1, s5
977; GCN1-NEXT:    v_mov_b32_e32 v0, s0
978; GCN1-NEXT:    v_mov_b32_e32 v1, s1
979; GCN1-NEXT:    v_mov_b32_e32 v2, s8
980; GCN1-NEXT:    flat_atomic_and v2, v[0:1], v2 glc
981; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
982; GCN1-NEXT:    buffer_wbinvl1_vol
983; GCN1-NEXT:    v_mov_b32_e32 v0, s2
984; GCN1-NEXT:    v_mov_b32_e32 v1, s3
985; GCN1-NEXT:    flat_store_dword v[0:1], v2
986; GCN1-NEXT:    s_endpgm
987;
988; GCN2-LABEL: atomic_and_i32_ret_addr64:
989; GCN2:       ; %bb.0: ; %entry
990; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
991; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
992; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
993; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
994; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
995; GCN2-NEXT:    s_add_u32 s0, s0, s4
996; GCN2-NEXT:    s_addc_u32 s1, s1, s5
997; GCN2-NEXT:    v_mov_b32_e32 v0, s0
998; GCN2-NEXT:    v_mov_b32_e32 v1, s1
999; GCN2-NEXT:    v_mov_b32_e32 v2, s8
1000; GCN2-NEXT:    flat_atomic_and v2, v[0:1], v2 glc
1001; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1002; GCN2-NEXT:    buffer_wbinvl1_vol
1003; GCN2-NEXT:    v_mov_b32_e32 v0, s2
1004; GCN2-NEXT:    v_mov_b32_e32 v1, s3
1005; GCN2-NEXT:    flat_store_dword v[0:1], v2
1006; GCN2-NEXT:    s_endpgm
1007;
1008; GCN3-LABEL: atomic_and_i32_ret_addr64:
1009; GCN3:       ; %bb.0: ; %entry
1010; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
1011; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1012; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
1013; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1014; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1015; GCN3-NEXT:    s_add_u32 s0, s0, s4
1016; GCN3-NEXT:    s_addc_u32 s1, s1, s5
1017; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1018; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1019; GCN3-NEXT:    v_mov_b32_e32 v2, s8
1020; GCN3-NEXT:    flat_atomic_and v2, v[0:1], v2 glc
1021; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1022; GCN3-NEXT:    buffer_wbinvl1_vol
1023; GCN3-NEXT:    v_mov_b32_e32 v0, s2
1024; GCN3-NEXT:    v_mov_b32_e32 v1, s3
1025; GCN3-NEXT:    flat_store_dword v[0:1], v2
1026; GCN3-NEXT:    s_endpgm
1027entry:
  ; Variable index in i32 elements; corresponds to the shift-left-by-2 and
  ; 64-bit add in the expected output above.
1028  %ptr = getelementptr i32, ptr %out, i64 %index
1029  %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
1030  store i32 %val, ptr %out2
1031  ret void
1032}
1033
; atomic_sub_i32_offset: agent-scope, seq_cst `atomicrmw sub` at a constant
; offset of 4 i32 elements (16 bytes) from %out; the result is never used.
; Expected output differs by target:
;   - bonaire and tonga add the 16 to the address explicitly
;     (`s_add_u32 s0, s0, 16` / `s_addc_u32 s1, s1, 0`) and emit
;     `flat_atomic_sub v[0:1], v2`;
;   - gfx900 leaves the base untouched and emits
;     `flat_atomic_sub v[0:1], v2 offset:16`.
; In every case the atomic is followed by `s_waitcnt vmcnt(0) lgkmcnt(0)` and
; `buffer_wbinvl1_vol`.
1034define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) {
1035; GCN1-LABEL: atomic_sub_i32_offset:
1036; GCN1:       ; %bb.0: ; %entry
1037; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1038; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
1039; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1040; GCN1-NEXT:    s_add_u32 s0, s0, 16
1041; GCN1-NEXT:    s_addc_u32 s1, s1, 0
1042; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1043; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1044; GCN1-NEXT:    v_mov_b32_e32 v2, s2
1045; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
1046; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1047; GCN1-NEXT:    buffer_wbinvl1_vol
1048; GCN1-NEXT:    s_endpgm
1049;
1050; GCN2-LABEL: atomic_sub_i32_offset:
1051; GCN2:       ; %bb.0: ; %entry
1052; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1053; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
1054; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1055; GCN2-NEXT:    s_add_u32 s0, s0, 16
1056; GCN2-NEXT:    s_addc_u32 s1, s1, 0
1057; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1058; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1059; GCN2-NEXT:    v_mov_b32_e32 v2, s2
1060; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
1061; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1062; GCN2-NEXT:    buffer_wbinvl1_vol
1063; GCN2-NEXT:    s_endpgm
1064;
1065; GCN3-LABEL: atomic_sub_i32_offset:
1066; GCN3:       ; %bb.0: ; %entry
1067; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1068; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
1069; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1070; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1071; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1072; GCN3-NEXT:    v_mov_b32_e32 v2, s2
1073; GCN3-NEXT:    flat_atomic_sub v[0:1], v2 offset:16
1074; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1075; GCN3-NEXT:    buffer_wbinvl1_vol
1076; GCN3-NEXT:    s_endpgm
1077entry:
  ; 4 i32 elements past %out, i.e. the 16 that appears in the expected output.
1078  %gep = getelementptr i32, ptr %out, i32 4
1079  %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
1080  ret void
1081}
1082
; atomic_sub_i32_ret_offset: agent-scope, seq_cst `atomicrmw sub` at a constant
; offset of 4 i32 elements (16 bytes) from %out; the result is stored through
; %out2.
; Expected output differs by target:
;   - bonaire and tonga add the 16 to the address explicitly
;     (`s_add_u32 s0, s0, 16` / `s_addc_u32 s1, s1, 0`) and emit
;     `flat_atomic_sub v2, v[0:1], v2 glc`;
;   - gfx900 emits `flat_atomic_sub v2, v[0:1], v2 offset:16 glc`.
; In every case the atomic is followed by `s_waitcnt vmcnt(0) lgkmcnt(0)`,
; `buffer_wbinvl1_vol`, and a `flat_store_dword` of v2 to the address taken
; from s[2:3].
1083define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
1084; GCN1-LABEL: atomic_sub_i32_ret_offset:
1085; GCN1:       ; %bb.0: ; %entry
1086; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1087; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
1088; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1089; GCN1-NEXT:    s_add_u32 s0, s0, 16
1090; GCN1-NEXT:    s_addc_u32 s1, s1, 0
1091; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1092; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1093; GCN1-NEXT:    v_mov_b32_e32 v2, s4
1094; GCN1-NEXT:    flat_atomic_sub v2, v[0:1], v2 glc
1095; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1096; GCN1-NEXT:    buffer_wbinvl1_vol
1097; GCN1-NEXT:    v_mov_b32_e32 v0, s2
1098; GCN1-NEXT:    v_mov_b32_e32 v1, s3
1099; GCN1-NEXT:    flat_store_dword v[0:1], v2
1100; GCN1-NEXT:    s_endpgm
1101;
1102; GCN2-LABEL: atomic_sub_i32_ret_offset:
1103; GCN2:       ; %bb.0: ; %entry
1104; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1105; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
1106; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1107; GCN2-NEXT:    s_add_u32 s0, s0, 16
1108; GCN2-NEXT:    s_addc_u32 s1, s1, 0
1109; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1110; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1111; GCN2-NEXT:    v_mov_b32_e32 v2, s4
1112; GCN2-NEXT:    flat_atomic_sub v2, v[0:1], v2 glc
1113; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1114; GCN2-NEXT:    buffer_wbinvl1_vol
1115; GCN2-NEXT:    v_mov_b32_e32 v0, s2
1116; GCN2-NEXT:    v_mov_b32_e32 v1, s3
1117; GCN2-NEXT:    flat_store_dword v[0:1], v2
1118; GCN2-NEXT:    s_endpgm
1119;
1120; GCN3-LABEL: atomic_sub_i32_ret_offset:
1121; GCN3:       ; %bb.0: ; %entry
1122; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1123; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
1124; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1125; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1126; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1127; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1128; GCN3-NEXT:    flat_atomic_sub v2, v[0:1], v2 offset:16 glc
1129; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1130; GCN3-NEXT:    buffer_wbinvl1_vol
1131; GCN3-NEXT:    v_mov_b32_e32 v0, s2
1132; GCN3-NEXT:    v_mov_b32_e32 v1, s3
1133; GCN3-NEXT:    flat_store_dword v[0:1], v2
1134; GCN3-NEXT:    s_endpgm
1135entry:
  ; 4 i32 elements past %out, i.e. the 16 that appears in the expected output.
1136  %gep = getelementptr i32, ptr %out, i32 4
1137  %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
1138  store i32 %val, ptr %out2
1139  ret void
1140}
1141
; atomic_sub_i32_addr64_offset: agent-scope, seq_cst `atomicrmw sub` on the
; address %out indexed by the 64-bit %index (i32 elements) plus a further
; constant 4 elements (16 bytes); the result is never used.
; All three run configurations are expected to compute the indexed address
; with `s_lshl_b64 ..., 2` plus `s_add_u32` / `s_addc_u32`. After that:
;   - bonaire and tonga add the 16 explicitly
;     (`s_add_u32 s0, s0, 16` / `s_addc_u32 s1, s1, 0`) and emit
;     `flat_atomic_sub v[0:1], v2`;
;   - gfx900 emits `flat_atomic_sub v[0:1], v2 offset:16`.
; In every case the atomic is followed by `s_waitcnt vmcnt(0) lgkmcnt(0)` and
; `buffer_wbinvl1_vol`.
1142define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
1143; GCN1-LABEL: atomic_sub_i32_addr64_offset:
1144; GCN1:       ; %bb.0: ; %entry
1145; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1146; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
1147; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
1148; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1149; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1150; GCN1-NEXT:    s_add_u32 s0, s2, s0
1151; GCN1-NEXT:    s_addc_u32 s1, s3, s1
1152; GCN1-NEXT:    s_add_u32 s0, s0, 16
1153; GCN1-NEXT:    s_addc_u32 s1, s1, 0
1154; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1155; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1156; GCN1-NEXT:    v_mov_b32_e32 v2, s4
1157; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
1158; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1159; GCN1-NEXT:    buffer_wbinvl1_vol
1160; GCN1-NEXT:    s_endpgm
1161;
1162; GCN2-LABEL: atomic_sub_i32_addr64_offset:
1163; GCN2:       ; %bb.0: ; %entry
1164; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1165; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1166; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
1167; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1168; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1169; GCN2-NEXT:    s_add_u32 s0, s2, s0
1170; GCN2-NEXT:    s_addc_u32 s1, s3, s1
1171; GCN2-NEXT:    s_add_u32 s0, s0, 16
1172; GCN2-NEXT:    s_addc_u32 s1, s1, 0
1173; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1174; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1175; GCN2-NEXT:    v_mov_b32_e32 v2, s4
1176; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
1177; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1178; GCN2-NEXT:    buffer_wbinvl1_vol
1179; GCN2-NEXT:    s_endpgm
1180;
1181; GCN3-LABEL: atomic_sub_i32_addr64_offset:
1182; GCN3:       ; %bb.0: ; %entry
1183; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1184; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1185; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
1186; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1187; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1188; GCN3-NEXT:    s_add_u32 s0, s2, s0
1189; GCN3-NEXT:    s_addc_u32 s1, s3, s1
1190; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1191; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1192; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1193; GCN3-NEXT:    flat_atomic_sub v[0:1], v2 offset:16
1194; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1195; GCN3-NEXT:    buffer_wbinvl1_vol
1196; GCN3-NEXT:    s_endpgm
1197entry:
  ; Variable index first, then a constant 4 elements (16 bytes) on top of it.
1198  %ptr = getelementptr i32, ptr %out, i64 %index
1199  %gep = getelementptr i32, ptr %ptr, i32 4
1200  %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
1201  ret void
1202}
1203
; atomic_sub_i32_ret_addr64_offset: agent-scope, seq_cst `atomicrmw sub` on the
; address %out indexed by the 64-bit %index (i32 elements) plus a further
; constant 4 elements (16 bytes); the result is stored through %out2.
; All three run configurations are expected to compute the indexed address
; with `s_lshl_b64 ..., 2` plus `s_add_u32` / `s_addc_u32`. After that:
;   - bonaire and tonga add the 16 explicitly
;     (`s_add_u32 s0, s0, 16` / `s_addc_u32 s1, s1, 0`) and emit
;     `flat_atomic_sub v2, v[0:1], v2 glc`;
;   - gfx900 emits `flat_atomic_sub v2, v[0:1], v2 offset:16 glc`.
; In every case the atomic is followed by `s_waitcnt vmcnt(0) lgkmcnt(0)`,
; `buffer_wbinvl1_vol`, and a `flat_store_dword` of v2 to the address taken
; from s[2:3].
1204define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
1205; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset:
1206; GCN1:       ; %bb.0: ; %entry
1207; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
1208; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1209; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
1210; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1211; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1212; GCN1-NEXT:    s_add_u32 s0, s0, s4
1213; GCN1-NEXT:    s_addc_u32 s1, s1, s5
1214; GCN1-NEXT:    s_add_u32 s0, s0, 16
1215; GCN1-NEXT:    s_addc_u32 s1, s1, 0
1216; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1217; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1218; GCN1-NEXT:    v_mov_b32_e32 v2, s8
1219; GCN1-NEXT:    flat_atomic_sub v2, v[0:1], v2 glc
1220; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1221; GCN1-NEXT:    buffer_wbinvl1_vol
1222; GCN1-NEXT:    v_mov_b32_e32 v0, s2
1223; GCN1-NEXT:    v_mov_b32_e32 v1, s3
1224; GCN1-NEXT:    flat_store_dword v[0:1], v2
1225; GCN1-NEXT:    s_endpgm
1226;
1227; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset:
1228; GCN2:       ; %bb.0: ; %entry
1229; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
1230; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1231; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
1232; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1233; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1234; GCN2-NEXT:    s_add_u32 s0, s0, s4
1235; GCN2-NEXT:    s_addc_u32 s1, s1, s5
1236; GCN2-NEXT:    s_add_u32 s0, s0, 16
1237; GCN2-NEXT:    s_addc_u32 s1, s1, 0
1238; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1239; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1240; GCN2-NEXT:    v_mov_b32_e32 v2, s8
1241; GCN2-NEXT:    flat_atomic_sub v2, v[0:1], v2 glc
1242; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1243; GCN2-NEXT:    buffer_wbinvl1_vol
1244; GCN2-NEXT:    v_mov_b32_e32 v0, s2
1245; GCN2-NEXT:    v_mov_b32_e32 v1, s3
1246; GCN2-NEXT:    flat_store_dword v[0:1], v2
1247; GCN2-NEXT:    s_endpgm
1248;
1249; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset:
1250; GCN3:       ; %bb.0: ; %entry
1251; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
1252; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1253; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
1254; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1255; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1256; GCN3-NEXT:    s_add_u32 s0, s0, s4
1257; GCN3-NEXT:    s_addc_u32 s1, s1, s5
1258; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1259; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1260; GCN3-NEXT:    v_mov_b32_e32 v2, s8
1261; GCN3-NEXT:    flat_atomic_sub v2, v[0:1], v2 offset:16 glc
1262; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1263; GCN3-NEXT:    buffer_wbinvl1_vol
1264; GCN3-NEXT:    v_mov_b32_e32 v0, s2
1265; GCN3-NEXT:    v_mov_b32_e32 v1, s3
1266; GCN3-NEXT:    flat_store_dword v[0:1], v2
1267; GCN3-NEXT:    s_endpgm
1268entry:
  ; Variable index first, then a constant 4 elements (16 bytes) on top of it.
1269  %ptr = getelementptr i32, ptr %out, i64 %index
1270  %gep = getelementptr i32, ptr %ptr, i32 4
1271  %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
1272  store i32 %val, ptr %out2
1273  ret void
1274}
1275
; atomic_sub_i32: agent-scope, seq_cst `atomicrmw sub` applied directly to the
; flat pointer %out (no constant offset, no index) with operand %in.
; The result of the atomic is never used. All three run configurations
; (bonaire, tonga, gfx900) are expected to emit `flat_atomic_sub v[0:1], v2`
; without the glc modifier, followed by `s_waitcnt vmcnt(0) lgkmcnt(0)` and
; `buffer_wbinvl1_vol`.
1276define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) {
1277; GCN1-LABEL: atomic_sub_i32:
1278; GCN1:       ; %bb.0: ; %entry
1279; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1280; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
1281; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1282; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1283; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1284; GCN1-NEXT:    v_mov_b32_e32 v2, s2
1285; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
1286; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1287; GCN1-NEXT:    buffer_wbinvl1_vol
1288; GCN1-NEXT:    s_endpgm
1289;
1290; GCN2-LABEL: atomic_sub_i32:
1291; GCN2:       ; %bb.0: ; %entry
1292; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1293; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
1294; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1295; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1296; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1297; GCN2-NEXT:    v_mov_b32_e32 v2, s2
1298; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
1299; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1300; GCN2-NEXT:    buffer_wbinvl1_vol
1301; GCN2-NEXT:    s_endpgm
1302;
1303; GCN3-LABEL: atomic_sub_i32:
1304; GCN3:       ; %bb.0: ; %entry
1305; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1306; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
1307; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1308; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1309; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1310; GCN3-NEXT:    v_mov_b32_e32 v2, s2
1311; GCN3-NEXT:    flat_atomic_sub v[0:1], v2
1312; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1313; GCN3-NEXT:    buffer_wbinvl1_vol
1314; GCN3-NEXT:    s_endpgm
1315entry:
  ; %val has no users in this function; the expected output above uses the
  ; encoding without glc.
1316  %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
1317  ret void
1318}
1319
; atomic_sub_i32_ret: agent-scope, seq_cst `atomicrmw sub` on the flat pointer
; %out (no constant offset, no index) whose result is stored through %out2.
; All three run configurations (bonaire, tonga, gfx900) are expected to emit
; the returning form `flat_atomic_sub v2, v[0:1], v2 glc`, then
; `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol`, and finally a
; `flat_store_dword` of v2 to the address taken from s[2:3].
1320define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) {
1321; GCN1-LABEL: atomic_sub_i32_ret:
1322; GCN1:       ; %bb.0: ; %entry
1323; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1324; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
1325; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1326; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1327; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1328; GCN1-NEXT:    v_mov_b32_e32 v2, s4
1329; GCN1-NEXT:    flat_atomic_sub v2, v[0:1], v2 glc
1330; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1331; GCN1-NEXT:    buffer_wbinvl1_vol
1332; GCN1-NEXT:    v_mov_b32_e32 v0, s2
1333; GCN1-NEXT:    v_mov_b32_e32 v1, s3
1334; GCN1-NEXT:    flat_store_dword v[0:1], v2
1335; GCN1-NEXT:    s_endpgm
1336;
1337; GCN2-LABEL: atomic_sub_i32_ret:
1338; GCN2:       ; %bb.0: ; %entry
1339; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1340; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
1341; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1342; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1343; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1344; GCN2-NEXT:    v_mov_b32_e32 v2, s4
1345; GCN2-NEXT:    flat_atomic_sub v2, v[0:1], v2 glc
1346; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1347; GCN2-NEXT:    buffer_wbinvl1_vol
1348; GCN2-NEXT:    v_mov_b32_e32 v0, s2
1349; GCN2-NEXT:    v_mov_b32_e32 v1, s3
1350; GCN2-NEXT:    flat_store_dword v[0:1], v2
1351; GCN2-NEXT:    s_endpgm
1352;
1353; GCN3-LABEL: atomic_sub_i32_ret:
1354; GCN3:       ; %bb.0: ; %entry
1355; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1356; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
1357; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1358; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1359; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1360; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1361; GCN3-NEXT:    flat_atomic_sub v2, v[0:1], v2 glc
1362; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1363; GCN3-NEXT:    buffer_wbinvl1_vol
1364; GCN3-NEXT:    v_mov_b32_e32 v0, s2
1365; GCN3-NEXT:    v_mov_b32_e32 v1, s3
1366; GCN3-NEXT:    flat_store_dword v[0:1], v2
1367; GCN3-NEXT:    s_endpgm
1368entry:
1369  %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
  ; Storing %val gives the atomic result a user; the expected output above
  ; uses the glc encoding that writes the result to v2.
1370  store i32 %val, ptr %out2
1371  ret void
1372}
1373
; atomic_sub_i32_addr64: agent-scope, seq_cst `atomicrmw sub` on the address
; %out indexed by the 64-bit %index (i32 elements); the result is never used.
; All three run configurations (bonaire, tonga, gfx900) are expected to form
; the address on the scalar unit (`s_lshl_b64 ..., 2` followed by
; `s_add_u32` / `s_addc_u32`) and then emit `flat_atomic_sub v[0:1], v2`
; without glc, `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol`.
1374define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) {
1375; GCN1-LABEL: atomic_sub_i32_addr64:
1376; GCN1:       ; %bb.0: ; %entry
1377; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1378; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
1379; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
1380; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1381; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1382; GCN1-NEXT:    s_add_u32 s0, s2, s0
1383; GCN1-NEXT:    s_addc_u32 s1, s3, s1
1384; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1385; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1386; GCN1-NEXT:    v_mov_b32_e32 v2, s4
1387; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
1388; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1389; GCN1-NEXT:    buffer_wbinvl1_vol
1390; GCN1-NEXT:    s_endpgm
1391;
1392; GCN2-LABEL: atomic_sub_i32_addr64:
1393; GCN2:       ; %bb.0: ; %entry
1394; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1395; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1396; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
1397; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1398; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1399; GCN2-NEXT:    s_add_u32 s0, s2, s0
1400; GCN2-NEXT:    s_addc_u32 s1, s3, s1
1401; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1402; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1403; GCN2-NEXT:    v_mov_b32_e32 v2, s4
1404; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
1405; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1406; GCN2-NEXT:    buffer_wbinvl1_vol
1407; GCN2-NEXT:    s_endpgm
1408;
1409; GCN3-LABEL: atomic_sub_i32_addr64:
1410; GCN3:       ; %bb.0: ; %entry
1411; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1412; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1413; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
1414; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1415; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1416; GCN3-NEXT:    s_add_u32 s0, s2, s0
1417; GCN3-NEXT:    s_addc_u32 s1, s3, s1
1418; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1419; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1420; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1421; GCN3-NEXT:    flat_atomic_sub v[0:1], v2
1422; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1423; GCN3-NEXT:    buffer_wbinvl1_vol
1424; GCN3-NEXT:    s_endpgm
1425entry:
  ; Variable index in i32 elements; corresponds to the shift-left-by-2 and
  ; 64-bit add in the expected output above.
1426  %ptr = getelementptr i32, ptr %out, i64 %index
1427  %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
1428  ret void
1429}
1430
; atomic_sub_i32_ret_addr64: agent-scope, seq_cst `atomicrmw sub` on the
; address %out indexed by the 64-bit %index (i32 elements); the result is
; stored through %out2.
; All three run configurations (bonaire, tonga, gfx900) are expected to form
; the address with `s_lshl_b64 ..., 2` plus `s_add_u32` / `s_addc_u32`, emit
; the returning form `flat_atomic_sub v2, v[0:1], v2 glc`, then
; `s_waitcnt vmcnt(0) lgkmcnt(0)`, `buffer_wbinvl1_vol`, and a
; `flat_store_dword` of v2 to the address taken from s[2:3].
1431define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
1432; GCN1-LABEL: atomic_sub_i32_ret_addr64:
1433; GCN1:       ; %bb.0: ; %entry
1434; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
1435; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1436; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
1437; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1438; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1439; GCN1-NEXT:    s_add_u32 s0, s0, s4
1440; GCN1-NEXT:    s_addc_u32 s1, s1, s5
1441; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1442; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1443; GCN1-NEXT:    v_mov_b32_e32 v2, s8
1444; GCN1-NEXT:    flat_atomic_sub v2, v[0:1], v2 glc
1445; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1446; GCN1-NEXT:    buffer_wbinvl1_vol
1447; GCN1-NEXT:    v_mov_b32_e32 v0, s2
1448; GCN1-NEXT:    v_mov_b32_e32 v1, s3
1449; GCN1-NEXT:    flat_store_dword v[0:1], v2
1450; GCN1-NEXT:    s_endpgm
1451;
1452; GCN2-LABEL: atomic_sub_i32_ret_addr64:
1453; GCN2:       ; %bb.0: ; %entry
1454; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
1455; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1456; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
1457; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1458; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1459; GCN2-NEXT:    s_add_u32 s0, s0, s4
1460; GCN2-NEXT:    s_addc_u32 s1, s1, s5
1461; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1462; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1463; GCN2-NEXT:    v_mov_b32_e32 v2, s8
1464; GCN2-NEXT:    flat_atomic_sub v2, v[0:1], v2 glc
1465; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1466; GCN2-NEXT:    buffer_wbinvl1_vol
1467; GCN2-NEXT:    v_mov_b32_e32 v0, s2
1468; GCN2-NEXT:    v_mov_b32_e32 v1, s3
1469; GCN2-NEXT:    flat_store_dword v[0:1], v2
1470; GCN2-NEXT:    s_endpgm
1471;
1472; GCN3-LABEL: atomic_sub_i32_ret_addr64:
1473; GCN3:       ; %bb.0: ; %entry
1474; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
1475; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1476; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
1477; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1478; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1479; GCN3-NEXT:    s_add_u32 s0, s0, s4
1480; GCN3-NEXT:    s_addc_u32 s1, s1, s5
1481; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1482; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1483; GCN3-NEXT:    v_mov_b32_e32 v2, s8
1484; GCN3-NEXT:    flat_atomic_sub v2, v[0:1], v2 glc
1485; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1486; GCN3-NEXT:    buffer_wbinvl1_vol
1487; GCN3-NEXT:    v_mov_b32_e32 v0, s2
1488; GCN3-NEXT:    v_mov_b32_e32 v1, s3
1489; GCN3-NEXT:    flat_store_dword v[0:1], v2
1490; GCN3-NEXT:    s_endpgm
1491entry:
  ; Variable index in i32 elements; corresponds to the shift-left-by-2 and
  ; 64-bit add in the expected output above.
1492  %ptr = getelementptr i32, ptr %out, i64 %index
1493  %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
1494  store i32 %val, ptr %out2
1495  ret void
1496}
1497
; atomic_max_i32_offset: workgroup-scope, seq_cst `atomicrmw max` (signed) at a
; constant offset of 4 i32 elements (16 bytes) from %out; the result is never
; used.
; Expected output differs by target:
;   - bonaire and tonga add the 16 to the address explicitly
;     (`s_add_u32 s0, s0, 16` / `s_addc_u32 s1, s1, 0`) and emit
;     `flat_atomic_smax v[0:1], v2`;
;   - gfx900 emits `flat_atomic_smax v[0:1], v2 offset:16`.
; With this syncscope the expected sequence after the atomic is only
; `s_waitcnt lgkmcnt(0)` before `s_endpgm`; no `buffer_wbinvl1_vol` is
; expected.
1498define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
1499; GCN1-LABEL: atomic_max_i32_offset:
1500; GCN1:       ; %bb.0: ; %entry
1501; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1502; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
1503; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1504; GCN1-NEXT:    s_add_u32 s0, s0, 16
1505; GCN1-NEXT:    s_addc_u32 s1, s1, 0
1506; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1507; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1508; GCN1-NEXT:    v_mov_b32_e32 v2, s2
1509; GCN1-NEXT:    flat_atomic_smax v[0:1], v2
1510; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1511; GCN1-NEXT:    s_endpgm
1512;
1513; GCN2-LABEL: atomic_max_i32_offset:
1514; GCN2:       ; %bb.0: ; %entry
1515; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1516; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
1517; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1518; GCN2-NEXT:    s_add_u32 s0, s0, 16
1519; GCN2-NEXT:    s_addc_u32 s1, s1, 0
1520; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1521; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1522; GCN2-NEXT:    v_mov_b32_e32 v2, s2
1523; GCN2-NEXT:    flat_atomic_smax v[0:1], v2
1524; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1525; GCN2-NEXT:    s_endpgm
1526;
1527; GCN3-LABEL: atomic_max_i32_offset:
1528; GCN3:       ; %bb.0: ; %entry
1529; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1530; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
1531; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1532; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1533; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1534; GCN3-NEXT:    v_mov_b32_e32 v2, s2
1535; GCN3-NEXT:    flat_atomic_smax v[0:1], v2 offset:16
1536; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1537; GCN3-NEXT:    s_endpgm
1538entry:
  ; 4 i32 elements past %out, i.e. the 16 that appears in the expected output.
1539  %gep = getelementptr i32, ptr %out, i32 4
  ; Note the "workgroup" syncscope on this atomic.
1540  %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
1541  ret void
1542}
1543
; atomic_max_i32_ret_offset: workgroup-scope, seq_cst `atomicrmw max` (signed)
; at a constant offset of 4 i32 elements (16 bytes) from %out; the result is
; stored through %out2.
; Expected output differs by target:
;   - bonaire and tonga add the 16 to the address explicitly
;     (`s_add_u32 s0, s0, 16` / `s_addc_u32 s1, s1, 0`) and emit
;     `flat_atomic_smax v2, v[0:1], v2 glc`;
;   - gfx900 emits `flat_atomic_smax v2, v[0:1], v2 offset:16 glc`.
; With this syncscope the expected sequence after the atomic is
; `s_waitcnt lgkmcnt(0)`, the two `v_mov_b32` that set up the store address
; from s[2:3], a separate `s_waitcnt vmcnt(0)`, and then `flat_store_dword`;
; no `buffer_wbinvl1_vol` is expected.
1544define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
1545; GCN1-LABEL: atomic_max_i32_ret_offset:
1546; GCN1:       ; %bb.0: ; %entry
1547; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1548; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
1549; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1550; GCN1-NEXT:    s_add_u32 s0, s0, 16
1551; GCN1-NEXT:    s_addc_u32 s1, s1, 0
1552; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1553; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1554; GCN1-NEXT:    v_mov_b32_e32 v2, s4
1555; GCN1-NEXT:    flat_atomic_smax v2, v[0:1], v2 glc
1556; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1557; GCN1-NEXT:    v_mov_b32_e32 v0, s2
1558; GCN1-NEXT:    v_mov_b32_e32 v1, s3
1559; GCN1-NEXT:    s_waitcnt vmcnt(0)
1560; GCN1-NEXT:    flat_store_dword v[0:1], v2
1561; GCN1-NEXT:    s_endpgm
1562;
1563; GCN2-LABEL: atomic_max_i32_ret_offset:
1564; GCN2:       ; %bb.0: ; %entry
1565; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1566; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
1567; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1568; GCN2-NEXT:    s_add_u32 s0, s0, 16
1569; GCN2-NEXT:    s_addc_u32 s1, s1, 0
1570; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1571; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1572; GCN2-NEXT:    v_mov_b32_e32 v2, s4
1573; GCN2-NEXT:    flat_atomic_smax v2, v[0:1], v2 glc
1574; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1575; GCN2-NEXT:    v_mov_b32_e32 v0, s2
1576; GCN2-NEXT:    v_mov_b32_e32 v1, s3
1577; GCN2-NEXT:    s_waitcnt vmcnt(0)
1578; GCN2-NEXT:    flat_store_dword v[0:1], v2
1579; GCN2-NEXT:    s_endpgm
1580;
1581; GCN3-LABEL: atomic_max_i32_ret_offset:
1582; GCN3:       ; %bb.0: ; %entry
1583; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1584; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
1585; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1586; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1587; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1588; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1589; GCN3-NEXT:    flat_atomic_smax v2, v[0:1], v2 offset:16 glc
1590; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1591; GCN3-NEXT:    v_mov_b32_e32 v0, s2
1592; GCN3-NEXT:    v_mov_b32_e32 v1, s3
1593; GCN3-NEXT:    s_waitcnt vmcnt(0)
1594; GCN3-NEXT:    flat_store_dword v[0:1], v2
1595; GCN3-NEXT:    s_endpgm
1596entry:
  ; 4 i32 elements past %out, i.e. the 16 that appears in the expected output.
1597  %gep = getelementptr i32, ptr %out, i32 4
  ; Note the "workgroup" syncscope on this atomic.
1598  %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
1599  store i32 %val, ptr %out2
1600  ret void
1601}
1602
; Signed-max atomicrmw (workgroup scope, seq_cst) on element %index of %out,
; advanced by a further 4 i32 elements; the atomicrmw result is unused.
; The checks expect the i64 index to be scaled with s_lshl_b64 by 2 and added to
; the base, then a flat_atomic_smax without glc. The GCN3 checks use offset:16;
; the GCN1 and GCN2 checks add 16 with s_add_u32/s_addc_u32 instead.
1603define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
1604; GCN1-LABEL: atomic_max_i32_addr64_offset:
1605; GCN1:       ; %bb.0: ; %entry
1606; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1607; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
1608; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
1609; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1610; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1611; GCN1-NEXT:    s_add_u32 s0, s2, s0
1612; GCN1-NEXT:    s_addc_u32 s1, s3, s1
1613; GCN1-NEXT:    s_add_u32 s0, s0, 16
1614; GCN1-NEXT:    s_addc_u32 s1, s1, 0
1615; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1616; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1617; GCN1-NEXT:    v_mov_b32_e32 v2, s4
1618; GCN1-NEXT:    flat_atomic_smax v[0:1], v2
1619; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1620; GCN1-NEXT:    s_endpgm
1621;
1622; GCN2-LABEL: atomic_max_i32_addr64_offset:
1623; GCN2:       ; %bb.0: ; %entry
1624; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1625; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1626; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
1627; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1628; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1629; GCN2-NEXT:    s_add_u32 s0, s2, s0
1630; GCN2-NEXT:    s_addc_u32 s1, s3, s1
1631; GCN2-NEXT:    s_add_u32 s0, s0, 16
1632; GCN2-NEXT:    s_addc_u32 s1, s1, 0
1633; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1634; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1635; GCN2-NEXT:    v_mov_b32_e32 v2, s4
1636; GCN2-NEXT:    flat_atomic_smax v[0:1], v2
1637; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1638; GCN2-NEXT:    s_endpgm
1639;
1640; GCN3-LABEL: atomic_max_i32_addr64_offset:
1641; GCN3:       ; %bb.0: ; %entry
1642; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1643; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1644; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
1645; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1646; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1647; GCN3-NEXT:    s_add_u32 s0, s2, s0
1648; GCN3-NEXT:    s_addc_u32 s1, s3, s1
1649; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1650; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1651; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1652; GCN3-NEXT:    flat_atomic_smax v[0:1], v2 offset:16
1653; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1654; GCN3-NEXT:    s_endpgm
1655entry:
1656  %ptr = getelementptr i32, ptr %out, i64 %index
1657  %gep = getelementptr i32, ptr %ptr, i32 4
1658  %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
1659  ret void
1660}
1661
; Signed-max atomicrmw (workgroup scope, seq_cst) on element %index of %out,
; advanced by a further 4 i32 elements; the value returned by the atomicrmw is
; stored to %out2.
; The checks expect the index to be scaled with s_lshl_b64 by 2, then a
; flat_atomic_smax with glc followed by flat_store_dword. The GCN3 checks use
; offset:16; the GCN1 and GCN2 checks add 16 with s_add_u32/s_addc_u32 instead.
1662define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
1663; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
1664; GCN1:       ; %bb.0: ; %entry
1665; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
1666; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1667; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
1668; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1669; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1670; GCN1-NEXT:    s_add_u32 s0, s0, s4
1671; GCN1-NEXT:    s_addc_u32 s1, s1, s5
1672; GCN1-NEXT:    s_add_u32 s0, s0, 16
1673; GCN1-NEXT:    s_addc_u32 s1, s1, 0
1674; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1675; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1676; GCN1-NEXT:    v_mov_b32_e32 v2, s8
1677; GCN1-NEXT:    flat_atomic_smax v2, v[0:1], v2 glc
1678; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1679; GCN1-NEXT:    v_mov_b32_e32 v0, s2
1680; GCN1-NEXT:    v_mov_b32_e32 v1, s3
1681; GCN1-NEXT:    s_waitcnt vmcnt(0)
1682; GCN1-NEXT:    flat_store_dword v[0:1], v2
1683; GCN1-NEXT:    s_endpgm
1684;
1685; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
1686; GCN2:       ; %bb.0: ; %entry
1687; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
1688; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1689; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
1690; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1691; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1692; GCN2-NEXT:    s_add_u32 s0, s0, s4
1693; GCN2-NEXT:    s_addc_u32 s1, s1, s5
1694; GCN2-NEXT:    s_add_u32 s0, s0, 16
1695; GCN2-NEXT:    s_addc_u32 s1, s1, 0
1696; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1697; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1698; GCN2-NEXT:    v_mov_b32_e32 v2, s8
1699; GCN2-NEXT:    flat_atomic_smax v2, v[0:1], v2 glc
1700; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1701; GCN2-NEXT:    v_mov_b32_e32 v0, s2
1702; GCN2-NEXT:    v_mov_b32_e32 v1, s3
1703; GCN2-NEXT:    s_waitcnt vmcnt(0)
1704; GCN2-NEXT:    flat_store_dword v[0:1], v2
1705; GCN2-NEXT:    s_endpgm
1706;
1707; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
1708; GCN3:       ; %bb.0: ; %entry
1709; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
1710; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1711; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
1712; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1713; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1714; GCN3-NEXT:    s_add_u32 s0, s0, s4
1715; GCN3-NEXT:    s_addc_u32 s1, s1, s5
1716; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1717; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1718; GCN3-NEXT:    v_mov_b32_e32 v2, s8
1719; GCN3-NEXT:    flat_atomic_smax v2, v[0:1], v2 offset:16 glc
1720; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1721; GCN3-NEXT:    v_mov_b32_e32 v0, s2
1722; GCN3-NEXT:    v_mov_b32_e32 v1, s3
1723; GCN3-NEXT:    s_waitcnt vmcnt(0)
1724; GCN3-NEXT:    flat_store_dword v[0:1], v2
1725; GCN3-NEXT:    s_endpgm
1726entry:
1727  %ptr = getelementptr i32, ptr %out, i64 %index
1728  %gep = getelementptr i32, ptr %ptr, i32 4
1729  %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
1730  store i32 %val, ptr %out2
1731  ret void
1732}
1733
; Signed-max atomicrmw (workgroup scope, seq_cst) applied directly to %out with
; no displacement; the atomicrmw result is unused.
; All three check prefixes expect the same sequence: flat_atomic_smax without
; glc and without an offset, followed by s_waitcnt lgkmcnt(0).
1734define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) {
1735; GCN1-LABEL: atomic_max_i32:
1736; GCN1:       ; %bb.0: ; %entry
1737; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1738; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
1739; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1740; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1741; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1742; GCN1-NEXT:    v_mov_b32_e32 v2, s2
1743; GCN1-NEXT:    flat_atomic_smax v[0:1], v2
1744; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1745; GCN1-NEXT:    s_endpgm
1746;
1747; GCN2-LABEL: atomic_max_i32:
1748; GCN2:       ; %bb.0: ; %entry
1749; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1750; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
1751; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1752; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1753; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1754; GCN2-NEXT:    v_mov_b32_e32 v2, s2
1755; GCN2-NEXT:    flat_atomic_smax v[0:1], v2
1756; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1757; GCN2-NEXT:    s_endpgm
1758;
1759; GCN3-LABEL: atomic_max_i32:
1760; GCN3:       ; %bb.0: ; %entry
1761; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1762; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
1763; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1764; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1765; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1766; GCN3-NEXT:    v_mov_b32_e32 v2, s2
1767; GCN3-NEXT:    flat_atomic_smax v[0:1], v2
1768; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1769; GCN3-NEXT:    s_endpgm
1770entry:
1771  %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
1772  ret void
1773}
1774
; Signed-max atomicrmw (workgroup scope, seq_cst) applied directly to %out; the
; value returned by the atomicrmw is stored to %out2.
; The checks expect flat_atomic_smax with glc and no offset, then
; s_waitcnt vmcnt(0) ahead of the flat_store_dword of the returned value.
1775define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) {
1776; GCN1-LABEL: atomic_max_i32_ret:
1777; GCN1:       ; %bb.0: ; %entry
1778; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1779; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
1780; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1781; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1782; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1783; GCN1-NEXT:    v_mov_b32_e32 v2, s4
1784; GCN1-NEXT:    flat_atomic_smax v2, v[0:1], v2 glc
1785; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1786; GCN1-NEXT:    v_mov_b32_e32 v0, s2
1787; GCN1-NEXT:    v_mov_b32_e32 v1, s3
1788; GCN1-NEXT:    s_waitcnt vmcnt(0)
1789; GCN1-NEXT:    flat_store_dword v[0:1], v2
1790; GCN1-NEXT:    s_endpgm
1791;
1792; GCN2-LABEL: atomic_max_i32_ret:
1793; GCN2:       ; %bb.0: ; %entry
1794; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1795; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
1796; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1797; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1798; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1799; GCN2-NEXT:    v_mov_b32_e32 v2, s4
1800; GCN2-NEXT:    flat_atomic_smax v2, v[0:1], v2 glc
1801; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1802; GCN2-NEXT:    v_mov_b32_e32 v0, s2
1803; GCN2-NEXT:    v_mov_b32_e32 v1, s3
1804; GCN2-NEXT:    s_waitcnt vmcnt(0)
1805; GCN2-NEXT:    flat_store_dword v[0:1], v2
1806; GCN2-NEXT:    s_endpgm
1807;
1808; GCN3-LABEL: atomic_max_i32_ret:
1809; GCN3:       ; %bb.0: ; %entry
1810; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1811; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
1812; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1813; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1814; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1815; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1816; GCN3-NEXT:    flat_atomic_smax v2, v[0:1], v2 glc
1817; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1818; GCN3-NEXT:    v_mov_b32_e32 v0, s2
1819; GCN3-NEXT:    v_mov_b32_e32 v1, s3
1820; GCN3-NEXT:    s_waitcnt vmcnt(0)
1821; GCN3-NEXT:    flat_store_dword v[0:1], v2
1822; GCN3-NEXT:    s_endpgm
1823entry:
1824  %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
1825  store i32 %val, ptr %out2
1826  ret void
1827}
1828
; Signed-max atomicrmw (workgroup scope, seq_cst) on element %index of %out,
; with no constant displacement; the atomicrmw result is unused.
; The checks expect the i64 index to be scaled with s_lshl_b64 by 2 and added to
; the base with s_add_u32/s_addc_u32, then flat_atomic_smax without glc or
; offset on every prefix.
1829define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) {
1830; GCN1-LABEL: atomic_max_i32_addr64:
1831; GCN1:       ; %bb.0: ; %entry
1832; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1833; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
1834; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
1835; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1836; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1837; GCN1-NEXT:    s_add_u32 s0, s2, s0
1838; GCN1-NEXT:    s_addc_u32 s1, s3, s1
1839; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1840; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1841; GCN1-NEXT:    v_mov_b32_e32 v2, s4
1842; GCN1-NEXT:    flat_atomic_smax v[0:1], v2
1843; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1844; GCN1-NEXT:    s_endpgm
1845;
1846; GCN2-LABEL: atomic_max_i32_addr64:
1847; GCN2:       ; %bb.0: ; %entry
1848; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1849; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1850; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
1851; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1852; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1853; GCN2-NEXT:    s_add_u32 s0, s2, s0
1854; GCN2-NEXT:    s_addc_u32 s1, s3, s1
1855; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1856; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1857; GCN2-NEXT:    v_mov_b32_e32 v2, s4
1858; GCN2-NEXT:    flat_atomic_smax v[0:1], v2
1859; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1860; GCN2-NEXT:    s_endpgm
1861;
1862; GCN3-LABEL: atomic_max_i32_addr64:
1863; GCN3:       ; %bb.0: ; %entry
1864; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1865; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1866; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
1867; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1868; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1869; GCN3-NEXT:    s_add_u32 s0, s2, s0
1870; GCN3-NEXT:    s_addc_u32 s1, s3, s1
1871; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1872; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1873; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1874; GCN3-NEXT:    flat_atomic_smax v[0:1], v2
1875; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1876; GCN3-NEXT:    s_endpgm
1877entry:
1878  %ptr = getelementptr i32, ptr %out, i64 %index
1879  %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
1880  ret void
1881}
1882
; Signed-max atomicrmw (workgroup scope, seq_cst) on element %index of %out,
; with no constant displacement; the value returned by the atomicrmw is stored
; to %out2.
; The checks expect the index to be scaled with s_lshl_b64 by 2, then
; flat_atomic_smax with glc (no offset) followed by flat_store_dword.
1883define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
1884; GCN1-LABEL: atomic_max_i32_ret_addr64:
1885; GCN1:       ; %bb.0: ; %entry
1886; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
1887; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1888; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
1889; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1890; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1891; GCN1-NEXT:    s_add_u32 s0, s0, s4
1892; GCN1-NEXT:    s_addc_u32 s1, s1, s5
1893; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1894; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1895; GCN1-NEXT:    v_mov_b32_e32 v2, s8
1896; GCN1-NEXT:    flat_atomic_smax v2, v[0:1], v2 glc
1897; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1898; GCN1-NEXT:    v_mov_b32_e32 v0, s2
1899; GCN1-NEXT:    v_mov_b32_e32 v1, s3
1900; GCN1-NEXT:    s_waitcnt vmcnt(0)
1901; GCN1-NEXT:    flat_store_dword v[0:1], v2
1902; GCN1-NEXT:    s_endpgm
1903;
1904; GCN2-LABEL: atomic_max_i32_ret_addr64:
1905; GCN2:       ; %bb.0: ; %entry
1906; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
1907; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1908; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
1909; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1910; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1911; GCN2-NEXT:    s_add_u32 s0, s0, s4
1912; GCN2-NEXT:    s_addc_u32 s1, s1, s5
1913; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1914; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1915; GCN2-NEXT:    v_mov_b32_e32 v2, s8
1916; GCN2-NEXT:    flat_atomic_smax v2, v[0:1], v2 glc
1917; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1918; GCN2-NEXT:    v_mov_b32_e32 v0, s2
1919; GCN2-NEXT:    v_mov_b32_e32 v1, s3
1920; GCN2-NEXT:    s_waitcnt vmcnt(0)
1921; GCN2-NEXT:    flat_store_dword v[0:1], v2
1922; GCN2-NEXT:    s_endpgm
1923;
1924; GCN3-LABEL: atomic_max_i32_ret_addr64:
1925; GCN3:       ; %bb.0: ; %entry
1926; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
1927; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1928; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
1929; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1930; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1931; GCN3-NEXT:    s_add_u32 s0, s0, s4
1932; GCN3-NEXT:    s_addc_u32 s1, s1, s5
1933; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1934; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1935; GCN3-NEXT:    v_mov_b32_e32 v2, s8
1936; GCN3-NEXT:    flat_atomic_smax v2, v[0:1], v2 glc
1937; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1938; GCN3-NEXT:    v_mov_b32_e32 v0, s2
1939; GCN3-NEXT:    v_mov_b32_e32 v1, s3
1940; GCN3-NEXT:    s_waitcnt vmcnt(0)
1941; GCN3-NEXT:    flat_store_dword v[0:1], v2
1942; GCN3-NEXT:    s_endpgm
1943entry:
1944  %ptr = getelementptr i32, ptr %out, i64 %index
1945  %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
1946  store i32 %val, ptr %out2
1947  ret void
1948}
1949
; Unsigned-max atomicrmw (workgroup scope, seq_cst) on %out advanced by 4 i32
; elements; the atomicrmw result is unused.
; The checks expect flat_atomic_umax without glc. The GCN3 checks carry the
; 16-byte displacement as offset:16 on the atomic, while the GCN1 and GCN2
; checks add it to the address with s_add_u32/s_addc_u32 beforehand.
1950define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
1951; GCN1-LABEL: atomic_umax_i32_offset:
1952; GCN1:       ; %bb.0: ; %entry
1953; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1954; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
1955; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1956; GCN1-NEXT:    s_add_u32 s0, s0, 16
1957; GCN1-NEXT:    s_addc_u32 s1, s1, 0
1958; GCN1-NEXT:    v_mov_b32_e32 v0, s0
1959; GCN1-NEXT:    v_mov_b32_e32 v1, s1
1960; GCN1-NEXT:    v_mov_b32_e32 v2, s2
1961; GCN1-NEXT:    flat_atomic_umax v[0:1], v2
1962; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
1963; GCN1-NEXT:    s_endpgm
1964;
1965; GCN2-LABEL: atomic_umax_i32_offset:
1966; GCN2:       ; %bb.0: ; %entry
1967; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1968; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
1969; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1970; GCN2-NEXT:    s_add_u32 s0, s0, 16
1971; GCN2-NEXT:    s_addc_u32 s1, s1, 0
1972; GCN2-NEXT:    v_mov_b32_e32 v0, s0
1973; GCN2-NEXT:    v_mov_b32_e32 v1, s1
1974; GCN2-NEXT:    v_mov_b32_e32 v2, s2
1975; GCN2-NEXT:    flat_atomic_umax v[0:1], v2
1976; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
1977; GCN2-NEXT:    s_endpgm
1978;
1979; GCN3-LABEL: atomic_umax_i32_offset:
1980; GCN3:       ; %bb.0: ; %entry
1981; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1982; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
1983; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1984; GCN3-NEXT:    v_mov_b32_e32 v0, s0
1985; GCN3-NEXT:    v_mov_b32_e32 v1, s1
1986; GCN3-NEXT:    v_mov_b32_e32 v2, s2
1987; GCN3-NEXT:    flat_atomic_umax v[0:1], v2 offset:16
1988; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
1989; GCN3-NEXT:    s_endpgm
1990entry:
1991  %gep = getelementptr i32, ptr %out, i32 4
1992  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
1993  ret void
1994}
1995
; Unsigned-max atomicrmw (workgroup scope, seq_cst) on %out advanced by 4 i32
; elements; the value returned by the atomicrmw is stored to %out2.
; The checks expect flat_atomic_umax with glc. The GCN3 checks carry the 16-byte
; displacement as offset:16 on the atomic, while the GCN1 and GCN2 checks add it
; to the address with s_add_u32/s_addc_u32 beforehand.
1996define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
1997; GCN1-LABEL: atomic_umax_i32_ret_offset:
1998; GCN1:       ; %bb.0: ; %entry
1999; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2000; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
2001; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2002; GCN1-NEXT:    s_add_u32 s0, s0, 16
2003; GCN1-NEXT:    s_addc_u32 s1, s1, 0
2004; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2005; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2006; GCN1-NEXT:    v_mov_b32_e32 v2, s4
2007; GCN1-NEXT:    flat_atomic_umax v2, v[0:1], v2 glc
2008; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2009; GCN1-NEXT:    v_mov_b32_e32 v0, s2
2010; GCN1-NEXT:    v_mov_b32_e32 v1, s3
2011; GCN1-NEXT:    s_waitcnt vmcnt(0)
2012; GCN1-NEXT:    flat_store_dword v[0:1], v2
2013; GCN1-NEXT:    s_endpgm
2014;
2015; GCN2-LABEL: atomic_umax_i32_ret_offset:
2016; GCN2:       ; %bb.0: ; %entry
2017; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2018; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
2019; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2020; GCN2-NEXT:    s_add_u32 s0, s0, 16
2021; GCN2-NEXT:    s_addc_u32 s1, s1, 0
2022; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2023; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2024; GCN2-NEXT:    v_mov_b32_e32 v2, s4
2025; GCN2-NEXT:    flat_atomic_umax v2, v[0:1], v2 glc
2026; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2027; GCN2-NEXT:    v_mov_b32_e32 v0, s2
2028; GCN2-NEXT:    v_mov_b32_e32 v1, s3
2029; GCN2-NEXT:    s_waitcnt vmcnt(0)
2030; GCN2-NEXT:    flat_store_dword v[0:1], v2
2031; GCN2-NEXT:    s_endpgm
2032;
2033; GCN3-LABEL: atomic_umax_i32_ret_offset:
2034; GCN3:       ; %bb.0: ; %entry
2035; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2036; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
2037; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2038; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2039; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2040; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2041; GCN3-NEXT:    flat_atomic_umax v2, v[0:1], v2 offset:16 glc
2042; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2043; GCN3-NEXT:    v_mov_b32_e32 v0, s2
2044; GCN3-NEXT:    v_mov_b32_e32 v1, s3
2045; GCN3-NEXT:    s_waitcnt vmcnt(0)
2046; GCN3-NEXT:    flat_store_dword v[0:1], v2
2047; GCN3-NEXT:    s_endpgm
2048entry:
2049  %gep = getelementptr i32, ptr %out, i32 4
2050  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
2051  store i32 %val, ptr %out2
2052  ret void
2053}
2054
; Unsigned-max atomicrmw (workgroup scope, seq_cst) on element %index of %out,
; advanced by a further 4 i32 elements; the atomicrmw result is unused.
; The checks expect the i64 index to be scaled with s_lshl_b64 by 2 and added to
; the base, then a flat_atomic_umax without glc. The GCN3 checks use offset:16;
; the GCN1 and GCN2 checks add 16 with s_add_u32/s_addc_u32 instead.
2055define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
2056; GCN1-LABEL: atomic_umax_i32_addr64_offset:
2057; GCN1:       ; %bb.0: ; %entry
2058; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2059; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
2060; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
2061; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2062; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2063; GCN1-NEXT:    s_add_u32 s0, s2, s0
2064; GCN1-NEXT:    s_addc_u32 s1, s3, s1
2065; GCN1-NEXT:    s_add_u32 s0, s0, 16
2066; GCN1-NEXT:    s_addc_u32 s1, s1, 0
2067; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2068; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2069; GCN1-NEXT:    v_mov_b32_e32 v2, s4
2070; GCN1-NEXT:    flat_atomic_umax v[0:1], v2
2071; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2072; GCN1-NEXT:    s_endpgm
2073;
2074; GCN2-LABEL: atomic_umax_i32_addr64_offset:
2075; GCN2:       ; %bb.0: ; %entry
2076; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2077; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
2078; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
2079; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2080; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2081; GCN2-NEXT:    s_add_u32 s0, s2, s0
2082; GCN2-NEXT:    s_addc_u32 s1, s3, s1
2083; GCN2-NEXT:    s_add_u32 s0, s0, 16
2084; GCN2-NEXT:    s_addc_u32 s1, s1, 0
2085; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2086; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2087; GCN2-NEXT:    v_mov_b32_e32 v2, s4
2088; GCN2-NEXT:    flat_atomic_umax v[0:1], v2
2089; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2090; GCN2-NEXT:    s_endpgm
2091;
2092; GCN3-LABEL: atomic_umax_i32_addr64_offset:
2093; GCN3:       ; %bb.0: ; %entry
2094; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2095; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
2096; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
2097; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2098; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2099; GCN3-NEXT:    s_add_u32 s0, s2, s0
2100; GCN3-NEXT:    s_addc_u32 s1, s3, s1
2101; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2102; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2103; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2104; GCN3-NEXT:    flat_atomic_umax v[0:1], v2 offset:16
2105; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2106; GCN3-NEXT:    s_endpgm
2107entry:
2108  %ptr = getelementptr i32, ptr %out, i64 %index
2109  %gep = getelementptr i32, ptr %ptr, i32 4
2110  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
2111  ret void
2112}
2113
; Unsigned-max atomicrmw (workgroup scope, seq_cst) on element %index of %out,
; advanced by a further 4 i32 elements; the value returned by the atomicrmw is
; stored to %out2.
; The checks expect the index to be scaled with s_lshl_b64 by 2, then a
; flat_atomic_umax with glc followed by flat_store_dword. The GCN3 checks use
; offset:16; the GCN1 and GCN2 checks add 16 with s_add_u32/s_addc_u32 instead.
2114define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
2115; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
2116; GCN1:       ; %bb.0: ; %entry
2117; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
2118; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2119; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
2120; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2121; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2122; GCN1-NEXT:    s_add_u32 s0, s0, s4
2123; GCN1-NEXT:    s_addc_u32 s1, s1, s5
2124; GCN1-NEXT:    s_add_u32 s0, s0, 16
2125; GCN1-NEXT:    s_addc_u32 s1, s1, 0
2126; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2127; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2128; GCN1-NEXT:    v_mov_b32_e32 v2, s8
2129; GCN1-NEXT:    flat_atomic_umax v2, v[0:1], v2 glc
2130; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2131; GCN1-NEXT:    v_mov_b32_e32 v0, s2
2132; GCN1-NEXT:    v_mov_b32_e32 v1, s3
2133; GCN1-NEXT:    s_waitcnt vmcnt(0)
2134; GCN1-NEXT:    flat_store_dword v[0:1], v2
2135; GCN1-NEXT:    s_endpgm
2136;
2137; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
2138; GCN2:       ; %bb.0: ; %entry
2139; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2140; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2141; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
2142; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2143; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2144; GCN2-NEXT:    s_add_u32 s0, s0, s4
2145; GCN2-NEXT:    s_addc_u32 s1, s1, s5
2146; GCN2-NEXT:    s_add_u32 s0, s0, 16
2147; GCN2-NEXT:    s_addc_u32 s1, s1, 0
2148; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2149; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2150; GCN2-NEXT:    v_mov_b32_e32 v2, s8
2151; GCN2-NEXT:    flat_atomic_umax v2, v[0:1], v2 glc
2152; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2153; GCN2-NEXT:    v_mov_b32_e32 v0, s2
2154; GCN2-NEXT:    v_mov_b32_e32 v1, s3
2155; GCN2-NEXT:    s_waitcnt vmcnt(0)
2156; GCN2-NEXT:    flat_store_dword v[0:1], v2
2157; GCN2-NEXT:    s_endpgm
2158;
2159; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
2160; GCN3:       ; %bb.0: ; %entry
2161; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2162; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2163; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
2164; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2165; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2166; GCN3-NEXT:    s_add_u32 s0, s0, s4
2167; GCN3-NEXT:    s_addc_u32 s1, s1, s5
2168; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2169; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2170; GCN3-NEXT:    v_mov_b32_e32 v2, s8
2171; GCN3-NEXT:    flat_atomic_umax v2, v[0:1], v2 offset:16 glc
2172; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2173; GCN3-NEXT:    v_mov_b32_e32 v0, s2
2174; GCN3-NEXT:    v_mov_b32_e32 v1, s3
2175; GCN3-NEXT:    s_waitcnt vmcnt(0)
2176; GCN3-NEXT:    flat_store_dword v[0:1], v2
2177; GCN3-NEXT:    s_endpgm
2178entry:
2179  %ptr = getelementptr i32, ptr %out, i64 %index
2180  %gep = getelementptr i32, ptr %ptr, i32 4
2181  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
2182  store i32 %val, ptr %out2
2183  ret void
2184}
2185
; Unsigned-max atomicrmw (workgroup scope, seq_cst) applied directly to %out
; with no displacement; the atomicrmw result is unused.
; All three check prefixes expect the same sequence: flat_atomic_umax without
; glc and without an offset, followed by s_waitcnt lgkmcnt(0).
2186define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) {
2187; GCN1-LABEL: atomic_umax_i32:
2188; GCN1:       ; %bb.0: ; %entry
2189; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2190; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
2191; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2192; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2193; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2194; GCN1-NEXT:    v_mov_b32_e32 v2, s2
2195; GCN1-NEXT:    flat_atomic_umax v[0:1], v2
2196; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2197; GCN1-NEXT:    s_endpgm
2198;
2199; GCN2-LABEL: atomic_umax_i32:
2200; GCN2:       ; %bb.0: ; %entry
2201; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2202; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
2203; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2204; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2205; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2206; GCN2-NEXT:    v_mov_b32_e32 v2, s2
2207; GCN2-NEXT:    flat_atomic_umax v[0:1], v2
2208; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2209; GCN2-NEXT:    s_endpgm
2210;
2211; GCN3-LABEL: atomic_umax_i32:
2212; GCN3:       ; %bb.0: ; %entry
2213; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2214; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
2215; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2216; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2217; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2218; GCN3-NEXT:    v_mov_b32_e32 v2, s2
2219; GCN3-NEXT:    flat_atomic_umax v[0:1], v2
2220; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2221; GCN3-NEXT:    s_endpgm
2222entry:
2223  %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
2224  ret void
2225}
2226
; Unsigned-max atomicrmw (workgroup scope, seq_cst) applied directly to %out;
; the value returned by the atomicrmw is stored to %out2.
; The checks expect flat_atomic_umax with glc and no offset, then
; s_waitcnt vmcnt(0) ahead of the flat_store_dword of the returned value.
2227define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) {
2228; GCN1-LABEL: atomic_umax_i32_ret:
2229; GCN1:       ; %bb.0: ; %entry
2230; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2231; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
2232; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2233; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2234; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2235; GCN1-NEXT:    v_mov_b32_e32 v2, s4
2236; GCN1-NEXT:    flat_atomic_umax v2, v[0:1], v2 glc
2237; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2238; GCN1-NEXT:    v_mov_b32_e32 v0, s2
2239; GCN1-NEXT:    v_mov_b32_e32 v1, s3
2240; GCN1-NEXT:    s_waitcnt vmcnt(0)
2241; GCN1-NEXT:    flat_store_dword v[0:1], v2
2242; GCN1-NEXT:    s_endpgm
2243;
2244; GCN2-LABEL: atomic_umax_i32_ret:
2245; GCN2:       ; %bb.0: ; %entry
2246; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2247; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
2248; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2249; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2250; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2251; GCN2-NEXT:    v_mov_b32_e32 v2, s4
2252; GCN2-NEXT:    flat_atomic_umax v2, v[0:1], v2 glc
2253; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2254; GCN2-NEXT:    v_mov_b32_e32 v0, s2
2255; GCN2-NEXT:    v_mov_b32_e32 v1, s3
2256; GCN2-NEXT:    s_waitcnt vmcnt(0)
2257; GCN2-NEXT:    flat_store_dword v[0:1], v2
2258; GCN2-NEXT:    s_endpgm
2259;
2260; GCN3-LABEL: atomic_umax_i32_ret:
2261; GCN3:       ; %bb.0: ; %entry
2262; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2263; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
2264; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2265; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2266; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2267; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2268; GCN3-NEXT:    flat_atomic_umax v2, v[0:1], v2 glc
2269; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2270; GCN3-NEXT:    v_mov_b32_e32 v0, s2
2271; GCN3-NEXT:    v_mov_b32_e32 v1, s3
2272; GCN3-NEXT:    s_waitcnt vmcnt(0)
2273; GCN3-NEXT:    flat_store_dword v[0:1], v2
2274; GCN3-NEXT:    s_endpgm
2275entry:
2276  %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
2277  store i32 %val, ptr %out2
2278  ret void
2279}
2280
; Unsigned-max atomicrmw (workgroup scope, seq_cst) on element %index of %out,
; with no constant displacement; the atomicrmw result is unused.
; The checks expect the i64 index to be scaled with s_lshl_b64 by 2 and added to
; the base with s_add_u32/s_addc_u32, then flat_atomic_umax without glc or
; offset on every prefix.
2281define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) {
2282; GCN1-LABEL: atomic_umax_i32_addr64:
2283; GCN1:       ; %bb.0: ; %entry
2284; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2285; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
2286; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
2287; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2288; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2289; GCN1-NEXT:    s_add_u32 s0, s2, s0
2290; GCN1-NEXT:    s_addc_u32 s1, s3, s1
2291; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2292; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2293; GCN1-NEXT:    v_mov_b32_e32 v2, s4
2294; GCN1-NEXT:    flat_atomic_umax v[0:1], v2
2295; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2296; GCN1-NEXT:    s_endpgm
2297;
2298; GCN2-LABEL: atomic_umax_i32_addr64:
2299; GCN2:       ; %bb.0: ; %entry
2300; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2301; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
2302; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
2303; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2304; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2305; GCN2-NEXT:    s_add_u32 s0, s2, s0
2306; GCN2-NEXT:    s_addc_u32 s1, s3, s1
2307; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2308; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2309; GCN2-NEXT:    v_mov_b32_e32 v2, s4
2310; GCN2-NEXT:    flat_atomic_umax v[0:1], v2
2311; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2312; GCN2-NEXT:    s_endpgm
2313;
2314; GCN3-LABEL: atomic_umax_i32_addr64:
2315; GCN3:       ; %bb.0: ; %entry
2316; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2317; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
2318; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
2319; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2320; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2321; GCN3-NEXT:    s_add_u32 s0, s2, s0
2322; GCN3-NEXT:    s_addc_u32 s1, s3, s1
2323; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2324; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2325; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2326; GCN3-NEXT:    flat_atomic_umax v[0:1], v2
2327; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2328; GCN3-NEXT:    s_endpgm
2329entry:
2330  %ptr = getelementptr i32, ptr %out, i64 %index
2331  %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
2332  ret void
2333}
2334
; Test: returning i32 `atomicrmw volatile umax` (syncscope "workgroup", seq_cst)
; on %out indexed by the i64 %index; the old value is stored to %out2.
; All three targets are expected to form the address with s_lshl_b64 by 2 and
; an s_add_u32/s_addc_u32 pair, select flat_atomic_umax with the glc modifier,
; and issue s_waitcnt vmcnt(0) before the flat_store_dword of the result.
2335define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
2336; GCN1-LABEL: atomic_umax_i32_ret_addr64:
2337; GCN1:       ; %bb.0: ; %entry
2338; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
2339; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2340; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
2341; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2342; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2343; GCN1-NEXT:    s_add_u32 s0, s0, s4
2344; GCN1-NEXT:    s_addc_u32 s1, s1, s5
2345; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2346; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2347; GCN1-NEXT:    v_mov_b32_e32 v2, s8
2348; GCN1-NEXT:    flat_atomic_umax v2, v[0:1], v2 glc
2349; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2350; GCN1-NEXT:    v_mov_b32_e32 v0, s2
2351; GCN1-NEXT:    v_mov_b32_e32 v1, s3
2352; GCN1-NEXT:    s_waitcnt vmcnt(0)
2353; GCN1-NEXT:    flat_store_dword v[0:1], v2
2354; GCN1-NEXT:    s_endpgm
2355;
2356; GCN2-LABEL: atomic_umax_i32_ret_addr64:
2357; GCN2:       ; %bb.0: ; %entry
2358; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2359; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2360; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
2361; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2362; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2363; GCN2-NEXT:    s_add_u32 s0, s0, s4
2364; GCN2-NEXT:    s_addc_u32 s1, s1, s5
2365; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2366; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2367; GCN2-NEXT:    v_mov_b32_e32 v2, s8
2368; GCN2-NEXT:    flat_atomic_umax v2, v[0:1], v2 glc
2369; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2370; GCN2-NEXT:    v_mov_b32_e32 v0, s2
2371; GCN2-NEXT:    v_mov_b32_e32 v1, s3
2372; GCN2-NEXT:    s_waitcnt vmcnt(0)
2373; GCN2-NEXT:    flat_store_dword v[0:1], v2
2374; GCN2-NEXT:    s_endpgm
2375;
2376; GCN3-LABEL: atomic_umax_i32_ret_addr64:
2377; GCN3:       ; %bb.0: ; %entry
2378; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2379; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2380; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
2381; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2382; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2383; GCN3-NEXT:    s_add_u32 s0, s0, s4
2384; GCN3-NEXT:    s_addc_u32 s1, s1, s5
2385; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2386; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2387; GCN3-NEXT:    v_mov_b32_e32 v2, s8
2388; GCN3-NEXT:    flat_atomic_umax v2, v[0:1], v2 glc
2389; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2390; GCN3-NEXT:    v_mov_b32_e32 v0, s2
2391; GCN3-NEXT:    v_mov_b32_e32 v1, s3
2392; GCN3-NEXT:    s_waitcnt vmcnt(0)
2393; GCN3-NEXT:    flat_store_dword v[0:1], v2
2394; GCN3-NEXT:    s_endpgm
2395entry:
2396  %ptr = getelementptr i32, ptr %out, i64 %index
2397  %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
2398  store i32 %val, ptr %out2
2399  ret void
2400}
2401
; Test: non-returning i32 `atomicrmw volatile min` (syncscope "workgroup",
; seq_cst) at %out plus 4 i32 elements.
; GCN1 and GCN2 are expected to add 16 to the address with
; s_add_u32/s_addc_u32, while GCN3 uses `offset:16` on flat_atomic_smin.
; The result is unused and the expected instruction has no glc on any target.
2402define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
2403; GCN1-LABEL: atomic_min_i32_offset:
2404; GCN1:       ; %bb.0: ; %entry
2405; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2406; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
2407; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2408; GCN1-NEXT:    s_add_u32 s0, s0, 16
2409; GCN1-NEXT:    s_addc_u32 s1, s1, 0
2410; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2411; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2412; GCN1-NEXT:    v_mov_b32_e32 v2, s2
2413; GCN1-NEXT:    flat_atomic_smin v[0:1], v2
2414; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2415; GCN1-NEXT:    s_endpgm
2416;
2417; GCN2-LABEL: atomic_min_i32_offset:
2418; GCN2:       ; %bb.0: ; %entry
2419; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2420; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
2421; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2422; GCN2-NEXT:    s_add_u32 s0, s0, 16
2423; GCN2-NEXT:    s_addc_u32 s1, s1, 0
2424; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2425; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2426; GCN2-NEXT:    v_mov_b32_e32 v2, s2
2427; GCN2-NEXT:    flat_atomic_smin v[0:1], v2
2428; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2429; GCN2-NEXT:    s_endpgm
2430;
2431; GCN3-LABEL: atomic_min_i32_offset:
2432; GCN3:       ; %bb.0: ; %entry
2433; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2434; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
2435; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2436; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2437; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2438; GCN3-NEXT:    v_mov_b32_e32 v2, s2
2439; GCN3-NEXT:    flat_atomic_smin v[0:1], v2 offset:16
2440; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2441; GCN3-NEXT:    s_endpgm
2442entry:
2443  %gep = getelementptr i32, ptr %out, i32 4
2444  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
2445  ret void
2446}
2447
; Test: returning i32 `atomicrmw volatile min` (syncscope "workgroup", seq_cst)
; at %out plus 4 i32 elements; the old value is stored to %out2.
; GCN1 and GCN2 are expected to add 16 to the address with
; s_add_u32/s_addc_u32, while GCN3 uses `offset:16` on the instruction.
; All targets expect flat_atomic_smin with glc, followed by s_waitcnt vmcnt(0)
; before the flat_store_dword of the result.
2448define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
2449; GCN1-LABEL: atomic_min_i32_ret_offset:
2450; GCN1:       ; %bb.0: ; %entry
2451; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2452; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
2453; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2454; GCN1-NEXT:    s_add_u32 s0, s0, 16
2455; GCN1-NEXT:    s_addc_u32 s1, s1, 0
2456; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2457; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2458; GCN1-NEXT:    v_mov_b32_e32 v2, s4
2459; GCN1-NEXT:    flat_atomic_smin v2, v[0:1], v2 glc
2460; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2461; GCN1-NEXT:    v_mov_b32_e32 v0, s2
2462; GCN1-NEXT:    v_mov_b32_e32 v1, s3
2463; GCN1-NEXT:    s_waitcnt vmcnt(0)
2464; GCN1-NEXT:    flat_store_dword v[0:1], v2
2465; GCN1-NEXT:    s_endpgm
2466;
2467; GCN2-LABEL: atomic_min_i32_ret_offset:
2468; GCN2:       ; %bb.0: ; %entry
2469; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2470; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
2471; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2472; GCN2-NEXT:    s_add_u32 s0, s0, 16
2473; GCN2-NEXT:    s_addc_u32 s1, s1, 0
2474; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2475; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2476; GCN2-NEXT:    v_mov_b32_e32 v2, s4
2477; GCN2-NEXT:    flat_atomic_smin v2, v[0:1], v2 glc
2478; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2479; GCN2-NEXT:    v_mov_b32_e32 v0, s2
2480; GCN2-NEXT:    v_mov_b32_e32 v1, s3
2481; GCN2-NEXT:    s_waitcnt vmcnt(0)
2482; GCN2-NEXT:    flat_store_dword v[0:1], v2
2483; GCN2-NEXT:    s_endpgm
2484;
2485; GCN3-LABEL: atomic_min_i32_ret_offset:
2486; GCN3:       ; %bb.0: ; %entry
2487; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2488; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
2489; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2490; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2491; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2492; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2493; GCN3-NEXT:    flat_atomic_smin v2, v[0:1], v2 offset:16 glc
2494; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2495; GCN3-NEXT:    v_mov_b32_e32 v0, s2
2496; GCN3-NEXT:    v_mov_b32_e32 v1, s3
2497; GCN3-NEXT:    s_waitcnt vmcnt(0)
2498; GCN3-NEXT:    flat_store_dword v[0:1], v2
2499; GCN3-NEXT:    s_endpgm
2500entry:
2501  %gep = getelementptr i32, ptr %out, i32 4
2502  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
2503  store i32 %val, ptr %out2
2504  ret void
2505}
2506
; Test: non-returning i32 `atomicrmw volatile min` (syncscope "workgroup",
; seq_cst) at %out indexed by the i64 %index, plus a further 4 i32 elements.
; All targets are expected to scale the index with s_lshl_b64 by 2 and add it
; to the base. GCN1 and GCN2 then add the constant 16 with a second
; s_add_u32/s_addc_u32 pair, while GCN3 uses `offset:16` on flat_atomic_smin.
2507define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
2508; GCN1-LABEL: atomic_min_i32_addr64_offset:
2509; GCN1:       ; %bb.0: ; %entry
2510; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2511; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
2512; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
2513; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2514; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2515; GCN1-NEXT:    s_add_u32 s0, s2, s0
2516; GCN1-NEXT:    s_addc_u32 s1, s3, s1
2517; GCN1-NEXT:    s_add_u32 s0, s0, 16
2518; GCN1-NEXT:    s_addc_u32 s1, s1, 0
2519; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2520; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2521; GCN1-NEXT:    v_mov_b32_e32 v2, s4
2522; GCN1-NEXT:    flat_atomic_smin v[0:1], v2
2523; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2524; GCN1-NEXT:    s_endpgm
2525;
2526; GCN2-LABEL: atomic_min_i32_addr64_offset:
2527; GCN2:       ; %bb.0: ; %entry
2528; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2529; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
2530; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
2531; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2532; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2533; GCN2-NEXT:    s_add_u32 s0, s2, s0
2534; GCN2-NEXT:    s_addc_u32 s1, s3, s1
2535; GCN2-NEXT:    s_add_u32 s0, s0, 16
2536; GCN2-NEXT:    s_addc_u32 s1, s1, 0
2537; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2538; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2539; GCN2-NEXT:    v_mov_b32_e32 v2, s4
2540; GCN2-NEXT:    flat_atomic_smin v[0:1], v2
2541; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2542; GCN2-NEXT:    s_endpgm
2543;
2544; GCN3-LABEL: atomic_min_i32_addr64_offset:
2545; GCN3:       ; %bb.0: ; %entry
2546; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2547; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
2548; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
2549; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2550; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2551; GCN3-NEXT:    s_add_u32 s0, s2, s0
2552; GCN3-NEXT:    s_addc_u32 s1, s3, s1
2553; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2554; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2555; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2556; GCN3-NEXT:    flat_atomic_smin v[0:1], v2 offset:16
2557; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2558; GCN3-NEXT:    s_endpgm
2559entry:
2560  %ptr = getelementptr i32, ptr %out, i64 %index
2561  %gep = getelementptr i32, ptr %ptr, i32 4
2562  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
2563  ret void
2564}
2565
; Test: returning i32 `atomicrmw volatile min` (syncscope "workgroup", seq_cst)
; at %out indexed by the i64 %index, plus a further 4 i32 elements; the old
; value is stored to %out2.
; All targets are expected to scale the index with s_lshl_b64 by 2 and add it
; to the base. GCN1 and GCN2 then add 16 with a second s_add_u32/s_addc_u32
; pair, while GCN3 uses `offset:16`. flat_atomic_smin carries glc everywhere.
2566define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
2567; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
2568; GCN1:       ; %bb.0: ; %entry
2569; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
2570; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2571; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
2572; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2573; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2574; GCN1-NEXT:    s_add_u32 s0, s0, s4
2575; GCN1-NEXT:    s_addc_u32 s1, s1, s5
2576; GCN1-NEXT:    s_add_u32 s0, s0, 16
2577; GCN1-NEXT:    s_addc_u32 s1, s1, 0
2578; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2579; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2580; GCN1-NEXT:    v_mov_b32_e32 v2, s8
2581; GCN1-NEXT:    flat_atomic_smin v2, v[0:1], v2 glc
2582; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2583; GCN1-NEXT:    v_mov_b32_e32 v0, s2
2584; GCN1-NEXT:    v_mov_b32_e32 v1, s3
2585; GCN1-NEXT:    s_waitcnt vmcnt(0)
2586; GCN1-NEXT:    flat_store_dword v[0:1], v2
2587; GCN1-NEXT:    s_endpgm
2588;
2589; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
2590; GCN2:       ; %bb.0: ; %entry
2591; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2592; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2593; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
2594; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2595; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2596; GCN2-NEXT:    s_add_u32 s0, s0, s4
2597; GCN2-NEXT:    s_addc_u32 s1, s1, s5
2598; GCN2-NEXT:    s_add_u32 s0, s0, 16
2599; GCN2-NEXT:    s_addc_u32 s1, s1, 0
2600; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2601; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2602; GCN2-NEXT:    v_mov_b32_e32 v2, s8
2603; GCN2-NEXT:    flat_atomic_smin v2, v[0:1], v2 glc
2604; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2605; GCN2-NEXT:    v_mov_b32_e32 v0, s2
2606; GCN2-NEXT:    v_mov_b32_e32 v1, s3
2607; GCN2-NEXT:    s_waitcnt vmcnt(0)
2608; GCN2-NEXT:    flat_store_dword v[0:1], v2
2609; GCN2-NEXT:    s_endpgm
2610;
2611; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
2612; GCN3:       ; %bb.0: ; %entry
2613; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2614; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2615; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
2616; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2617; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2618; GCN3-NEXT:    s_add_u32 s0, s0, s4
2619; GCN3-NEXT:    s_addc_u32 s1, s1, s5
2620; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2621; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2622; GCN3-NEXT:    v_mov_b32_e32 v2, s8
2623; GCN3-NEXT:    flat_atomic_smin v2, v[0:1], v2 offset:16 glc
2624; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2625; GCN3-NEXT:    v_mov_b32_e32 v0, s2
2626; GCN3-NEXT:    v_mov_b32_e32 v1, s3
2627; GCN3-NEXT:    s_waitcnt vmcnt(0)
2628; GCN3-NEXT:    flat_store_dword v[0:1], v2
2629; GCN3-NEXT:    s_endpgm
2630entry:
2631  %ptr = getelementptr i32, ptr %out, i64 %index
2632  %gep = getelementptr i32, ptr %ptr, i32 4
2633  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
2634  store i32 %val, ptr %out2
2635  ret void
2636}
2637
; Test: non-returning i32 `atomicrmw volatile min` (syncscope "workgroup",
; seq_cst) directly on %out, with no index and no constant offset.
; All three targets are expected to copy the loaded pointer and operand into
; VGPRs and select flat_atomic_smin with neither an offset nor glc; only the
; kernel-argument load offsets differ between GCN1 and GCN2/GCN3.
2638define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
2639; GCN1-LABEL: atomic_min_i32:
2640; GCN1:       ; %bb.0: ; %entry
2641; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2642; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
2643; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2644; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2645; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2646; GCN1-NEXT:    v_mov_b32_e32 v2, s2
2647; GCN1-NEXT:    flat_atomic_smin v[0:1], v2
2648; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2649; GCN1-NEXT:    s_endpgm
2650;
2651; GCN2-LABEL: atomic_min_i32:
2652; GCN2:       ; %bb.0: ; %entry
2653; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2654; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
2655; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2656; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2657; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2658; GCN2-NEXT:    v_mov_b32_e32 v2, s2
2659; GCN2-NEXT:    flat_atomic_smin v[0:1], v2
2660; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2661; GCN2-NEXT:    s_endpgm
2662;
2663; GCN3-LABEL: atomic_min_i32:
2664; GCN3:       ; %bb.0: ; %entry
2665; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2666; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
2667; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2668; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2669; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2670; GCN3-NEXT:    v_mov_b32_e32 v2, s2
2671; GCN3-NEXT:    flat_atomic_smin v[0:1], v2
2672; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2673; GCN3-NEXT:    s_endpgm
2674entry:
2675  %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
2676  ret void
2677}
2678
; Test: returning i32 `atomicrmw volatile min` (syncscope "workgroup", seq_cst)
; directly on %out; the old value is stored to %out2.
; All three targets are expected to select flat_atomic_smin with glc and no
; offset, then issue s_waitcnt vmcnt(0) before the flat_store_dword of the
; result.
2679define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
2680; GCN1-LABEL: atomic_min_i32_ret:
2681; GCN1:       ; %bb.0: ; %entry
2682; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2683; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
2684; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2685; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2686; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2687; GCN1-NEXT:    v_mov_b32_e32 v2, s4
2688; GCN1-NEXT:    flat_atomic_smin v2, v[0:1], v2 glc
2689; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2690; GCN1-NEXT:    v_mov_b32_e32 v0, s2
2691; GCN1-NEXT:    v_mov_b32_e32 v1, s3
2692; GCN1-NEXT:    s_waitcnt vmcnt(0)
2693; GCN1-NEXT:    flat_store_dword v[0:1], v2
2694; GCN1-NEXT:    s_endpgm
2695;
2696; GCN2-LABEL: atomic_min_i32_ret:
2697; GCN2:       ; %bb.0: ; %entry
2698; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2699; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
2700; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2701; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2702; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2703; GCN2-NEXT:    v_mov_b32_e32 v2, s4
2704; GCN2-NEXT:    flat_atomic_smin v2, v[0:1], v2 glc
2705; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2706; GCN2-NEXT:    v_mov_b32_e32 v0, s2
2707; GCN2-NEXT:    v_mov_b32_e32 v1, s3
2708; GCN2-NEXT:    s_waitcnt vmcnt(0)
2709; GCN2-NEXT:    flat_store_dword v[0:1], v2
2710; GCN2-NEXT:    s_endpgm
2711;
2712; GCN3-LABEL: atomic_min_i32_ret:
2713; GCN3:       ; %bb.0: ; %entry
2714; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2715; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
2716; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2717; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2718; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2719; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2720; GCN3-NEXT:    flat_atomic_smin v2, v[0:1], v2 glc
2721; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2722; GCN3-NEXT:    v_mov_b32_e32 v0, s2
2723; GCN3-NEXT:    v_mov_b32_e32 v1, s3
2724; GCN3-NEXT:    s_waitcnt vmcnt(0)
2725; GCN3-NEXT:    flat_store_dword v[0:1], v2
2726; GCN3-NEXT:    s_endpgm
2727entry:
2728  %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
2729  store i32 %val, ptr %out2
2730  ret void
2731}
2732
; Test: non-returning i32 `atomicrmw volatile min` (syncscope "workgroup",
; seq_cst) at %out indexed by the i64 %index, with no constant offset.
; All three targets are expected to scale the index with s_lshl_b64 by 2, add
; it to the base with s_add_u32/s_addc_u32, and select flat_atomic_smin with
; neither an offset nor glc.
2733define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
2734; GCN1-LABEL: atomic_min_i32_addr64:
2735; GCN1:       ; %bb.0: ; %entry
2736; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2737; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
2738; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
2739; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2740; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2741; GCN1-NEXT:    s_add_u32 s0, s2, s0
2742; GCN1-NEXT:    s_addc_u32 s1, s3, s1
2743; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2744; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2745; GCN1-NEXT:    v_mov_b32_e32 v2, s4
2746; GCN1-NEXT:    flat_atomic_smin v[0:1], v2
2747; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2748; GCN1-NEXT:    s_endpgm
2749;
2750; GCN2-LABEL: atomic_min_i32_addr64:
2751; GCN2:       ; %bb.0: ; %entry
2752; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2753; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
2754; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
2755; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2756; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2757; GCN2-NEXT:    s_add_u32 s0, s2, s0
2758; GCN2-NEXT:    s_addc_u32 s1, s3, s1
2759; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2760; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2761; GCN2-NEXT:    v_mov_b32_e32 v2, s4
2762; GCN2-NEXT:    flat_atomic_smin v[0:1], v2
2763; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2764; GCN2-NEXT:    s_endpgm
2765;
2766; GCN3-LABEL: atomic_min_i32_addr64:
2767; GCN3:       ; %bb.0: ; %entry
2768; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2769; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
2770; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
2771; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2772; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2773; GCN3-NEXT:    s_add_u32 s0, s2, s0
2774; GCN3-NEXT:    s_addc_u32 s1, s3, s1
2775; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2776; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2777; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2778; GCN3-NEXT:    flat_atomic_smin v[0:1], v2
2779; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2780; GCN3-NEXT:    s_endpgm
2781entry:
2782  %ptr = getelementptr i32, ptr %out, i64 %index
2783  %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
2784  ret void
2785}
2786
; Test: returning i32 `atomicrmw volatile min` (syncscope "workgroup", seq_cst)
; at %out indexed by the i64 %index; the old value is stored to %out2.
; All three targets are expected to scale the index with s_lshl_b64 by 2, add
; it to the base with s_add_u32/s_addc_u32, select flat_atomic_smin with glc
; and no offset, and wait on vmcnt(0) before the flat_store_dword.
2787define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
2788; GCN1-LABEL: atomic_min_i32_ret_addr64:
2789; GCN1:       ; %bb.0: ; %entry
2790; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
2791; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2792; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
2793; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2794; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2795; GCN1-NEXT:    s_add_u32 s0, s0, s4
2796; GCN1-NEXT:    s_addc_u32 s1, s1, s5
2797; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2798; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2799; GCN1-NEXT:    v_mov_b32_e32 v2, s8
2800; GCN1-NEXT:    flat_atomic_smin v2, v[0:1], v2 glc
2801; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2802; GCN1-NEXT:    v_mov_b32_e32 v0, s2
2803; GCN1-NEXT:    v_mov_b32_e32 v1, s3
2804; GCN1-NEXT:    s_waitcnt vmcnt(0)
2805; GCN1-NEXT:    flat_store_dword v[0:1], v2
2806; GCN1-NEXT:    s_endpgm
2807;
2808; GCN2-LABEL: atomic_min_i32_ret_addr64:
2809; GCN2:       ; %bb.0: ; %entry
2810; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2811; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2812; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
2813; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2814; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2815; GCN2-NEXT:    s_add_u32 s0, s0, s4
2816; GCN2-NEXT:    s_addc_u32 s1, s1, s5
2817; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2818; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2819; GCN2-NEXT:    v_mov_b32_e32 v2, s8
2820; GCN2-NEXT:    flat_atomic_smin v2, v[0:1], v2 glc
2821; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2822; GCN2-NEXT:    v_mov_b32_e32 v0, s2
2823; GCN2-NEXT:    v_mov_b32_e32 v1, s3
2824; GCN2-NEXT:    s_waitcnt vmcnt(0)
2825; GCN2-NEXT:    flat_store_dword v[0:1], v2
2826; GCN2-NEXT:    s_endpgm
2827;
2828; GCN3-LABEL: atomic_min_i32_ret_addr64:
2829; GCN3:       ; %bb.0: ; %entry
2830; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2831; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2832; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
2833; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2834; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
2835; GCN3-NEXT:    s_add_u32 s0, s0, s4
2836; GCN3-NEXT:    s_addc_u32 s1, s1, s5
2837; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2838; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2839; GCN3-NEXT:    v_mov_b32_e32 v2, s8
2840; GCN3-NEXT:    flat_atomic_smin v2, v[0:1], v2 glc
2841; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2842; GCN3-NEXT:    v_mov_b32_e32 v0, s2
2843; GCN3-NEXT:    v_mov_b32_e32 v1, s3
2844; GCN3-NEXT:    s_waitcnt vmcnt(0)
2845; GCN3-NEXT:    flat_store_dword v[0:1], v2
2846; GCN3-NEXT:    s_endpgm
2847entry:
2848  %ptr = getelementptr i32, ptr %out, i64 %index
2849  %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
2850  store i32 %val, ptr %out2
2851  ret void
2852}
2853
; Test: non-returning i32 `atomicrmw volatile umin` (syncscope "workgroup",
; seq_cst) at %out plus 4 i32 elements.
; GCN1 and GCN2 are expected to add 16 to the address with
; s_add_u32/s_addc_u32, while GCN3 uses `offset:16` on flat_atomic_umin.
; The result is unused and the expected instruction has no glc on any target.
2854define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
2855; GCN1-LABEL: atomic_umin_i32_offset:
2856; GCN1:       ; %bb.0: ; %entry
2857; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2858; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
2859; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2860; GCN1-NEXT:    s_add_u32 s0, s0, 16
2861; GCN1-NEXT:    s_addc_u32 s1, s1, 0
2862; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2863; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2864; GCN1-NEXT:    v_mov_b32_e32 v2, s2
2865; GCN1-NEXT:    flat_atomic_umin v[0:1], v2
2866; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2867; GCN1-NEXT:    s_endpgm
2868;
2869; GCN2-LABEL: atomic_umin_i32_offset:
2870; GCN2:       ; %bb.0: ; %entry
2871; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2872; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
2873; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2874; GCN2-NEXT:    s_add_u32 s0, s0, 16
2875; GCN2-NEXT:    s_addc_u32 s1, s1, 0
2876; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2877; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2878; GCN2-NEXT:    v_mov_b32_e32 v2, s2
2879; GCN2-NEXT:    flat_atomic_umin v[0:1], v2
2880; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2881; GCN2-NEXT:    s_endpgm
2882;
2883; GCN3-LABEL: atomic_umin_i32_offset:
2884; GCN3:       ; %bb.0: ; %entry
2885; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2886; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
2887; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2888; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2889; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2890; GCN3-NEXT:    v_mov_b32_e32 v2, s2
2891; GCN3-NEXT:    flat_atomic_umin v[0:1], v2 offset:16
2892; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2893; GCN3-NEXT:    s_endpgm
2894entry:
2895  %gep = getelementptr i32, ptr %out, i32 4
2896  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
2897  ret void
2898}
2899
; Test: returning i32 `atomicrmw volatile umin` (syncscope "workgroup", seq_cst)
; at %out plus 4 i32 elements; the old value is stored to %out2.
; GCN1 and GCN2 are expected to add 16 to the address with
; s_add_u32/s_addc_u32, while GCN3 uses `offset:16` on the instruction.
; All targets expect flat_atomic_umin with glc, followed by s_waitcnt vmcnt(0)
; before the flat_store_dword of the result.
2900define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
2901; GCN1-LABEL: atomic_umin_i32_ret_offset:
2902; GCN1:       ; %bb.0: ; %entry
2903; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2904; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
2905; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2906; GCN1-NEXT:    s_add_u32 s0, s0, 16
2907; GCN1-NEXT:    s_addc_u32 s1, s1, 0
2908; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2909; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2910; GCN1-NEXT:    v_mov_b32_e32 v2, s4
2911; GCN1-NEXT:    flat_atomic_umin v2, v[0:1], v2 glc
2912; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2913; GCN1-NEXT:    v_mov_b32_e32 v0, s2
2914; GCN1-NEXT:    v_mov_b32_e32 v1, s3
2915; GCN1-NEXT:    s_waitcnt vmcnt(0)
2916; GCN1-NEXT:    flat_store_dword v[0:1], v2
2917; GCN1-NEXT:    s_endpgm
2918;
2919; GCN2-LABEL: atomic_umin_i32_ret_offset:
2920; GCN2:       ; %bb.0: ; %entry
2921; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2922; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
2923; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2924; GCN2-NEXT:    s_add_u32 s0, s0, 16
2925; GCN2-NEXT:    s_addc_u32 s1, s1, 0
2926; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2927; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2928; GCN2-NEXT:    v_mov_b32_e32 v2, s4
2929; GCN2-NEXT:    flat_atomic_umin v2, v[0:1], v2 glc
2930; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2931; GCN2-NEXT:    v_mov_b32_e32 v0, s2
2932; GCN2-NEXT:    v_mov_b32_e32 v1, s3
2933; GCN2-NEXT:    s_waitcnt vmcnt(0)
2934; GCN2-NEXT:    flat_store_dword v[0:1], v2
2935; GCN2-NEXT:    s_endpgm
2936;
2937; GCN3-LABEL: atomic_umin_i32_ret_offset:
2938; GCN3:       ; %bb.0: ; %entry
2939; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2940; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
2941; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2942; GCN3-NEXT:    v_mov_b32_e32 v0, s0
2943; GCN3-NEXT:    v_mov_b32_e32 v1, s1
2944; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2945; GCN3-NEXT:    flat_atomic_umin v2, v[0:1], v2 offset:16 glc
2946; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
2947; GCN3-NEXT:    v_mov_b32_e32 v0, s2
2948; GCN3-NEXT:    v_mov_b32_e32 v1, s3
2949; GCN3-NEXT:    s_waitcnt vmcnt(0)
2950; GCN3-NEXT:    flat_store_dword v[0:1], v2
2951; GCN3-NEXT:    s_endpgm
2952entry:
2953  %gep = getelementptr i32, ptr %out, i32 4
2954  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
2955  store i32 %val, ptr %out2
2956  ret void
2957}
2958
; Test: non-returning i32 `atomicrmw volatile umin` (syncscope "workgroup",
; seq_cst) at %out indexed by the i64 %index, plus a further 4 i32 elements.
; All targets are expected to scale the index with s_lshl_b64 by 2 and add it
; to the base. GCN1 and GCN2 then add the constant 16 with a second
; s_add_u32/s_addc_u32 pair, while GCN3 uses `offset:16` on flat_atomic_umin.
2959define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
2960; GCN1-LABEL: atomic_umin_i32_addr64_offset:
2961; GCN1:       ; %bb.0: ; %entry
2962; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2963; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
2964; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
2965; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2966; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2967; GCN1-NEXT:    s_add_u32 s0, s2, s0
2968; GCN1-NEXT:    s_addc_u32 s1, s3, s1
2969; GCN1-NEXT:    s_add_u32 s0, s0, 16
2970; GCN1-NEXT:    s_addc_u32 s1, s1, 0
2971; GCN1-NEXT:    v_mov_b32_e32 v0, s0
2972; GCN1-NEXT:    v_mov_b32_e32 v1, s1
2973; GCN1-NEXT:    v_mov_b32_e32 v2, s4
2974; GCN1-NEXT:    flat_atomic_umin v[0:1], v2
2975; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
2976; GCN1-NEXT:    s_endpgm
2977;
2978; GCN2-LABEL: atomic_umin_i32_addr64_offset:
2979; GCN2:       ; %bb.0: ; %entry
2980; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2981; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
2982; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
2983; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2984; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
2985; GCN2-NEXT:    s_add_u32 s0, s2, s0
2986; GCN2-NEXT:    s_addc_u32 s1, s3, s1
2987; GCN2-NEXT:    s_add_u32 s0, s0, 16
2988; GCN2-NEXT:    s_addc_u32 s1, s1, 0
2989; GCN2-NEXT:    v_mov_b32_e32 v0, s0
2990; GCN2-NEXT:    v_mov_b32_e32 v1, s1
2991; GCN2-NEXT:    v_mov_b32_e32 v2, s4
2992; GCN2-NEXT:    flat_atomic_umin v[0:1], v2
2993; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
2994; GCN2-NEXT:    s_endpgm
2995;
2996; GCN3-LABEL: atomic_umin_i32_addr64_offset:
2997; GCN3:       ; %bb.0: ; %entry
2998; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2999; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
3000; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
3001; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3002; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3003; GCN3-NEXT:    s_add_u32 s0, s2, s0
3004; GCN3-NEXT:    s_addc_u32 s1, s3, s1
3005; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3006; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3007; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3008; GCN3-NEXT:    flat_atomic_umin v[0:1], v2 offset:16
3009; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3010; GCN3-NEXT:    s_endpgm
3011entry:
3012  %ptr = getelementptr i32, ptr %out, i64 %index
3013  %gep = getelementptr i32, ptr %ptr, i32 4
3014  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
3015  ret void
3016}
3017
; Test: returning i32 `atomicrmw volatile umin` (syncscope "workgroup", seq_cst)
; at %out indexed by the i64 %index, plus a further 4 i32 elements; the old
; value is stored to %out2.
; All targets are expected to scale the index with s_lshl_b64 by 2 and add it
; to the base. GCN1 and GCN2 then add 16 with a second s_add_u32/s_addc_u32
; pair, while GCN3 uses `offset:16`. flat_atomic_umin carries glc everywhere.
3018define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
3019; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
3020; GCN1:       ; %bb.0: ; %entry
3021; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
3022; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3023; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
3024; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3025; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3026; GCN1-NEXT:    s_add_u32 s0, s0, s4
3027; GCN1-NEXT:    s_addc_u32 s1, s1, s5
3028; GCN1-NEXT:    s_add_u32 s0, s0, 16
3029; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3030; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3031; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3032; GCN1-NEXT:    v_mov_b32_e32 v2, s8
3033; GCN1-NEXT:    flat_atomic_umin v2, v[0:1], v2 glc
3034; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3035; GCN1-NEXT:    v_mov_b32_e32 v0, s2
3036; GCN1-NEXT:    v_mov_b32_e32 v1, s3
3037; GCN1-NEXT:    s_waitcnt vmcnt(0)
3038; GCN1-NEXT:    flat_store_dword v[0:1], v2
3039; GCN1-NEXT:    s_endpgm
3040;
3041; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset:
3042; GCN2:       ; %bb.0: ; %entry
3043; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
3044; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3045; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
3046; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3047; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3048; GCN2-NEXT:    s_add_u32 s0, s0, s4
3049; GCN2-NEXT:    s_addc_u32 s1, s1, s5
3050; GCN2-NEXT:    s_add_u32 s0, s0, 16
3051; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3052; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3053; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3054; GCN2-NEXT:    v_mov_b32_e32 v2, s8
3055; GCN2-NEXT:    flat_atomic_umin v2, v[0:1], v2 glc
3056; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3057; GCN2-NEXT:    v_mov_b32_e32 v0, s2
3058; GCN2-NEXT:    v_mov_b32_e32 v1, s3
3059; GCN2-NEXT:    s_waitcnt vmcnt(0)
3060; GCN2-NEXT:    flat_store_dword v[0:1], v2
3061; GCN2-NEXT:    s_endpgm
3062;
3063; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset:
3064; GCN3:       ; %bb.0: ; %entry
3065; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
3066; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3067; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
3068; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3069; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3070; GCN3-NEXT:    s_add_u32 s0, s0, s4
3071; GCN3-NEXT:    s_addc_u32 s1, s1, s5
3072; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3073; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3074; GCN3-NEXT:    v_mov_b32_e32 v2, s8
3075; GCN3-NEXT:    flat_atomic_umin v2, v[0:1], v2 offset:16 glc
3076; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3077; GCN3-NEXT:    v_mov_b32_e32 v0, s2
3078; GCN3-NEXT:    v_mov_b32_e32 v1, s3
3079; GCN3-NEXT:    s_waitcnt vmcnt(0)
3080; GCN3-NEXT:    flat_store_dword v[0:1], v2
3081; GCN3-NEXT:    s_endpgm
3082entry:
3083  %ptr = getelementptr i32, ptr %out, i64 %index
3084  %gep = getelementptr i32, ptr %ptr, i32 4
3085  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
3086  store i32 %val, ptr %out2
3087  ret void
3088}
3089
; Test: non-returning i32 `atomicrmw volatile umin` (syncscope "workgroup",
; seq_cst) directly on %out, with no index and no constant offset.
; All three targets are expected to copy the loaded pointer and operand into
; VGPRs and select flat_atomic_umin with neither an offset nor glc; only the
; kernel-argument load offsets differ between GCN1 and GCN2/GCN3.
3090define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
3091; GCN1-LABEL: atomic_umin_i32:
3092; GCN1:       ; %bb.0: ; %entry
3093; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3094; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
3095; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3096; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3097; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3098; GCN1-NEXT:    v_mov_b32_e32 v2, s2
3099; GCN1-NEXT:    flat_atomic_umin v[0:1], v2
3100; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3101; GCN1-NEXT:    s_endpgm
3102;
3103; GCN2-LABEL: atomic_umin_i32:
3104; GCN2:       ; %bb.0: ; %entry
3105; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3106; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
3107; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3108; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3109; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3110; GCN2-NEXT:    v_mov_b32_e32 v2, s2
3111; GCN2-NEXT:    flat_atomic_umin v[0:1], v2
3112; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3113; GCN2-NEXT:    s_endpgm
3114;
3115; GCN3-LABEL: atomic_umin_i32:
3116; GCN3:       ; %bb.0: ; %entry
3117; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3118; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
3119; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3120; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3121; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3122; GCN3-NEXT:    v_mov_b32_e32 v2, s2
3123; GCN3-NEXT:    flat_atomic_umin v[0:1], v2
3124; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3125; GCN3-NEXT:    s_endpgm
3126entry:
3127  %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
3128  ret void
3129}
3130
; Returning flat atomic unsigned-min directly on %out: the value produced by
; `atomicrmw volatile umin` (syncscope("workgroup"), seq_cst) is stored to %out2.
; The checks expect the destination-register form
; `flat_atomic_umin v2, v[0:1], v2 glc`, then `s_waitcnt lgkmcnt(0)`, and a
; separate `s_waitcnt vmcnt(0)` placed right before the `flat_store_dword` of v2.
; No buffer_wbinvl1_vol is expected on any of the three targets.
3131define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
3132; GCN1-LABEL: atomic_umin_i32_ret:
3133; GCN1:       ; %bb.0: ; %entry
3134; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3135; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
3136; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3137; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3138; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3139; GCN1-NEXT:    v_mov_b32_e32 v2, s4
3140; GCN1-NEXT:    flat_atomic_umin v2, v[0:1], v2 glc
3141; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3142; GCN1-NEXT:    v_mov_b32_e32 v0, s2
3143; GCN1-NEXT:    v_mov_b32_e32 v1, s3
3144; GCN1-NEXT:    s_waitcnt vmcnt(0)
3145; GCN1-NEXT:    flat_store_dword v[0:1], v2
3146; GCN1-NEXT:    s_endpgm
3147;
3148; GCN2-LABEL: atomic_umin_i32_ret:
3149; GCN2:       ; %bb.0: ; %entry
3150; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3151; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
3152; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3153; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3154; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3155; GCN2-NEXT:    v_mov_b32_e32 v2, s4
3156; GCN2-NEXT:    flat_atomic_umin v2, v[0:1], v2 glc
3157; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3158; GCN2-NEXT:    v_mov_b32_e32 v0, s2
3159; GCN2-NEXT:    v_mov_b32_e32 v1, s3
3160; GCN2-NEXT:    s_waitcnt vmcnt(0)
3161; GCN2-NEXT:    flat_store_dword v[0:1], v2
3162; GCN2-NEXT:    s_endpgm
3163;
3164; GCN3-LABEL: atomic_umin_i32_ret:
3165; GCN3:       ; %bb.0: ; %entry
3166; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3167; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
3168; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3169; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3170; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3171; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3172; GCN3-NEXT:    flat_atomic_umin v2, v[0:1], v2 glc
3173; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3174; GCN3-NEXT:    v_mov_b32_e32 v0, s2
3175; GCN3-NEXT:    v_mov_b32_e32 v1, s3
3176; GCN3-NEXT:    s_waitcnt vmcnt(0)
3177; GCN3-NEXT:    flat_store_dword v[0:1], v2
3178; GCN3-NEXT:    s_endpgm
3179entry:
3180  %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
3181  store i32 %val, ptr %out2
3182  ret void
3183}
3184
; Non-returning flat atomic unsigned-min through a pointer computed as
; %out indexed by the 64-bit %index (i32 elements); %val has no uses.
; On every target the checks expect the index to be scaled with
; `s_lshl_b64 ..., 2` and added to the base with s_add_u32/s_addc_u32 before
; `flat_atomic_umin v[0:1], v2`. The atomic is syncscope("workgroup"), seq_cst,
; and only `s_waitcnt lgkmcnt(0)` is expected after it.
3185define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
3186; GCN1-LABEL: atomic_umin_i32_addr64:
3187; GCN1:       ; %bb.0: ; %entry
3188; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
3189; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
3190; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
3191; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3192; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3193; GCN1-NEXT:    s_add_u32 s0, s2, s0
3194; GCN1-NEXT:    s_addc_u32 s1, s3, s1
3195; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3196; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3197; GCN1-NEXT:    v_mov_b32_e32 v2, s4
3198; GCN1-NEXT:    flat_atomic_umin v[0:1], v2
3199; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3200; GCN1-NEXT:    s_endpgm
3201;
3202; GCN2-LABEL: atomic_umin_i32_addr64:
3203; GCN2:       ; %bb.0: ; %entry
3204; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
3205; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
3206; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
3207; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3208; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3209; GCN2-NEXT:    s_add_u32 s0, s2, s0
3210; GCN2-NEXT:    s_addc_u32 s1, s3, s1
3211; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3212; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3213; GCN2-NEXT:    v_mov_b32_e32 v2, s4
3214; GCN2-NEXT:    flat_atomic_umin v[0:1], v2
3215; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3216; GCN2-NEXT:    s_endpgm
3217;
3218; GCN3-LABEL: atomic_umin_i32_addr64:
3219; GCN3:       ; %bb.0: ; %entry
3220; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
3221; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
3222; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
3223; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3224; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3225; GCN3-NEXT:    s_add_u32 s0, s2, s0
3226; GCN3-NEXT:    s_addc_u32 s1, s3, s1
3227; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3228; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3229; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3230; GCN3-NEXT:    flat_atomic_umin v[0:1], v2
3231; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3232; GCN3-NEXT:    s_endpgm
3233entry:
3234  %ptr = getelementptr i32, ptr %out, i64 %index
3235  %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
3236  ret void
3237}
3238
; Returning flat atomic unsigned-min through a pointer computed as %out indexed
; by the 64-bit %index (i32 elements); the atomicrmw result is stored to %out2.
; The checks expect the index to be scaled with `s_lshl_b64 ..., 2` and added to
; the base with s_add_u32/s_addc_u32, then
; `flat_atomic_umin v2, v[0:1], v2 glc`. The atomic is syncscope("workgroup"),
; seq_cst: `s_waitcnt lgkmcnt(0)` follows it, and `s_waitcnt vmcnt(0)` is
; expected right before the `flat_store_dword` of the result.
3239define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
3240; GCN1-LABEL: atomic_umin_i32_ret_addr64:
3241; GCN1:       ; %bb.0: ; %entry
3242; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
3243; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3244; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
3245; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3246; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3247; GCN1-NEXT:    s_add_u32 s0, s0, s4
3248; GCN1-NEXT:    s_addc_u32 s1, s1, s5
3249; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3250; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3251; GCN1-NEXT:    v_mov_b32_e32 v2, s8
3252; GCN1-NEXT:    flat_atomic_umin v2, v[0:1], v2 glc
3253; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3254; GCN1-NEXT:    v_mov_b32_e32 v0, s2
3255; GCN1-NEXT:    v_mov_b32_e32 v1, s3
3256; GCN1-NEXT:    s_waitcnt vmcnt(0)
3257; GCN1-NEXT:    flat_store_dword v[0:1], v2
3258; GCN1-NEXT:    s_endpgm
3259;
3260; GCN2-LABEL: atomic_umin_i32_ret_addr64:
3261; GCN2:       ; %bb.0: ; %entry
3262; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
3263; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3264; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
3265; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3266; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3267; GCN2-NEXT:    s_add_u32 s0, s0, s4
3268; GCN2-NEXT:    s_addc_u32 s1, s1, s5
3269; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3270; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3271; GCN2-NEXT:    v_mov_b32_e32 v2, s8
3272; GCN2-NEXT:    flat_atomic_umin v2, v[0:1], v2 glc
3273; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3274; GCN2-NEXT:    v_mov_b32_e32 v0, s2
3275; GCN2-NEXT:    v_mov_b32_e32 v1, s3
3276; GCN2-NEXT:    s_waitcnt vmcnt(0)
3277; GCN2-NEXT:    flat_store_dword v[0:1], v2
3278; GCN2-NEXT:    s_endpgm
3279;
3280; GCN3-LABEL: atomic_umin_i32_ret_addr64:
3281; GCN3:       ; %bb.0: ; %entry
3282; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
3283; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3284; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
3285; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3286; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3287; GCN3-NEXT:    s_add_u32 s0, s0, s4
3288; GCN3-NEXT:    s_addc_u32 s1, s1, s5
3289; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3290; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3291; GCN3-NEXT:    v_mov_b32_e32 v2, s8
3292; GCN3-NEXT:    flat_atomic_umin v2, v[0:1], v2 glc
3293; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3294; GCN3-NEXT:    v_mov_b32_e32 v0, s2
3295; GCN3-NEXT:    v_mov_b32_e32 v1, s3
3296; GCN3-NEXT:    s_waitcnt vmcnt(0)
3297; GCN3-NEXT:    flat_store_dword v[0:1], v2
3298; GCN3-NEXT:    s_endpgm
3299entry:
3300  %ptr = getelementptr i32, ptr %out, i64 %index
3301  %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
3302  store i32 %val, ptr %out2
3303  ret void
3304}
3305
; Non-returning flat atomic OR at %out plus 4 i32 elements (16 bytes), using
; `atomicrmw volatile or` at syncscope("agent"), seq_cst; %val has no uses.
; bonaire and tonga are expected to materialize the +16 with
; s_add_u32/s_addc_u32, while gfx900 is expected to encode it as `offset:16`
; on flat_atomic_or. Every target then expects `s_waitcnt vmcnt(0) lgkmcnt(0)`
; followed by `buffer_wbinvl1_vol`.
3306define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) {
3307; GCN1-LABEL: atomic_or_i32_offset:
3308; GCN1:       ; %bb.0: ; %entry
3309; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3310; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
3311; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3312; GCN1-NEXT:    s_add_u32 s0, s0, 16
3313; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3314; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3315; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3316; GCN1-NEXT:    v_mov_b32_e32 v2, s2
3317; GCN1-NEXT:    flat_atomic_or v[0:1], v2
3318; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3319; GCN1-NEXT:    buffer_wbinvl1_vol
3320; GCN1-NEXT:    s_endpgm
3321;
3322; GCN2-LABEL: atomic_or_i32_offset:
3323; GCN2:       ; %bb.0: ; %entry
3324; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3325; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
3326; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3327; GCN2-NEXT:    s_add_u32 s0, s0, 16
3328; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3329; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3330; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3331; GCN2-NEXT:    v_mov_b32_e32 v2, s2
3332; GCN2-NEXT:    flat_atomic_or v[0:1], v2
3333; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3334; GCN2-NEXT:    buffer_wbinvl1_vol
3335; GCN2-NEXT:    s_endpgm
3336;
3337; GCN3-LABEL: atomic_or_i32_offset:
3338; GCN3:       ; %bb.0: ; %entry
3339; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3340; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
3341; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3342; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3343; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3344; GCN3-NEXT:    v_mov_b32_e32 v2, s2
3345; GCN3-NEXT:    flat_atomic_or v[0:1], v2 offset:16
3346; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3347; GCN3-NEXT:    buffer_wbinvl1_vol
3348; GCN3-NEXT:    s_endpgm
3349entry:
3350  %gep = getelementptr i32, ptr %out, i32 4
3351  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
3352  ret void
3353}
3354
; Returning flat atomic OR at %out plus 4 i32 elements (16 bytes); the
; atomicrmw result (syncscope("agent"), seq_cst) is stored to %out2.
; bonaire and tonga are expected to add 16 with s_add_u32/s_addc_u32 and emit
; `flat_atomic_or v2, v[0:1], v2 glc`; gfx900 is expected to use
; `offset:16 glc` instead. A combined `s_waitcnt vmcnt(0) lgkmcnt(0)` and
; `buffer_wbinvl1_vol` are expected before the `flat_store_dword` of v2.
3355define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
3356; GCN1-LABEL: atomic_or_i32_ret_offset:
3357; GCN1:       ; %bb.0: ; %entry
3358; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3359; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
3360; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3361; GCN1-NEXT:    s_add_u32 s0, s0, 16
3362; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3363; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3364; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3365; GCN1-NEXT:    v_mov_b32_e32 v2, s4
3366; GCN1-NEXT:    flat_atomic_or v2, v[0:1], v2 glc
3367; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3368; GCN1-NEXT:    buffer_wbinvl1_vol
3369; GCN1-NEXT:    v_mov_b32_e32 v0, s2
3370; GCN1-NEXT:    v_mov_b32_e32 v1, s3
3371; GCN1-NEXT:    flat_store_dword v[0:1], v2
3372; GCN1-NEXT:    s_endpgm
3373;
3374; GCN2-LABEL: atomic_or_i32_ret_offset:
3375; GCN2:       ; %bb.0: ; %entry
3376; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3377; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
3378; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3379; GCN2-NEXT:    s_add_u32 s0, s0, 16
3380; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3381; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3382; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3383; GCN2-NEXT:    v_mov_b32_e32 v2, s4
3384; GCN2-NEXT:    flat_atomic_or v2, v[0:1], v2 glc
3385; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3386; GCN2-NEXT:    buffer_wbinvl1_vol
3387; GCN2-NEXT:    v_mov_b32_e32 v0, s2
3388; GCN2-NEXT:    v_mov_b32_e32 v1, s3
3389; GCN2-NEXT:    flat_store_dword v[0:1], v2
3390; GCN2-NEXT:    s_endpgm
3391;
3392; GCN3-LABEL: atomic_or_i32_ret_offset:
3393; GCN3:       ; %bb.0: ; %entry
3394; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3395; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
3396; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3397; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3398; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3399; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3400; GCN3-NEXT:    flat_atomic_or v2, v[0:1], v2 offset:16 glc
3401; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3402; GCN3-NEXT:    buffer_wbinvl1_vol
3403; GCN3-NEXT:    v_mov_b32_e32 v0, s2
3404; GCN3-NEXT:    v_mov_b32_e32 v1, s3
3405; GCN3-NEXT:    flat_store_dword v[0:1], v2
3406; GCN3-NEXT:    s_endpgm
3407entry:
3408  %gep = getelementptr i32, ptr %out, i32 4
3409  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
3410  store i32 %val, ptr %out2
3411  ret void
3412}
3413
; Non-returning flat atomic OR through %out indexed by the 64-bit %index
; (i32 elements) plus a further 4 i32 elements (16 bytes); %val has no uses.
; All targets are expected to scale the index with `s_lshl_b64 ..., 2` and add
; it to the base. bonaire and tonga then add the constant 16 with a second
; s_add_u32/s_addc_u32 pair, whereas gfx900 is expected to use `offset:16`.
; The atomic is syncscope("agent"), seq_cst: `s_waitcnt vmcnt(0) lgkmcnt(0)`
; and `buffer_wbinvl1_vol` are expected after it.
3414define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
3415; GCN1-LABEL: atomic_or_i32_addr64_offset:
3416; GCN1:       ; %bb.0: ; %entry
3417; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
3418; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
3419; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
3420; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3421; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3422; GCN1-NEXT:    s_add_u32 s0, s2, s0
3423; GCN1-NEXT:    s_addc_u32 s1, s3, s1
3424; GCN1-NEXT:    s_add_u32 s0, s0, 16
3425; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3426; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3427; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3428; GCN1-NEXT:    v_mov_b32_e32 v2, s4
3429; GCN1-NEXT:    flat_atomic_or v[0:1], v2
3430; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3431; GCN1-NEXT:    buffer_wbinvl1_vol
3432; GCN1-NEXT:    s_endpgm
3433;
3434; GCN2-LABEL: atomic_or_i32_addr64_offset:
3435; GCN2:       ; %bb.0: ; %entry
3436; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
3437; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
3438; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
3439; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3440; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3441; GCN2-NEXT:    s_add_u32 s0, s2, s0
3442; GCN2-NEXT:    s_addc_u32 s1, s3, s1
3443; GCN2-NEXT:    s_add_u32 s0, s0, 16
3444; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3445; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3446; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3447; GCN2-NEXT:    v_mov_b32_e32 v2, s4
3448; GCN2-NEXT:    flat_atomic_or v[0:1], v2
3449; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3450; GCN2-NEXT:    buffer_wbinvl1_vol
3451; GCN2-NEXT:    s_endpgm
3452;
3453; GCN3-LABEL: atomic_or_i32_addr64_offset:
3454; GCN3:       ; %bb.0: ; %entry
3455; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
3456; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
3457; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
3458; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3459; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3460; GCN3-NEXT:    s_add_u32 s0, s2, s0
3461; GCN3-NEXT:    s_addc_u32 s1, s3, s1
3462; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3463; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3464; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3465; GCN3-NEXT:    flat_atomic_or v[0:1], v2 offset:16
3466; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3467; GCN3-NEXT:    buffer_wbinvl1_vol
3468; GCN3-NEXT:    s_endpgm
3469entry:
3470  %ptr = getelementptr i32, ptr %out, i64 %index
3471  %gep = getelementptr i32, ptr %ptr, i32 4
3472  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
3473  ret void
3474}
3475
; Returning flat atomic OR through %out indexed by the 64-bit %index (i32
; elements) plus a further 4 i32 elements (16 bytes); the atomicrmw result is
; stored to %out2.
; All targets are expected to scale the index with `s_lshl_b64 ..., 2` and add
; it to the base. bonaire and tonga add the constant 16 with a second
; s_add_u32/s_addc_u32 pair; gfx900 is expected to use `offset:16 glc`.
; The atomic is syncscope("agent"), seq_cst: `s_waitcnt vmcnt(0) lgkmcnt(0)`
; and `buffer_wbinvl1_vol` are expected before the `flat_store_dword`.
3476define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
3477; GCN1-LABEL: atomic_or_i32_ret_addr64_offset:
3478; GCN1:       ; %bb.0: ; %entry
3479; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
3480; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3481; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
3482; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3483; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3484; GCN1-NEXT:    s_add_u32 s0, s0, s4
3485; GCN1-NEXT:    s_addc_u32 s1, s1, s5
3486; GCN1-NEXT:    s_add_u32 s0, s0, 16
3487; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3488; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3489; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3490; GCN1-NEXT:    v_mov_b32_e32 v2, s8
3491; GCN1-NEXT:    flat_atomic_or v2, v[0:1], v2 glc
3492; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3493; GCN1-NEXT:    buffer_wbinvl1_vol
3494; GCN1-NEXT:    v_mov_b32_e32 v0, s2
3495; GCN1-NEXT:    v_mov_b32_e32 v1, s3
3496; GCN1-NEXT:    flat_store_dword v[0:1], v2
3497; GCN1-NEXT:    s_endpgm
3498;
3499; GCN2-LABEL: atomic_or_i32_ret_addr64_offset:
3500; GCN2:       ; %bb.0: ; %entry
3501; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
3502; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3503; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
3504; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3505; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3506; GCN2-NEXT:    s_add_u32 s0, s0, s4
3507; GCN2-NEXT:    s_addc_u32 s1, s1, s5
3508; GCN2-NEXT:    s_add_u32 s0, s0, 16
3509; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3510; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3511; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3512; GCN2-NEXT:    v_mov_b32_e32 v2, s8
3513; GCN2-NEXT:    flat_atomic_or v2, v[0:1], v2 glc
3514; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3515; GCN2-NEXT:    buffer_wbinvl1_vol
3516; GCN2-NEXT:    v_mov_b32_e32 v0, s2
3517; GCN2-NEXT:    v_mov_b32_e32 v1, s3
3518; GCN2-NEXT:    flat_store_dword v[0:1], v2
3519; GCN2-NEXT:    s_endpgm
3520;
3521; GCN3-LABEL: atomic_or_i32_ret_addr64_offset:
3522; GCN3:       ; %bb.0: ; %entry
3523; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
3524; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3525; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
3526; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3527; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3528; GCN3-NEXT:    s_add_u32 s0, s0, s4
3529; GCN3-NEXT:    s_addc_u32 s1, s1, s5
3530; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3531; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3532; GCN3-NEXT:    v_mov_b32_e32 v2, s8
3533; GCN3-NEXT:    flat_atomic_or v2, v[0:1], v2 offset:16 glc
3534; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3535; GCN3-NEXT:    buffer_wbinvl1_vol
3536; GCN3-NEXT:    v_mov_b32_e32 v0, s2
3537; GCN3-NEXT:    v_mov_b32_e32 v1, s3
3538; GCN3-NEXT:    flat_store_dword v[0:1], v2
3539; GCN3-NEXT:    s_endpgm
3540entry:
3541  %ptr = getelementptr i32, ptr %out, i64 %index
3542  %gep = getelementptr i32, ptr %ptr, i32 4
3543  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
3544  store i32 %val, ptr %out2
3545  ret void
3546}
3547
; Non-returning flat atomic OR directly on %out (no address offset), using
; `atomicrmw volatile or` at syncscope("agent"), seq_cst; %val has no uses.
; All three targets are expected to emit `flat_atomic_or v[0:1], v2` without
; glc, followed by `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol`.
3548define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) {
3549; GCN1-LABEL: atomic_or_i32:
3550; GCN1:       ; %bb.0: ; %entry
3551; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3552; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
3553; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3554; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3555; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3556; GCN1-NEXT:    v_mov_b32_e32 v2, s2
3557; GCN1-NEXT:    flat_atomic_or v[0:1], v2
3558; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3559; GCN1-NEXT:    buffer_wbinvl1_vol
3560; GCN1-NEXT:    s_endpgm
3561;
3562; GCN2-LABEL: atomic_or_i32:
3563; GCN2:       ; %bb.0: ; %entry
3564; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3565; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
3566; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3567; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3568; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3569; GCN2-NEXT:    v_mov_b32_e32 v2, s2
3570; GCN2-NEXT:    flat_atomic_or v[0:1], v2
3571; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3572; GCN2-NEXT:    buffer_wbinvl1_vol
3573; GCN2-NEXT:    s_endpgm
3574;
3575; GCN3-LABEL: atomic_or_i32:
3576; GCN3:       ; %bb.0: ; %entry
3577; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3578; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
3579; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3580; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3581; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3582; GCN3-NEXT:    v_mov_b32_e32 v2, s2
3583; GCN3-NEXT:    flat_atomic_or v[0:1], v2
3584; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3585; GCN3-NEXT:    buffer_wbinvl1_vol
3586; GCN3-NEXT:    s_endpgm
3587entry:
3588  %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
3589  ret void
3590}
3591
; Returning flat atomic OR directly on %out: the atomicrmw result
; (syncscope("agent"), seq_cst) is stored to %out2.
; All three targets are expected to emit `flat_atomic_or v2, v[0:1], v2 glc`,
; then `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol`, and finally a
; `flat_store_dword` of v2 with no further wait in between.
3592define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) {
3593; GCN1-LABEL: atomic_or_i32_ret:
3594; GCN1:       ; %bb.0: ; %entry
3595; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3596; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
3597; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3598; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3599; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3600; GCN1-NEXT:    v_mov_b32_e32 v2, s4
3601; GCN1-NEXT:    flat_atomic_or v2, v[0:1], v2 glc
3602; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3603; GCN1-NEXT:    buffer_wbinvl1_vol
3604; GCN1-NEXT:    v_mov_b32_e32 v0, s2
3605; GCN1-NEXT:    v_mov_b32_e32 v1, s3
3606; GCN1-NEXT:    flat_store_dword v[0:1], v2
3607; GCN1-NEXT:    s_endpgm
3608;
3609; GCN2-LABEL: atomic_or_i32_ret:
3610; GCN2:       ; %bb.0: ; %entry
3611; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3612; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
3613; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3614; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3615; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3616; GCN2-NEXT:    v_mov_b32_e32 v2, s4
3617; GCN2-NEXT:    flat_atomic_or v2, v[0:1], v2 glc
3618; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3619; GCN2-NEXT:    buffer_wbinvl1_vol
3620; GCN2-NEXT:    v_mov_b32_e32 v0, s2
3621; GCN2-NEXT:    v_mov_b32_e32 v1, s3
3622; GCN2-NEXT:    flat_store_dword v[0:1], v2
3623; GCN2-NEXT:    s_endpgm
3624;
3625; GCN3-LABEL: atomic_or_i32_ret:
3626; GCN3:       ; %bb.0: ; %entry
3627; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3628; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
3629; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3630; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3631; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3632; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3633; GCN3-NEXT:    flat_atomic_or v2, v[0:1], v2 glc
3634; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3635; GCN3-NEXT:    buffer_wbinvl1_vol
3636; GCN3-NEXT:    v_mov_b32_e32 v0, s2
3637; GCN3-NEXT:    v_mov_b32_e32 v1, s3
3638; GCN3-NEXT:    flat_store_dword v[0:1], v2
3639; GCN3-NEXT:    s_endpgm
3640entry:
3641  %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
3642  store i32 %val, ptr %out2
3643  ret void
3644}
3645
; Non-returning flat atomic OR through a pointer computed as %out indexed by
; the 64-bit %index (i32 elements), with no constant offset; %val has no uses.
; On every target the checks expect the index to be scaled with
; `s_lshl_b64 ..., 2` and added to the base with s_add_u32/s_addc_u32 before
; `flat_atomic_or v[0:1], v2`. The atomic is syncscope("agent"), seq_cst:
; `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol` are expected after it.
3646define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
3647; GCN1-LABEL: atomic_or_i32_addr64:
3648; GCN1:       ; %bb.0: ; %entry
3649; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
3650; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
3651; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
3652; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3653; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3654; GCN1-NEXT:    s_add_u32 s0, s2, s0
3655; GCN1-NEXT:    s_addc_u32 s1, s3, s1
3656; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3657; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3658; GCN1-NEXT:    v_mov_b32_e32 v2, s4
3659; GCN1-NEXT:    flat_atomic_or v[0:1], v2
3660; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3661; GCN1-NEXT:    buffer_wbinvl1_vol
3662; GCN1-NEXT:    s_endpgm
3663;
3664; GCN2-LABEL: atomic_or_i32_addr64:
3665; GCN2:       ; %bb.0: ; %entry
3666; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
3667; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
3668; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
3669; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3670; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3671; GCN2-NEXT:    s_add_u32 s0, s2, s0
3672; GCN2-NEXT:    s_addc_u32 s1, s3, s1
3673; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3674; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3675; GCN2-NEXT:    v_mov_b32_e32 v2, s4
3676; GCN2-NEXT:    flat_atomic_or v[0:1], v2
3677; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3678; GCN2-NEXT:    buffer_wbinvl1_vol
3679; GCN2-NEXT:    s_endpgm
3680;
3681; GCN3-LABEL: atomic_or_i32_addr64:
3682; GCN3:       ; %bb.0: ; %entry
3683; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
3684; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
3685; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
3686; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3687; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3688; GCN3-NEXT:    s_add_u32 s0, s2, s0
3689; GCN3-NEXT:    s_addc_u32 s1, s3, s1
3690; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3691; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3692; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3693; GCN3-NEXT:    flat_atomic_or v[0:1], v2
3694; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3695; GCN3-NEXT:    buffer_wbinvl1_vol
3696; GCN3-NEXT:    s_endpgm
3697entry:
3698  %ptr = getelementptr i32, ptr %out, i64 %index
3699  %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
3700  ret void
3701}
3702
; Returning flat atomic OR through a pointer computed as %out indexed by the
; 64-bit %index (i32 elements), with no constant offset; the atomicrmw result
; is stored to %out2.
; On every target the checks expect the index to be scaled with
; `s_lshl_b64 ..., 2` and added to the base, then
; `flat_atomic_or v2, v[0:1], v2 glc`. The atomic is syncscope("agent"),
; seq_cst: `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol` are
; expected before the `flat_store_dword` of v2.
3703define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
3704; GCN1-LABEL: atomic_or_i32_ret_addr64:
3705; GCN1:       ; %bb.0: ; %entry
3706; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
3707; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3708; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
3709; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3710; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3711; GCN1-NEXT:    s_add_u32 s0, s0, s4
3712; GCN1-NEXT:    s_addc_u32 s1, s1, s5
3713; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3714; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3715; GCN1-NEXT:    v_mov_b32_e32 v2, s8
3716; GCN1-NEXT:    flat_atomic_or v2, v[0:1], v2 glc
3717; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3718; GCN1-NEXT:    buffer_wbinvl1_vol
3719; GCN1-NEXT:    v_mov_b32_e32 v0, s2
3720; GCN1-NEXT:    v_mov_b32_e32 v1, s3
3721; GCN1-NEXT:    flat_store_dword v[0:1], v2
3722; GCN1-NEXT:    s_endpgm
3723;
3724; GCN2-LABEL: atomic_or_i32_ret_addr64:
3725; GCN2:       ; %bb.0: ; %entry
3726; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
3727; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3728; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
3729; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3730; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3731; GCN2-NEXT:    s_add_u32 s0, s0, s4
3732; GCN2-NEXT:    s_addc_u32 s1, s1, s5
3733; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3734; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3735; GCN2-NEXT:    v_mov_b32_e32 v2, s8
3736; GCN2-NEXT:    flat_atomic_or v2, v[0:1], v2 glc
3737; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3738; GCN2-NEXT:    buffer_wbinvl1_vol
3739; GCN2-NEXT:    v_mov_b32_e32 v0, s2
3740; GCN2-NEXT:    v_mov_b32_e32 v1, s3
3741; GCN2-NEXT:    flat_store_dword v[0:1], v2
3742; GCN2-NEXT:    s_endpgm
3743;
3744; GCN3-LABEL: atomic_or_i32_ret_addr64:
3745; GCN3:       ; %bb.0: ; %entry
3746; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
3747; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3748; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
3749; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3750; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3751; GCN3-NEXT:    s_add_u32 s0, s0, s4
3752; GCN3-NEXT:    s_addc_u32 s1, s1, s5
3753; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3754; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3755; GCN3-NEXT:    v_mov_b32_e32 v2, s8
3756; GCN3-NEXT:    flat_atomic_or v2, v[0:1], v2 glc
3757; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3758; GCN3-NEXT:    buffer_wbinvl1_vol
3759; GCN3-NEXT:    v_mov_b32_e32 v0, s2
3760; GCN3-NEXT:    v_mov_b32_e32 v1, s3
3761; GCN3-NEXT:    flat_store_dword v[0:1], v2
3762; GCN3-NEXT:    s_endpgm
3763entry:
3764  %ptr = getelementptr i32, ptr %out, i64 %index
3765  %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
3766  store i32 %val, ptr %out2
3767  ret void
3768}
3769
; Non-returning flat atomic exchange of an i32 at %out plus 4 i32 elements
; (16 bytes), using `atomicrmw volatile xchg` at syncscope("agent"), seq_cst;
; %val has no uses. The checks expect the xchg to appear as `flat_atomic_swap`.
; bonaire and tonga are expected to materialize the +16 with
; s_add_u32/s_addc_u32, while gfx900 is expected to encode it as `offset:16`.
; Every target then expects `s_waitcnt vmcnt(0) lgkmcnt(0)` followed by
; `buffer_wbinvl1_vol`.
3770define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) {
3771; GCN1-LABEL: atomic_xchg_i32_offset:
3772; GCN1:       ; %bb.0: ; %entry
3773; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3774; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
3775; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3776; GCN1-NEXT:    s_add_u32 s0, s0, 16
3777; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3778; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3779; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3780; GCN1-NEXT:    v_mov_b32_e32 v2, s2
3781; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
3782; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3783; GCN1-NEXT:    buffer_wbinvl1_vol
3784; GCN1-NEXT:    s_endpgm
3785;
3786; GCN2-LABEL: atomic_xchg_i32_offset:
3787; GCN2:       ; %bb.0: ; %entry
3788; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3789; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
3790; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3791; GCN2-NEXT:    s_add_u32 s0, s0, 16
3792; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3793; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3794; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3795; GCN2-NEXT:    v_mov_b32_e32 v2, s2
3796; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
3797; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3798; GCN2-NEXT:    buffer_wbinvl1_vol
3799; GCN2-NEXT:    s_endpgm
3800;
3801; GCN3-LABEL: atomic_xchg_i32_offset:
3802; GCN3:       ; %bb.0: ; %entry
3803; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3804; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
3805; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3806; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3807; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3808; GCN3-NEXT:    v_mov_b32_e32 v2, s2
3809; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
3810; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3811; GCN3-NEXT:    buffer_wbinvl1_vol
3812; GCN3-NEXT:    s_endpgm
3813entry:
3814  %gep = getelementptr i32, ptr %out, i32 4
3815  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
3816  ret void
3817}
3818
; Non-returning flat atomic exchange of a float at %out plus 4 float elements
; (16 bytes), using `atomicrmw volatile xchg` with a float operand at
; syncscope("agent"), seq_cst; %val has no uses.
; The expected instruction sequence on each target is the same as for the i32
; variant (@atomic_xchg_i32_offset): `flat_atomic_swap`, with the +16 added by
; s_add_u32/s_addc_u32 on bonaire and tonga and encoded as `offset:16` on
; gfx900, followed by `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol`.
3819define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) {
3820; GCN1-LABEL: atomic_xchg_f32_offset:
3821; GCN1:       ; %bb.0: ; %entry
3822; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3823; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
3824; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3825; GCN1-NEXT:    s_add_u32 s0, s0, 16
3826; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3827; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3828; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3829; GCN1-NEXT:    v_mov_b32_e32 v2, s2
3830; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
3831; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3832; GCN1-NEXT:    buffer_wbinvl1_vol
3833; GCN1-NEXT:    s_endpgm
3834;
3835; GCN2-LABEL: atomic_xchg_f32_offset:
3836; GCN2:       ; %bb.0: ; %entry
3837; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3838; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
3839; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3840; GCN2-NEXT:    s_add_u32 s0, s0, 16
3841; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3842; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3843; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3844; GCN2-NEXT:    v_mov_b32_e32 v2, s2
3845; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
3846; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3847; GCN2-NEXT:    buffer_wbinvl1_vol
3848; GCN2-NEXT:    s_endpgm
3849;
3850; GCN3-LABEL: atomic_xchg_f32_offset:
3851; GCN3:       ; %bb.0: ; %entry
3852; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3853; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
3854; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3855; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3856; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3857; GCN3-NEXT:    v_mov_b32_e32 v2, s2
3858; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
3859; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3860; GCN3-NEXT:    buffer_wbinvl1_vol
3861; GCN3-NEXT:    s_endpgm
3862entry:
3863  %gep = getelementptr float, ptr %out, i32 4
3864  %val = atomicrmw volatile xchg ptr %gep, float %in syncscope("agent") seq_cst
3865  ret void
3866}
3867
; Returning flat atomic exchange of an i32 at %out plus 4 i32 elements
; (16 bytes); the atomicrmw result (syncscope("agent"), seq_cst) is stored to
; %out2.
; bonaire and tonga are expected to add 16 with s_add_u32/s_addc_u32 and emit
; `flat_atomic_swap v2, v[0:1], v2 glc`; gfx900 is expected to use
; `offset:16 glc` instead. A combined `s_waitcnt vmcnt(0) lgkmcnt(0)` and
; `buffer_wbinvl1_vol` are expected before the `flat_store_dword` of v2.
3868define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
3869; GCN1-LABEL: atomic_xchg_i32_ret_offset:
3870; GCN1:       ; %bb.0: ; %entry
3871; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3872; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
3873; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3874; GCN1-NEXT:    s_add_u32 s0, s0, 16
3875; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3876; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3877; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3878; GCN1-NEXT:    v_mov_b32_e32 v2, s4
3879; GCN1-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3880; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3881; GCN1-NEXT:    buffer_wbinvl1_vol
3882; GCN1-NEXT:    v_mov_b32_e32 v0, s2
3883; GCN1-NEXT:    v_mov_b32_e32 v1, s3
3884; GCN1-NEXT:    flat_store_dword v[0:1], v2
3885; GCN1-NEXT:    s_endpgm
3886;
3887; GCN2-LABEL: atomic_xchg_i32_ret_offset:
3888; GCN2:       ; %bb.0: ; %entry
3889; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3890; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
3891; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3892; GCN2-NEXT:    s_add_u32 s0, s0, 16
3893; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3894; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3895; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3896; GCN2-NEXT:    v_mov_b32_e32 v2, s4
3897; GCN2-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3898; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3899; GCN2-NEXT:    buffer_wbinvl1_vol
3900; GCN2-NEXT:    v_mov_b32_e32 v0, s2
3901; GCN2-NEXT:    v_mov_b32_e32 v1, s3
3902; GCN2-NEXT:    flat_store_dword v[0:1], v2
3903; GCN2-NEXT:    s_endpgm
3904;
3905; GCN3-LABEL: atomic_xchg_i32_ret_offset:
3906; GCN3:       ; %bb.0: ; %entry
3907; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3908; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
3909; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3910; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3911; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3912; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3913; GCN3-NEXT:    flat_atomic_swap v2, v[0:1], v2 offset:16 glc
3914; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3915; GCN3-NEXT:    buffer_wbinvl1_vol
3916; GCN3-NEXT:    v_mov_b32_e32 v0, s2
3917; GCN3-NEXT:    v_mov_b32_e32 v1, s3
3918; GCN3-NEXT:    flat_store_dword v[0:1], v2
3919; GCN3-NEXT:    s_endpgm
3920entry:
3921  %gep = getelementptr i32, ptr %out, i32 4
3922  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
3923  store i32 %val, ptr %out2
3924  ret void
3925}
3926
; Non-returning i32 atomic exchange at %out[%index + 4].
; The atomicrmw result is unused, so the checks expect flat_atomic_swap without
; a destination register or glc. The 64-bit index is scaled by 4 (shift left
; by 2) and added to the base with scalar instructions; the remaining 16-byte
; constant is a scalar add in the bonaire and tonga runs and offset:16 in the
; gfx900 run.
3927define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
3928; GCN1-LABEL: atomic_xchg_i32_addr64_offset:
3929; GCN1:       ; %bb.0: ; %entry
3930; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
3931; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
3932; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
3933; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3934; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3935; GCN1-NEXT:    s_add_u32 s0, s2, s0
3936; GCN1-NEXT:    s_addc_u32 s1, s3, s1
3937; GCN1-NEXT:    s_add_u32 s0, s0, 16
3938; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3939; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3940; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3941; GCN1-NEXT:    v_mov_b32_e32 v2, s4
3942; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
3943; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3944; GCN1-NEXT:    buffer_wbinvl1_vol
3945; GCN1-NEXT:    s_endpgm
3946;
3947; GCN2-LABEL: atomic_xchg_i32_addr64_offset:
3948; GCN2:       ; %bb.0: ; %entry
3949; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
3950; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
3951; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
3952; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3953; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3954; GCN2-NEXT:    s_add_u32 s0, s2, s0
3955; GCN2-NEXT:    s_addc_u32 s1, s3, s1
3956; GCN2-NEXT:    s_add_u32 s0, s0, 16
3957; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3958; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3959; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3960; GCN2-NEXT:    v_mov_b32_e32 v2, s4
3961; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
3962; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3963; GCN2-NEXT:    buffer_wbinvl1_vol
3964; GCN2-NEXT:    s_endpgm
3965;
3966; GCN3-LABEL: atomic_xchg_i32_addr64_offset:
3967; GCN3:       ; %bb.0: ; %entry
3968; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
3969; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
3970; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
3971; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3972; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
3973; GCN3-NEXT:    s_add_u32 s0, s2, s0
3974; GCN3-NEXT:    s_addc_u32 s1, s3, s1
3975; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3976; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3977; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3978; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
3979; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3980; GCN3-NEXT:    buffer_wbinvl1_vol
3981; GCN3-NEXT:    s_endpgm
3982entry:
  ; Variable part of the address: %out + %index * 4.
3983  %ptr = getelementptr i32, ptr %out, i64 %index
  ; Constant part of the address: 4 more i32 elements, i.e. 16 bytes.
3984  %gep = getelementptr i32, ptr %ptr, i32 4
3985  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
3986  ret void
3987}
3988
; Returning i32 atomic exchange at %out[%index + 4]; the result goes to %out2.
; Combines the register-indexed address (index shifted left by 2 and added to
; the base) with the 16-byte constant offset, and expects the returning
; flat_atomic_swap form (destination VGPR plus glc) followed by a
; flat_store_dword. Only the gfx900 run carries the constant as offset:16.
3989define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
3990; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset:
3991; GCN1:       ; %bb.0: ; %entry
3992; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
3993; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3994; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
3995; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3996; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
3997; GCN1-NEXT:    s_add_u32 s0, s0, s4
3998; GCN1-NEXT:    s_addc_u32 s1, s1, s5
3999; GCN1-NEXT:    s_add_u32 s0, s0, 16
4000; GCN1-NEXT:    s_addc_u32 s1, s1, 0
4001; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4002; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4003; GCN1-NEXT:    v_mov_b32_e32 v2, s8
4004; GCN1-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4005; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4006; GCN1-NEXT:    buffer_wbinvl1_vol
4007; GCN1-NEXT:    v_mov_b32_e32 v0, s2
4008; GCN1-NEXT:    v_mov_b32_e32 v1, s3
4009; GCN1-NEXT:    flat_store_dword v[0:1], v2
4010; GCN1-NEXT:    s_endpgm
4011;
4012; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset:
4013; GCN2:       ; %bb.0: ; %entry
4014; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4015; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4016; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
4017; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4018; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4019; GCN2-NEXT:    s_add_u32 s0, s0, s4
4020; GCN2-NEXT:    s_addc_u32 s1, s1, s5
4021; GCN2-NEXT:    s_add_u32 s0, s0, 16
4022; GCN2-NEXT:    s_addc_u32 s1, s1, 0
4023; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4024; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4025; GCN2-NEXT:    v_mov_b32_e32 v2, s8
4026; GCN2-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4027; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4028; GCN2-NEXT:    buffer_wbinvl1_vol
4029; GCN2-NEXT:    v_mov_b32_e32 v0, s2
4030; GCN2-NEXT:    v_mov_b32_e32 v1, s3
4031; GCN2-NEXT:    flat_store_dword v[0:1], v2
4032; GCN2-NEXT:    s_endpgm
4033;
4034; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset:
4035; GCN3:       ; %bb.0: ; %entry
4036; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4037; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4038; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
4039; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4040; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4041; GCN3-NEXT:    s_add_u32 s0, s0, s4
4042; GCN3-NEXT:    s_addc_u32 s1, s1, s5
4043; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4044; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4045; GCN3-NEXT:    v_mov_b32_e32 v2, s8
4046; GCN3-NEXT:    flat_atomic_swap v2, v[0:1], v2 offset:16 glc
4047; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4048; GCN3-NEXT:    buffer_wbinvl1_vol
4049; GCN3-NEXT:    v_mov_b32_e32 v0, s2
4050; GCN3-NEXT:    v_mov_b32_e32 v1, s3
4051; GCN3-NEXT:    flat_store_dword v[0:1], v2
4052; GCN3-NEXT:    s_endpgm
4053entry:
  ; Variable part of the address: %out + %index * 4.
4054  %ptr = getelementptr i32, ptr %out, i64 %index
  ; Constant part of the address: 4 more i32 elements, i.e. 16 bytes.
4055  %gep = getelementptr i32, ptr %ptr, i32 4
4056  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
  ; Using %val here is what makes this the returning variant of the test.
4057  store i32 %val, ptr %out2
4058  ret void
4059}
4060
; Baseline non-returning i32 atomic exchange directly on %out.
; There is no address arithmetic and the result is unused, so all three runs
; expect the same flat_atomic_swap with no destination register, no glc and no
; offset; only the kernel-argument load offsets differ between bonaire and the
; other two targets.
4061define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) {
4062; GCN1-LABEL: atomic_xchg_i32:
4063; GCN1:       ; %bb.0: ; %entry
4064; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4065; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
4066; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4067; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4068; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4069; GCN1-NEXT:    v_mov_b32_e32 v2, s2
4070; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
4071; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4072; GCN1-NEXT:    buffer_wbinvl1_vol
4073; GCN1-NEXT:    s_endpgm
4074;
4075; GCN2-LABEL: atomic_xchg_i32:
4076; GCN2:       ; %bb.0: ; %entry
4077; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4078; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
4079; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4080; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4081; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4082; GCN2-NEXT:    v_mov_b32_e32 v2, s2
4083; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
4084; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4085; GCN2-NEXT:    buffer_wbinvl1_vol
4086; GCN2-NEXT:    s_endpgm
4087;
4088; GCN3-LABEL: atomic_xchg_i32:
4089; GCN3:       ; %bb.0: ; %entry
4090; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4091; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
4092; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4093; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4094; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4095; GCN3-NEXT:    v_mov_b32_e32 v2, s2
4096; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
4097; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4098; GCN3-NEXT:    buffer_wbinvl1_vol
4099; GCN3-NEXT:    s_endpgm
4100entry:
  ; %val is intentionally left unused so the no-return form is selected.
4101  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
4102  ret void
4103}
4104
; Returning i32 atomic exchange directly on %out; the result goes to %out2.
; No address arithmetic is involved. All three runs expect the returning
; flat_atomic_swap form (destination VGPR plus glc) followed by a
; flat_store_dword of the returned value.
4105define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) {
4106; GCN1-LABEL: atomic_xchg_i32_ret:
4107; GCN1:       ; %bb.0: ; %entry
4108; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4109; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
4110; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4111; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4112; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4113; GCN1-NEXT:    v_mov_b32_e32 v2, s4
4114; GCN1-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4115; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4116; GCN1-NEXT:    buffer_wbinvl1_vol
4117; GCN1-NEXT:    v_mov_b32_e32 v0, s2
4118; GCN1-NEXT:    v_mov_b32_e32 v1, s3
4119; GCN1-NEXT:    flat_store_dword v[0:1], v2
4120; GCN1-NEXT:    s_endpgm
4121;
4122; GCN2-LABEL: atomic_xchg_i32_ret:
4123; GCN2:       ; %bb.0: ; %entry
4124; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4125; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
4126; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4127; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4128; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4129; GCN2-NEXT:    v_mov_b32_e32 v2, s4
4130; GCN2-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4131; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4132; GCN2-NEXT:    buffer_wbinvl1_vol
4133; GCN2-NEXT:    v_mov_b32_e32 v0, s2
4134; GCN2-NEXT:    v_mov_b32_e32 v1, s3
4135; GCN2-NEXT:    flat_store_dword v[0:1], v2
4136; GCN2-NEXT:    s_endpgm
4137;
4138; GCN3-LABEL: atomic_xchg_i32_ret:
4139; GCN3:       ; %bb.0: ; %entry
4140; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4141; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
4142; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4143; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4144; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4145; GCN3-NEXT:    v_mov_b32_e32 v2, s6
4146; GCN3-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4147; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4148; GCN3-NEXT:    buffer_wbinvl1_vol
4149; GCN3-NEXT:    v_mov_b32_e32 v0, s2
4150; GCN3-NEXT:    v_mov_b32_e32 v1, s3
4151; GCN3-NEXT:    flat_store_dword v[0:1], v2
4152; GCN3-NEXT:    s_endpgm
4153entry:
4154  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
  ; Using %val here is what makes this the returning variant of the test.
4155  store i32 %val, ptr %out2
4156  ret void
4157}
4158
; Non-returning i32 atomic exchange at %out[%index], with no constant offset.
; The 64-bit index is shifted left by 2 and added to the base with scalar
; instructions. Since there is no constant part, the expected flat_atomic_swap
; has no offset operand in any run, and no destination or glc because the
; result is unused.
4159define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) {
4160; GCN1-LABEL: atomic_xchg_i32_addr64:
4161; GCN1:       ; %bb.0: ; %entry
4162; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
4163; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
4164; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
4165; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4166; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4167; GCN1-NEXT:    s_add_u32 s0, s2, s0
4168; GCN1-NEXT:    s_addc_u32 s1, s3, s1
4169; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4170; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4171; GCN1-NEXT:    v_mov_b32_e32 v2, s4
4172; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
4173; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4174; GCN1-NEXT:    buffer_wbinvl1_vol
4175; GCN1-NEXT:    s_endpgm
4176;
4177; GCN2-LABEL: atomic_xchg_i32_addr64:
4178; GCN2:       ; %bb.0: ; %entry
4179; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
4180; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
4181; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
4182; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4183; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4184; GCN2-NEXT:    s_add_u32 s0, s2, s0
4185; GCN2-NEXT:    s_addc_u32 s1, s3, s1
4186; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4187; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4188; GCN2-NEXT:    v_mov_b32_e32 v2, s4
4189; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
4190; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4191; GCN2-NEXT:    buffer_wbinvl1_vol
4192; GCN2-NEXT:    s_endpgm
4193;
4194; GCN3-LABEL: atomic_xchg_i32_addr64:
4195; GCN3:       ; %bb.0: ; %entry
4196; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
4197; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
4198; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
4199; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4200; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4201; GCN3-NEXT:    s_add_u32 s0, s2, s0
4202; GCN3-NEXT:    s_addc_u32 s1, s3, s1
4203; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4204; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4205; GCN3-NEXT:    v_mov_b32_e32 v2, s6
4206; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
4207; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4208; GCN3-NEXT:    buffer_wbinvl1_vol
4209; GCN3-NEXT:    s_endpgm
4210entry:
  ; Address is %out + %index * 4; there is no constant component to fold.
4211  %ptr = getelementptr i32, ptr %out, i64 %index
4212  %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
4213  ret void
4214}
4215
; Returning i32 atomic exchange at %out[%index]; the result goes to %out2.
; The index is shifted left by 2 and added to the base with scalar
; instructions. All three runs expect the returning flat_atomic_swap form
; (destination VGPR plus glc, no offset operand) followed by a
; flat_store_dword.
4216define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
4217; GCN1-LABEL: atomic_xchg_i32_ret_addr64:
4218; GCN1:       ; %bb.0: ; %entry
4219; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
4220; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4221; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
4222; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4223; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4224; GCN1-NEXT:    s_add_u32 s0, s0, s4
4225; GCN1-NEXT:    s_addc_u32 s1, s1, s5
4226; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4227; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4228; GCN1-NEXT:    v_mov_b32_e32 v2, s8
4229; GCN1-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4230; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4231; GCN1-NEXT:    buffer_wbinvl1_vol
4232; GCN1-NEXT:    v_mov_b32_e32 v0, s2
4233; GCN1-NEXT:    v_mov_b32_e32 v1, s3
4234; GCN1-NEXT:    flat_store_dword v[0:1], v2
4235; GCN1-NEXT:    s_endpgm
4236;
4237; GCN2-LABEL: atomic_xchg_i32_ret_addr64:
4238; GCN2:       ; %bb.0: ; %entry
4239; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4240; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4241; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
4242; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4243; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4244; GCN2-NEXT:    s_add_u32 s0, s0, s4
4245; GCN2-NEXT:    s_addc_u32 s1, s1, s5
4246; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4247; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4248; GCN2-NEXT:    v_mov_b32_e32 v2, s8
4249; GCN2-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4250; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4251; GCN2-NEXT:    buffer_wbinvl1_vol
4252; GCN2-NEXT:    v_mov_b32_e32 v0, s2
4253; GCN2-NEXT:    v_mov_b32_e32 v1, s3
4254; GCN2-NEXT:    flat_store_dword v[0:1], v2
4255; GCN2-NEXT:    s_endpgm
4256;
4257; GCN3-LABEL: atomic_xchg_i32_ret_addr64:
4258; GCN3:       ; %bb.0: ; %entry
4259; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4260; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4261; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
4262; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4263; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4264; GCN3-NEXT:    s_add_u32 s0, s0, s4
4265; GCN3-NEXT:    s_addc_u32 s1, s1, s5
4266; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4267; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4268; GCN3-NEXT:    v_mov_b32_e32 v2, s8
4269; GCN3-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
4270; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4271; GCN3-NEXT:    buffer_wbinvl1_vol
4272; GCN3-NEXT:    v_mov_b32_e32 v0, s2
4273; GCN3-NEXT:    v_mov_b32_e32 v1, s3
4274; GCN3-NEXT:    flat_store_dword v[0:1], v2
4275; GCN3-NEXT:    s_endpgm
4276entry:
  ; Address is %out + %index * 4; there is no constant component to fold.
4277  %ptr = getelementptr i32, ptr %out, i64 %index
4278  %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
  ; Using %val here is what makes this the returning variant of the test.
4279  store i32 %val, ptr %out2
4280  ret void
4281}
4282
4283; CMP_SWAP
4284
; Non-returning i32 compare-and-swap at a constant 16-byte offset from %out.
; The cmpxchg result is unused, so the checks expect flat_atomic_cmpswap with
; no destination register or glc. In the expected code the data operand is a
; VGPR pair whose low register is copied from %in (the new value) and whose
; high register is copied from %old (the comparand). The offset is a scalar
; add in the bonaire and tonga runs and offset:16 in the gfx900 run.
4285define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) {
4286; GCN1-LABEL: atomic_cmpxchg_i32_offset:
4287; GCN1:       ; %bb.0: ; %entry
4288; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4289; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4290; GCN1-NEXT:    s_add_u32 s0, s0, 16
4291; GCN1-NEXT:    s_addc_u32 s1, s1, 0
4292; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4293; GCN1-NEXT:    v_mov_b32_e32 v2, s2
4294; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4295; GCN1-NEXT:    v_mov_b32_e32 v3, s3
4296; GCN1-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4297; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4298; GCN1-NEXT:    buffer_wbinvl1_vol
4299; GCN1-NEXT:    s_endpgm
4300;
4301; GCN2-LABEL: atomic_cmpxchg_i32_offset:
4302; GCN2:       ; %bb.0: ; %entry
4303; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4304; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4305; GCN2-NEXT:    s_add_u32 s0, s0, 16
4306; GCN2-NEXT:    s_addc_u32 s1, s1, 0
4307; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4308; GCN2-NEXT:    v_mov_b32_e32 v2, s2
4309; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4310; GCN2-NEXT:    v_mov_b32_e32 v3, s3
4311; GCN2-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4312; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4313; GCN2-NEXT:    buffer_wbinvl1_vol
4314; GCN2-NEXT:    s_endpgm
4315;
4316; GCN3-LABEL: atomic_cmpxchg_i32_offset:
4317; GCN3:       ; %bb.0: ; %entry
4318; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4319; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4320; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4321; GCN3-NEXT:    v_mov_b32_e32 v2, s2
4322; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4323; GCN3-NEXT:    v_mov_b32_e32 v3, s3
4324; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4325; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4326; GCN3-NEXT:    buffer_wbinvl1_vol
4327; GCN3-NEXT:    s_endpgm
4328entry:
  ; Element index 4 of i32 is 16 bytes past %out.
4329  %gep = getelementptr i32, ptr %out, i32 4
  ; Operand order is (pointer, expected value, replacement value).
4330  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4331  ret void
4332}
4333
; Returning i32 compare-and-swap at a constant 16-byte offset from %out.
; Element 0 of the cmpxchg result is stored to %out2, so the checks expect the
; returning flat_atomic_cmpswap form (destination VGPR plus glc) followed by a
; flat_store_dword. The data VGPR pair is built from %in (low) and %old (high).
; The offset is a scalar add in the bonaire and tonga runs and offset:16 in the
; gfx900 run.
4334define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in, i32 %old) {
4335; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset:
4336; GCN1:       ; %bb.0: ; %entry
4337; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4338; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
4339; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4340; GCN1-NEXT:    s_add_u32 s0, s0, 16
4341; GCN1-NEXT:    s_addc_u32 s1, s1, 0
4342; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4343; GCN1-NEXT:    v_mov_b32_e32 v2, s4
4344; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4345; GCN1-NEXT:    v_mov_b32_e32 v3, s5
4346; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4347; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4348; GCN1-NEXT:    buffer_wbinvl1_vol
4349; GCN1-NEXT:    v_mov_b32_e32 v0, s2
4350; GCN1-NEXT:    v_mov_b32_e32 v1, s3
4351; GCN1-NEXT:    flat_store_dword v[0:1], v2
4352; GCN1-NEXT:    s_endpgm
4353;
4354; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset:
4355; GCN2:       ; %bb.0: ; %entry
4356; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4357; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
4358; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4359; GCN2-NEXT:    s_add_u32 s0, s0, 16
4360; GCN2-NEXT:    s_addc_u32 s1, s1, 0
4361; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4362; GCN2-NEXT:    v_mov_b32_e32 v2, s4
4363; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4364; GCN2-NEXT:    v_mov_b32_e32 v3, s5
4365; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4366; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4367; GCN2-NEXT:    buffer_wbinvl1_vol
4368; GCN2-NEXT:    v_mov_b32_e32 v0, s2
4369; GCN2-NEXT:    v_mov_b32_e32 v1, s3
4370; GCN2-NEXT:    flat_store_dword v[0:1], v2
4371; GCN2-NEXT:    s_endpgm
4372;
4373; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset:
4374; GCN3:       ; %bb.0: ; %entry
4375; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4376; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4377; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4378; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4379; GCN3-NEXT:    v_mov_b32_e32 v2, s6
4380; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4381; GCN3-NEXT:    v_mov_b32_e32 v3, s7
4382; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4383; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4384; GCN3-NEXT:    buffer_wbinvl1_vol
4385; GCN3-NEXT:    v_mov_b32_e32 v0, s2
4386; GCN3-NEXT:    v_mov_b32_e32 v1, s3
4387; GCN3-NEXT:    flat_store_dword v[0:1], v2
4388; GCN3-NEXT:    s_endpgm
4389entry:
  ; Element index 4 of i32 is 16 bytes past %out.
4390  %gep = getelementptr i32, ptr %out, i32 4
  ; Operand order is (pointer, expected value, replacement value).
4391  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  ; Despite its name, %flag is element 0 of the result pair: the i32 value
  ; loaded from memory, not the i1 success bit (which would be element 1).
4392  %flag = extractvalue { i32, i1 } %val, 0
4393  store i32 %flag, ptr %out2
4394  ret void
4395}
4396
; Non-returning i32 compare-and-swap at %out[%index + 4].
; The 64-bit index is shifted left by 2 and added to the base with scalar
; instructions; the 16-byte constant is a further scalar add in the bonaire and
; tonga runs and offset:16 in the gfx900 run. The result is unused, so the
; expected flat_atomic_cmpswap has no destination register or glc. The data
; VGPR pair is built from %in (low) and %old (high).
4397define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) {
4398; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset:
4399; GCN1:       ; %bb.0: ; %entry
4400; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
4401; GCN1-NEXT:    s_load_dword s6, s[4:5], 0xb
4402; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
4403; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xf
4404; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4405; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4406; GCN1-NEXT:    v_mov_b32_e32 v0, s6
4407; GCN1-NEXT:    s_add_u32 s0, s2, s0
4408; GCN1-NEXT:    s_addc_u32 s1, s3, s1
4409; GCN1-NEXT:    s_add_u32 s0, s0, 16
4410; GCN1-NEXT:    s_addc_u32 s1, s1, 0
4411; GCN1-NEXT:    v_mov_b32_e32 v3, s1
4412; GCN1-NEXT:    v_mov_b32_e32 v1, s4
4413; GCN1-NEXT:    v_mov_b32_e32 v2, s0
4414; GCN1-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
4415; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4416; GCN1-NEXT:    buffer_wbinvl1_vol
4417; GCN1-NEXT:    s_endpgm
4418;
4419; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset:
4420; GCN2:       ; %bb.0: ; %entry
4421; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
4422; GCN2-NEXT:    s_load_dword s6, s[4:5], 0x2c
4423; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
4424; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x3c
4425; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4426; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4427; GCN2-NEXT:    v_mov_b32_e32 v0, s6
4428; GCN2-NEXT:    s_add_u32 s0, s2, s0
4429; GCN2-NEXT:    s_addc_u32 s1, s3, s1
4430; GCN2-NEXT:    s_add_u32 s0, s0, 16
4431; GCN2-NEXT:    s_addc_u32 s1, s1, 0
4432; GCN2-NEXT:    v_mov_b32_e32 v3, s1
4433; GCN2-NEXT:    v_mov_b32_e32 v1, s4
4434; GCN2-NEXT:    v_mov_b32_e32 v2, s0
4435; GCN2-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
4436; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4437; GCN2-NEXT:    buffer_wbinvl1_vol
4438; GCN2-NEXT:    s_endpgm
4439;
4440; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset:
4441; GCN3:       ; %bb.0: ; %entry
4442; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
4443; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
4444; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
4445; GCN3-NEXT:    s_load_dword s7, s[4:5], 0x3c
4446; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4447; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4448; GCN3-NEXT:    v_mov_b32_e32 v0, s6
4449; GCN3-NEXT:    s_add_u32 s0, s2, s0
4450; GCN3-NEXT:    s_addc_u32 s1, s3, s1
4451; GCN3-NEXT:    v_mov_b32_e32 v3, s1
4452; GCN3-NEXT:    v_mov_b32_e32 v1, s7
4453; GCN3-NEXT:    v_mov_b32_e32 v2, s0
4454; GCN3-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1] offset:16
4455; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4456; GCN3-NEXT:    buffer_wbinvl1_vol
4457; GCN3-NEXT:    s_endpgm
4458entry:
  ; Variable part of the address: %out + %index * 4.
4459  %ptr = getelementptr i32, ptr %out, i64 %index
  ; Constant part of the address: 4 more i32 elements, i.e. 16 bytes.
4460  %gep = getelementptr i32, ptr %ptr, i32 4
  ; Operand order is (pointer, expected value, replacement value).
4461  %val  = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4462  ret void
4463}
4464
; Returning i32 compare-and-swap at %out[%index + 4]; element 0 of the result
; is stored to %out2.
; Combines the register-indexed address (index shifted left by 2 and added to
; the base) with the 16-byte constant, which is a scalar add in the bonaire and
; tonga runs and offset:16 in the gfx900 run. The checks expect the returning
; flat_atomic_cmpswap form (destination VGPR plus glc) followed by a
; flat_store_dword. The data VGPR pair is built from %in (low) and %old (high).
4465define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
4466; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
4467; GCN1:       ; %bb.0: ; %entry
4468; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
4469; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
4470; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4471; GCN1-NEXT:    s_load_dword s9, s[4:5], 0x11
4472; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4473; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4474; GCN1-NEXT:    v_mov_b32_e32 v0, s8
4475; GCN1-NEXT:    s_add_u32 s0, s0, s4
4476; GCN1-NEXT:    s_addc_u32 s1, s1, s5
4477; GCN1-NEXT:    s_add_u32 s0, s0, 16
4478; GCN1-NEXT:    s_addc_u32 s1, s1, 0
4479; GCN1-NEXT:    v_mov_b32_e32 v3, s1
4480; GCN1-NEXT:    v_mov_b32_e32 v1, s9
4481; GCN1-NEXT:    v_mov_b32_e32 v2, s0
4482; GCN1-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
4483; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4484; GCN1-NEXT:    buffer_wbinvl1_vol
4485; GCN1-NEXT:    v_mov_b32_e32 v0, s2
4486; GCN1-NEXT:    v_mov_b32_e32 v1, s3
4487; GCN1-NEXT:    flat_store_dword v[0:1], v2
4488; GCN1-NEXT:    s_endpgm
4489;
4490; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
4491; GCN2:       ; %bb.0: ; %entry
4492; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4493; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
4494; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4495; GCN2-NEXT:    s_load_dword s9, s[4:5], 0x44
4496; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4497; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4498; GCN2-NEXT:    v_mov_b32_e32 v0, s8
4499; GCN2-NEXT:    s_add_u32 s0, s0, s4
4500; GCN2-NEXT:    s_addc_u32 s1, s1, s5
4501; GCN2-NEXT:    s_add_u32 s0, s0, 16
4502; GCN2-NEXT:    s_addc_u32 s1, s1, 0
4503; GCN2-NEXT:    v_mov_b32_e32 v3, s1
4504; GCN2-NEXT:    v_mov_b32_e32 v1, s9
4505; GCN2-NEXT:    v_mov_b32_e32 v2, s0
4506; GCN2-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
4507; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4508; GCN2-NEXT:    buffer_wbinvl1_vol
4509; GCN2-NEXT:    v_mov_b32_e32 v0, s2
4510; GCN2-NEXT:    v_mov_b32_e32 v1, s3
4511; GCN2-NEXT:    flat_store_dword v[0:1], v2
4512; GCN2-NEXT:    s_endpgm
4513;
4514; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
4515; GCN3:       ; %bb.0: ; %entry
4516; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4517; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
4518; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4519; GCN3-NEXT:    s_load_dword s9, s[4:5], 0x44
4520; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4521; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4522; GCN3-NEXT:    v_mov_b32_e32 v0, s8
4523; GCN3-NEXT:    s_add_u32 s0, s0, s4
4524; GCN3-NEXT:    s_addc_u32 s1, s1, s5
4525; GCN3-NEXT:    v_mov_b32_e32 v3, s1
4526; GCN3-NEXT:    v_mov_b32_e32 v1, s9
4527; GCN3-NEXT:    v_mov_b32_e32 v2, s0
4528; GCN3-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc
4529; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4530; GCN3-NEXT:    buffer_wbinvl1_vol
4531; GCN3-NEXT:    v_mov_b32_e32 v0, s2
4532; GCN3-NEXT:    v_mov_b32_e32 v1, s3
4533; GCN3-NEXT:    flat_store_dword v[0:1], v2
4534; GCN3-NEXT:    s_endpgm
4535entry:
  ; Variable part of the address: %out + %index * 4.
4536  %ptr = getelementptr i32, ptr %out, i64 %index
  ; Constant part of the address: 4 more i32 elements, i.e. 16 bytes.
4537  %gep = getelementptr i32, ptr %ptr, i32 4
  ; Operand order is (pointer, expected value, replacement value).
4538  %val  = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  ; Despite its name, %flag is element 0 of the result pair: the i32 value
  ; loaded from memory, not the i1 success bit (which would be element 1).
4539  %flag = extractvalue { i32, i1 } %val, 0
4540  store i32 %flag, ptr %out2
4541  ret void
4542}
4543
; Baseline non-returning i32 compare-and-swap directly on %out.
; There is no address arithmetic and the result is unused, so all three runs
; expect the same flat_atomic_cmpswap with no destination register, no glc and
; no offset. The data VGPR pair is built from %in (low) and %old (high).
4544define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
4545; GCN1-LABEL: atomic_cmpxchg_i32:
4546; GCN1:       ; %bb.0: ; %entry
4547; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4548; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4549; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4550; GCN1-NEXT:    v_mov_b32_e32 v2, s2
4551; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4552; GCN1-NEXT:    v_mov_b32_e32 v3, s3
4553; GCN1-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4554; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4555; GCN1-NEXT:    buffer_wbinvl1_vol
4556; GCN1-NEXT:    s_endpgm
4557;
4558; GCN2-LABEL: atomic_cmpxchg_i32:
4559; GCN2:       ; %bb.0: ; %entry
4560; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4561; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4562; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4563; GCN2-NEXT:    v_mov_b32_e32 v2, s2
4564; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4565; GCN2-NEXT:    v_mov_b32_e32 v3, s3
4566; GCN2-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4567; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4568; GCN2-NEXT:    buffer_wbinvl1_vol
4569; GCN2-NEXT:    s_endpgm
4570;
4571; GCN3-LABEL: atomic_cmpxchg_i32:
4572; GCN3:       ; %bb.0: ; %entry
4573; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4574; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4575; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4576; GCN3-NEXT:    v_mov_b32_e32 v2, s2
4577; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4578; GCN3-NEXT:    v_mov_b32_e32 v3, s3
4579; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4580; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4581; GCN3-NEXT:    buffer_wbinvl1_vol
4582; GCN3-NEXT:    s_endpgm
4583entry:
  ; Operand order is (pointer, expected value, replacement value); %val is
  ; intentionally left unused so the no-return form is selected.
4584  %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4585  ret void
4586}
4587
; Returning i32 compare-and-swap directly on %out; element 0 of the result is
; stored to %out2.
; No address arithmetic is involved. All three runs expect the returning
; flat_atomic_cmpswap form (destination VGPR plus glc, no offset) followed by a
; flat_store_dword. The data VGPR pair is built from %in (low) and %old (high).
4588define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) {
4589; GCN1-LABEL: atomic_cmpxchg_i32_ret:
4590; GCN1:       ; %bb.0: ; %entry
4591; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4592; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
4593; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4594; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4595; GCN1-NEXT:    v_mov_b32_e32 v2, s4
4596; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4597; GCN1-NEXT:    v_mov_b32_e32 v3, s5
4598; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4599; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4600; GCN1-NEXT:    buffer_wbinvl1_vol
4601; GCN1-NEXT:    v_mov_b32_e32 v0, s2
4602; GCN1-NEXT:    v_mov_b32_e32 v1, s3
4603; GCN1-NEXT:    flat_store_dword v[0:1], v2
4604; GCN1-NEXT:    s_endpgm
4605;
4606; GCN2-LABEL: atomic_cmpxchg_i32_ret:
4607; GCN2:       ; %bb.0: ; %entry
4608; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4609; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
4610; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4611; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4612; GCN2-NEXT:    v_mov_b32_e32 v2, s4
4613; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4614; GCN2-NEXT:    v_mov_b32_e32 v3, s5
4615; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4616; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4617; GCN2-NEXT:    buffer_wbinvl1_vol
4618; GCN2-NEXT:    v_mov_b32_e32 v0, s2
4619; GCN2-NEXT:    v_mov_b32_e32 v1, s3
4620; GCN2-NEXT:    flat_store_dword v[0:1], v2
4621; GCN2-NEXT:    s_endpgm
4622;
4623; GCN3-LABEL: atomic_cmpxchg_i32_ret:
4624; GCN3:       ; %bb.0: ; %entry
4625; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4626; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4627; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4628; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4629; GCN3-NEXT:    v_mov_b32_e32 v2, s6
4630; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4631; GCN3-NEXT:    v_mov_b32_e32 v3, s7
4632; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4633; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4634; GCN3-NEXT:    buffer_wbinvl1_vol
4635; GCN3-NEXT:    v_mov_b32_e32 v0, s2
4636; GCN3-NEXT:    v_mov_b32_e32 v1, s3
4637; GCN3-NEXT:    flat_store_dword v[0:1], v2
4638; GCN3-NEXT:    s_endpgm
4639entry:
  ; Operand order is (pointer, expected value, replacement value).
4640  %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
  ; Despite its name, %flag is element 0 of the result pair: the i32 value
  ; loaded from memory, not the i1 success bit (which would be element 1).
4641  %flag = extractvalue { i32, i1 } %val, 0
4642  store i32 %flag, ptr %out2
4643  ret void
4644}
4645
; Non-returning i32 compare-and-swap at %out[%index], with no constant offset.
; The 64-bit index is shifted left by 2 and added to the base with scalar
; instructions. With no constant part and an unused result, the expected
; flat_atomic_cmpswap has no offset operand, destination register or glc in
; any run. The data VGPR pair is built from %in (low) and %old (high).
4646define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) {
4647; GCN1-LABEL: atomic_cmpxchg_i32_addr64:
4648; GCN1:       ; %bb.0: ; %entry
4649; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
4650; GCN1-NEXT:    s_load_dword s6, s[4:5], 0xb
4651; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
4652; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xf
4653; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4654; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4655; GCN1-NEXT:    v_mov_b32_e32 v0, s6
4656; GCN1-NEXT:    s_add_u32 s0, s2, s0
4657; GCN1-NEXT:    s_addc_u32 s1, s3, s1
4658; GCN1-NEXT:    v_mov_b32_e32 v3, s1
4659; GCN1-NEXT:    v_mov_b32_e32 v1, s4
4660; GCN1-NEXT:    v_mov_b32_e32 v2, s0
4661; GCN1-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
4662; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4663; GCN1-NEXT:    buffer_wbinvl1_vol
4664; GCN1-NEXT:    s_endpgm
4665;
4666; GCN2-LABEL: atomic_cmpxchg_i32_addr64:
4667; GCN2:       ; %bb.0: ; %entry
4668; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
4669; GCN2-NEXT:    s_load_dword s6, s[4:5], 0x2c
4670; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
4671; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x3c
4672; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4673; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4674; GCN2-NEXT:    v_mov_b32_e32 v0, s6
4675; GCN2-NEXT:    s_add_u32 s0, s2, s0
4676; GCN2-NEXT:    s_addc_u32 s1, s3, s1
4677; GCN2-NEXT:    v_mov_b32_e32 v3, s1
4678; GCN2-NEXT:    v_mov_b32_e32 v1, s4
4679; GCN2-NEXT:    v_mov_b32_e32 v2, s0
4680; GCN2-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
4681; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4682; GCN2-NEXT:    buffer_wbinvl1_vol
4683; GCN2-NEXT:    s_endpgm
4684;
4685; GCN3-LABEL: atomic_cmpxchg_i32_addr64:
4686; GCN3:       ; %bb.0: ; %entry
4687; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
4688; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
4689; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
4690; GCN3-NEXT:    s_load_dword s7, s[4:5], 0x3c
4691; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4692; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4693; GCN3-NEXT:    v_mov_b32_e32 v0, s6
4694; GCN3-NEXT:    s_add_u32 s0, s2, s0
4695; GCN3-NEXT:    s_addc_u32 s1, s3, s1
4696; GCN3-NEXT:    v_mov_b32_e32 v3, s1
4697; GCN3-NEXT:    v_mov_b32_e32 v1, s7
4698; GCN3-NEXT:    v_mov_b32_e32 v2, s0
4699; GCN3-NEXT:    flat_atomic_cmpswap v[2:3], v[0:1]
4700; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4701; GCN3-NEXT:    buffer_wbinvl1_vol
4702; GCN3-NEXT:    s_endpgm
4703entry:
  ; Address is %out + %index * 4; there is no constant component to fold.
4704  %ptr = getelementptr i32, ptr %out, i64 %index
  ; Operand order is (pointer, expected value, replacement value).
4705  %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4706  ret void
4707}
4708
; Returning variant of the cmpxchg-at-%out[%index] test: element 0 of the
; cmpxchg result (the loaded i32) is stored to %out2. The checks expect the glc
; form of flat_atomic_cmpswap writing v2, followed by a flat_store_dword of v2
; to the address built from s[2:3].
4709define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
4710; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64:
4711; GCN1:       ; %bb.0: ; %entry
4712; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
4713; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
4714; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4715; GCN1-NEXT:    s_load_dword s9, s[4:5], 0x11
4716; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4717; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4718; GCN1-NEXT:    v_mov_b32_e32 v0, s8
4719; GCN1-NEXT:    s_add_u32 s0, s0, s4
4720; GCN1-NEXT:    s_addc_u32 s1, s1, s5
4721; GCN1-NEXT:    v_mov_b32_e32 v3, s1
4722; GCN1-NEXT:    v_mov_b32_e32 v1, s9
4723; GCN1-NEXT:    v_mov_b32_e32 v2, s0
4724; GCN1-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
4725; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4726; GCN1-NEXT:    buffer_wbinvl1_vol
4727; GCN1-NEXT:    v_mov_b32_e32 v0, s2
4728; GCN1-NEXT:    v_mov_b32_e32 v1, s3
4729; GCN1-NEXT:    flat_store_dword v[0:1], v2
4730; GCN1-NEXT:    s_endpgm
4731;
4732; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64:
4733; GCN2:       ; %bb.0: ; %entry
4734; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4735; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
4736; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4737; GCN2-NEXT:    s_load_dword s9, s[4:5], 0x44
4738; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4739; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4740; GCN2-NEXT:    v_mov_b32_e32 v0, s8
4741; GCN2-NEXT:    s_add_u32 s0, s0, s4
4742; GCN2-NEXT:    s_addc_u32 s1, s1, s5
4743; GCN2-NEXT:    v_mov_b32_e32 v3, s1
4744; GCN2-NEXT:    v_mov_b32_e32 v1, s9
4745; GCN2-NEXT:    v_mov_b32_e32 v2, s0
4746; GCN2-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
4747; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4748; GCN2-NEXT:    buffer_wbinvl1_vol
4749; GCN2-NEXT:    v_mov_b32_e32 v0, s2
4750; GCN2-NEXT:    v_mov_b32_e32 v1, s3
4751; GCN2-NEXT:    flat_store_dword v[0:1], v2
4752; GCN2-NEXT:    s_endpgm
4753;
4754; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64:
4755; GCN3:       ; %bb.0: ; %entry
4756; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4757; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
4758; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4759; GCN3-NEXT:    s_load_dword s9, s[4:5], 0x44
4760; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4761; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4762; GCN3-NEXT:    v_mov_b32_e32 v0, s8
4763; GCN3-NEXT:    s_add_u32 s0, s0, s4
4764; GCN3-NEXT:    s_addc_u32 s1, s1, s5
4765; GCN3-NEXT:    v_mov_b32_e32 v3, s1
4766; GCN3-NEXT:    v_mov_b32_e32 v1, s9
4767; GCN3-NEXT:    v_mov_b32_e32 v2, s0
4768; GCN3-NEXT:    flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
4769; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4770; GCN3-NEXT:    buffer_wbinvl1_vol
4771; GCN3-NEXT:    v_mov_b32_e32 v0, s2
4772; GCN3-NEXT:    v_mov_b32_e32 v1, s3
4773; GCN3-NEXT:    flat_store_dword v[0:1], v2
4774; GCN3-NEXT:    s_endpgm
4775entry:
4776  %ptr = getelementptr i32, ptr %out, i64 %index
4777  %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4778  %flag = extractvalue { i32, i1 } %val, 0
4779  store i32 %flag, ptr %out2
4780  ret void
4781}
4782
; atomicrmw xor (syncscope "agent", seq_cst) on the i32 four elements (16 bytes)
; past %out; the result is unused. For bonaire and tonga the checks expect the
; 16 to be added with s_add_u32/s_addc_u32; for gfx900 they expect it folded
; into the instruction as offset:16. All three use the non-returning form.
4783define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) {
4784; GCN1-LABEL: atomic_xor_i32_offset:
4785; GCN1:       ; %bb.0: ; %entry
4786; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4787; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
4788; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4789; GCN1-NEXT:    s_add_u32 s0, s0, 16
4790; GCN1-NEXT:    s_addc_u32 s1, s1, 0
4791; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4792; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4793; GCN1-NEXT:    v_mov_b32_e32 v2, s2
4794; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
4795; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4796; GCN1-NEXT:    buffer_wbinvl1_vol
4797; GCN1-NEXT:    s_endpgm
4798;
4799; GCN2-LABEL: atomic_xor_i32_offset:
4800; GCN2:       ; %bb.0: ; %entry
4801; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4802; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
4803; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4804; GCN2-NEXT:    s_add_u32 s0, s0, 16
4805; GCN2-NEXT:    s_addc_u32 s1, s1, 0
4806; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4807; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4808; GCN2-NEXT:    v_mov_b32_e32 v2, s2
4809; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
4810; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4811; GCN2-NEXT:    buffer_wbinvl1_vol
4812; GCN2-NEXT:    s_endpgm
4813;
4814; GCN3-LABEL: atomic_xor_i32_offset:
4815; GCN3:       ; %bb.0: ; %entry
4816; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4817; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
4818; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4819; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4820; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4821; GCN3-NEXT:    v_mov_b32_e32 v2, s2
4822; GCN3-NEXT:    flat_atomic_xor v[0:1], v2 offset:16
4823; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4824; GCN3-NEXT:    buffer_wbinvl1_vol
4825; GCN3-NEXT:    s_endpgm
4826entry:
4827  %gep = getelementptr i32, ptr %out, i32 4
4828  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
4829  ret void
4830}
4831
; Returning variant of the xor-at-(%out + 16 bytes) test: the value produced by
; the atomicrmw is stored to %out2. The checks expect the glc form of
; flat_atomic_xor writing v2 (with offset:16 on gfx900 only), then a
; flat_store_dword of v2 to the address built from s[2:3].
4832define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
4833; GCN1-LABEL: atomic_xor_i32_ret_offset:
4834; GCN1:       ; %bb.0: ; %entry
4835; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4836; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
4837; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4838; GCN1-NEXT:    s_add_u32 s0, s0, 16
4839; GCN1-NEXT:    s_addc_u32 s1, s1, 0
4840; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4841; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4842; GCN1-NEXT:    v_mov_b32_e32 v2, s4
4843; GCN1-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
4844; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4845; GCN1-NEXT:    buffer_wbinvl1_vol
4846; GCN1-NEXT:    v_mov_b32_e32 v0, s2
4847; GCN1-NEXT:    v_mov_b32_e32 v1, s3
4848; GCN1-NEXT:    flat_store_dword v[0:1], v2
4849; GCN1-NEXT:    s_endpgm
4850;
4851; GCN2-LABEL: atomic_xor_i32_ret_offset:
4852; GCN2:       ; %bb.0: ; %entry
4853; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4854; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
4855; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4856; GCN2-NEXT:    s_add_u32 s0, s0, 16
4857; GCN2-NEXT:    s_addc_u32 s1, s1, 0
4858; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4859; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4860; GCN2-NEXT:    v_mov_b32_e32 v2, s4
4861; GCN2-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
4862; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4863; GCN2-NEXT:    buffer_wbinvl1_vol
4864; GCN2-NEXT:    v_mov_b32_e32 v0, s2
4865; GCN2-NEXT:    v_mov_b32_e32 v1, s3
4866; GCN2-NEXT:    flat_store_dword v[0:1], v2
4867; GCN2-NEXT:    s_endpgm
4868;
4869; GCN3-LABEL: atomic_xor_i32_ret_offset:
4870; GCN3:       ; %bb.0: ; %entry
4871; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4872; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
4873; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4874; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4875; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4876; GCN3-NEXT:    v_mov_b32_e32 v2, s6
4877; GCN3-NEXT:    flat_atomic_xor v2, v[0:1], v2 offset:16 glc
4878; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4879; GCN3-NEXT:    buffer_wbinvl1_vol
4880; GCN3-NEXT:    v_mov_b32_e32 v0, s2
4881; GCN3-NEXT:    v_mov_b32_e32 v1, s3
4882; GCN3-NEXT:    flat_store_dword v[0:1], v2
4883; GCN3-NEXT:    s_endpgm
4884entry:
4885  %gep = getelementptr i32, ptr %out, i32 4
4886  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
4887  store i32 %val, ptr %out2
4888  ret void
4889}
4890
; atomicrmw xor (syncscope "agent", seq_cst) on the i32 at %out[%index + 4];
; the result is unused. The checks expect %index to be scaled with s_lshl_b64
; by 2 and added to %out on every target. The extra 16 bytes are a separate
; scalar add on bonaire and tonga, and offset:16 on gfx900.
4891define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
4892; GCN1-LABEL: atomic_xor_i32_addr64_offset:
4893; GCN1:       ; %bb.0: ; %entry
4894; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
4895; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
4896; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
4897; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4898; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4899; GCN1-NEXT:    s_add_u32 s0, s2, s0
4900; GCN1-NEXT:    s_addc_u32 s1, s3, s1
4901; GCN1-NEXT:    s_add_u32 s0, s0, 16
4902; GCN1-NEXT:    s_addc_u32 s1, s1, 0
4903; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4904; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4905; GCN1-NEXT:    v_mov_b32_e32 v2, s4
4906; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
4907; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4908; GCN1-NEXT:    buffer_wbinvl1_vol
4909; GCN1-NEXT:    s_endpgm
4910;
4911; GCN2-LABEL: atomic_xor_i32_addr64_offset:
4912; GCN2:       ; %bb.0: ; %entry
4913; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
4914; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
4915; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
4916; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4917; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4918; GCN2-NEXT:    s_add_u32 s0, s2, s0
4919; GCN2-NEXT:    s_addc_u32 s1, s3, s1
4920; GCN2-NEXT:    s_add_u32 s0, s0, 16
4921; GCN2-NEXT:    s_addc_u32 s1, s1, 0
4922; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4923; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4924; GCN2-NEXT:    v_mov_b32_e32 v2, s4
4925; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
4926; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4927; GCN2-NEXT:    buffer_wbinvl1_vol
4928; GCN2-NEXT:    s_endpgm
4929;
4930; GCN3-LABEL: atomic_xor_i32_addr64_offset:
4931; GCN3:       ; %bb.0: ; %entry
4932; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
4933; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
4934; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
4935; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4936; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
4937; GCN3-NEXT:    s_add_u32 s0, s2, s0
4938; GCN3-NEXT:    s_addc_u32 s1, s3, s1
4939; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4940; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4941; GCN3-NEXT:    v_mov_b32_e32 v2, s6
4942; GCN3-NEXT:    flat_atomic_xor v[0:1], v2 offset:16
4943; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4944; GCN3-NEXT:    buffer_wbinvl1_vol
4945; GCN3-NEXT:    s_endpgm
4946entry:
4947  %ptr = getelementptr i32, ptr %out, i64 %index
4948  %gep = getelementptr i32, ptr %ptr, i32 4
4949  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
4950  ret void
4951}
4952
; Returning variant of the xor-at-%out[%index + 4] test: the atomicrmw result is
; stored to %out2. The checks expect the glc form of flat_atomic_xor writing v2
; (offset:16 on gfx900, a separate scalar add of 16 on bonaire and tonga), then
; a flat_store_dword of v2 to the address built from s[2:3].
4953define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
4954; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset:
4955; GCN1:       ; %bb.0: ; %entry
4956; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
4957; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4958; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
4959; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4960; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4961; GCN1-NEXT:    s_add_u32 s0, s0, s4
4962; GCN1-NEXT:    s_addc_u32 s1, s1, s5
4963; GCN1-NEXT:    s_add_u32 s0, s0, 16
4964; GCN1-NEXT:    s_addc_u32 s1, s1, 0
4965; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4966; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4967; GCN1-NEXT:    v_mov_b32_e32 v2, s8
4968; GCN1-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
4969; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4970; GCN1-NEXT:    buffer_wbinvl1_vol
4971; GCN1-NEXT:    v_mov_b32_e32 v0, s2
4972; GCN1-NEXT:    v_mov_b32_e32 v1, s3
4973; GCN1-NEXT:    flat_store_dword v[0:1], v2
4974; GCN1-NEXT:    s_endpgm
4975;
4976; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset:
4977; GCN2:       ; %bb.0: ; %entry
4978; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4979; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4980; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
4981; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4982; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
4983; GCN2-NEXT:    s_add_u32 s0, s0, s4
4984; GCN2-NEXT:    s_addc_u32 s1, s1, s5
4985; GCN2-NEXT:    s_add_u32 s0, s0, 16
4986; GCN2-NEXT:    s_addc_u32 s1, s1, 0
4987; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4988; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4989; GCN2-NEXT:    v_mov_b32_e32 v2, s8
4990; GCN2-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
4991; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4992; GCN2-NEXT:    buffer_wbinvl1_vol
4993; GCN2-NEXT:    v_mov_b32_e32 v0, s2
4994; GCN2-NEXT:    v_mov_b32_e32 v1, s3
4995; GCN2-NEXT:    flat_store_dword v[0:1], v2
4996; GCN2-NEXT:    s_endpgm
4997;
4998; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset:
4999; GCN3:       ; %bb.0: ; %entry
5000; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
5001; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5002; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
5003; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5004; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5005; GCN3-NEXT:    s_add_u32 s0, s0, s4
5006; GCN3-NEXT:    s_addc_u32 s1, s1, s5
5007; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5008; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5009; GCN3-NEXT:    v_mov_b32_e32 v2, s8
5010; GCN3-NEXT:    flat_atomic_xor v2, v[0:1], v2 offset:16 glc
5011; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5012; GCN3-NEXT:    buffer_wbinvl1_vol
5013; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5014; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5015; GCN3-NEXT:    flat_store_dword v[0:1], v2
5016; GCN3-NEXT:    s_endpgm
5017entry:
5018  %ptr = getelementptr i32, ptr %out, i64 %index
5019  %gep = getelementptr i32, ptr %ptr, i32 4
5020  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
5021  store i32 %val, ptr %out2
5022  ret void
5023}
5024
; Base case: atomicrmw xor (syncscope "agent", seq_cst) directly on %out with no
; offset or index; the result is unused. The checks expect the same
; non-returning flat_atomic_xor sequence on bonaire, tonga and gfx900; only the
; kernel-argument load offsets differ between bonaire and the other two.
5025define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) {
5026; GCN1-LABEL: atomic_xor_i32:
5027; GCN1:       ; %bb.0: ; %entry
5028; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
5029; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
5030; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5031; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5032; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5033; GCN1-NEXT:    v_mov_b32_e32 v2, s2
5034; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
5035; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5036; GCN1-NEXT:    buffer_wbinvl1_vol
5037; GCN1-NEXT:    s_endpgm
5038;
5039; GCN2-LABEL: atomic_xor_i32:
5040; GCN2:       ; %bb.0: ; %entry
5041; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5042; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
5043; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5044; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5045; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5046; GCN2-NEXT:    v_mov_b32_e32 v2, s2
5047; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
5048; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5049; GCN2-NEXT:    buffer_wbinvl1_vol
5050; GCN2-NEXT:    s_endpgm
5051;
5052; GCN3-LABEL: atomic_xor_i32:
5053; GCN3:       ; %bb.0: ; %entry
5054; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5055; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
5056; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5057; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5058; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5059; GCN3-NEXT:    v_mov_b32_e32 v2, s2
5060; GCN3-NEXT:    flat_atomic_xor v[0:1], v2
5061; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5062; GCN3-NEXT:    buffer_wbinvl1_vol
5063; GCN3-NEXT:    s_endpgm
5064entry:
5065  %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
5066  ret void
5067}
5068
; Returning variant of the base xor test: the atomicrmw result is stored to
; %out2. The checks expect the glc form of flat_atomic_xor writing v2, then a
; flat_store_dword of v2 to the address built from s[2:3].
5069define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) {
5070; GCN1-LABEL: atomic_xor_i32_ret:
5071; GCN1:       ; %bb.0: ; %entry
5072; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5073; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
5074; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5075; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5076; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5077; GCN1-NEXT:    v_mov_b32_e32 v2, s4
5078; GCN1-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
5079; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5080; GCN1-NEXT:    buffer_wbinvl1_vol
5081; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5082; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5083; GCN1-NEXT:    flat_store_dword v[0:1], v2
5084; GCN1-NEXT:    s_endpgm
5085;
5086; GCN2-LABEL: atomic_xor_i32_ret:
5087; GCN2:       ; %bb.0: ; %entry
5088; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5089; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
5090; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5091; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5092; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5093; GCN2-NEXT:    v_mov_b32_e32 v2, s4
5094; GCN2-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
5095; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5096; GCN2-NEXT:    buffer_wbinvl1_vol
5097; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5098; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5099; GCN2-NEXT:    flat_store_dword v[0:1], v2
5100; GCN2-NEXT:    s_endpgm
5101;
5102; GCN3-LABEL: atomic_xor_i32_ret:
5103; GCN3:       ; %bb.0: ; %entry
5104; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5105; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
5106; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5107; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5108; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5109; GCN3-NEXT:    v_mov_b32_e32 v2, s6
5110; GCN3-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
5111; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5112; GCN3-NEXT:    buffer_wbinvl1_vol
5113; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5114; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5115; GCN3-NEXT:    flat_store_dword v[0:1], v2
5116; GCN3-NEXT:    s_endpgm
5117entry:
5118  %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
5119  store i32 %val, ptr %out2
5120  ret void
5121}
5122
; atomicrmw xor (syncscope "agent", seq_cst) on the i32 at %out[%index] with no
; constant offset; the result is unused. The checks expect %index to be scaled
; with s_lshl_b64 by 2 and added to %out with s_add_u32/s_addc_u32, followed by
; the non-returning flat_atomic_xor on all three targets.
5123define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) {
5124; GCN1-LABEL: atomic_xor_i32_addr64:
5125; GCN1:       ; %bb.0: ; %entry
5126; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
5127; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
5128; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
5129; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5130; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
5131; GCN1-NEXT:    s_add_u32 s0, s2, s0
5132; GCN1-NEXT:    s_addc_u32 s1, s3, s1
5133; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5134; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5135; GCN1-NEXT:    v_mov_b32_e32 v2, s4
5136; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
5137; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5138; GCN1-NEXT:    buffer_wbinvl1_vol
5139; GCN1-NEXT:    s_endpgm
5140;
5141; GCN2-LABEL: atomic_xor_i32_addr64:
5142; GCN2:       ; %bb.0: ; %entry
5143; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
5144; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
5145; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
5146; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5147; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
5148; GCN2-NEXT:    s_add_u32 s0, s2, s0
5149; GCN2-NEXT:    s_addc_u32 s1, s3, s1
5150; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5151; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5152; GCN2-NEXT:    v_mov_b32_e32 v2, s4
5153; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
5154; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5155; GCN2-NEXT:    buffer_wbinvl1_vol
5156; GCN2-NEXT:    s_endpgm
5157;
5158; GCN3-LABEL: atomic_xor_i32_addr64:
5159; GCN3:       ; %bb.0: ; %entry
5160; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
5161; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
5162; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
5163; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5164; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
5165; GCN3-NEXT:    s_add_u32 s0, s2, s0
5166; GCN3-NEXT:    s_addc_u32 s1, s3, s1
5167; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5168; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5169; GCN3-NEXT:    v_mov_b32_e32 v2, s6
5170; GCN3-NEXT:    flat_atomic_xor v[0:1], v2
5171; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5172; GCN3-NEXT:    buffer_wbinvl1_vol
5173; GCN3-NEXT:    s_endpgm
5174entry:
5175  %ptr = getelementptr i32, ptr %out, i64 %index
5176  %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
5177  ret void
5178}
5179
; Returning variant of the xor-at-%out[%index] test: the atomicrmw result is
; stored to %out2. The checks expect the glc form of flat_atomic_xor writing v2
; after the scaled-index address computation, then a flat_store_dword of v2 to
; the address built from s[2:3].
5180define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
5181; GCN1-LABEL: atomic_xor_i32_ret_addr64:
5182; GCN1:       ; %bb.0: ; %entry
5183; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
5184; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5185; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
5186; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5187; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5188; GCN1-NEXT:    s_add_u32 s0, s0, s4
5189; GCN1-NEXT:    s_addc_u32 s1, s1, s5
5190; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5191; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5192; GCN1-NEXT:    v_mov_b32_e32 v2, s8
5193; GCN1-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
5194; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5195; GCN1-NEXT:    buffer_wbinvl1_vol
5196; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5197; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5198; GCN1-NEXT:    flat_store_dword v[0:1], v2
5199; GCN1-NEXT:    s_endpgm
5200;
5201; GCN2-LABEL: atomic_xor_i32_ret_addr64:
5202; GCN2:       ; %bb.0: ; %entry
5203; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
5204; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5205; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
5206; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5207; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5208; GCN2-NEXT:    s_add_u32 s0, s0, s4
5209; GCN2-NEXT:    s_addc_u32 s1, s1, s5
5210; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5211; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5212; GCN2-NEXT:    v_mov_b32_e32 v2, s8
5213; GCN2-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
5214; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5215; GCN2-NEXT:    buffer_wbinvl1_vol
5216; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5217; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5218; GCN2-NEXT:    flat_store_dword v[0:1], v2
5219; GCN2-NEXT:    s_endpgm
5220;
5221; GCN3-LABEL: atomic_xor_i32_ret_addr64:
5222; GCN3:       ; %bb.0: ; %entry
5223; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
5224; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5225; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
5226; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5227; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5228; GCN3-NEXT:    s_add_u32 s0, s0, s4
5229; GCN3-NEXT:    s_addc_u32 s1, s1, s5
5230; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5231; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5232; GCN3-NEXT:    v_mov_b32_e32 v2, s8
5233; GCN3-NEXT:    flat_atomic_xor v2, v[0:1], v2 glc
5234; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5235; GCN3-NEXT:    buffer_wbinvl1_vol
5236; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5237; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5238; GCN3-NEXT:    flat_store_dword v[0:1], v2
5239; GCN3-NEXT:    s_endpgm
5240entry:
5241  %ptr = getelementptr i32, ptr %out, i64 %index
5242  %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
5243  store i32 %val, ptr %out2
5244  ret void
5245}
5246
; seq_cst atomic load (no syncscope given, align 4) of the i32 four elements
; (16 bytes) past %in, with the loaded value stored to %out. The checks expect
; flat_load_dword with glc, then s_waitcnt and buffer_wbinvl1_vol before the
; store. The 16 is a scalar add on bonaire and tonga and offset:16 on gfx900.
5247define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
5248; GCN1-LABEL: atomic_load_i32_offset:
5249; GCN1:       ; %bb.0: ; %entry
5250; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5251; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5252; GCN1-NEXT:    s_add_u32 s0, s0, 16
5253; GCN1-NEXT:    s_addc_u32 s1, s1, 0
5254; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5255; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5256; GCN1-NEXT:    flat_load_dword v2, v[0:1] glc
5257; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5258; GCN1-NEXT:    buffer_wbinvl1_vol
5259; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5260; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5261; GCN1-NEXT:    flat_store_dword v[0:1], v2
5262; GCN1-NEXT:    s_endpgm
5263;
5264; GCN2-LABEL: atomic_load_i32_offset:
5265; GCN2:       ; %bb.0: ; %entry
5266; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5267; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5268; GCN2-NEXT:    s_add_u32 s0, s0, 16
5269; GCN2-NEXT:    s_addc_u32 s1, s1, 0
5270; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5271; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5272; GCN2-NEXT:    flat_load_dword v2, v[0:1] glc
5273; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5274; GCN2-NEXT:    buffer_wbinvl1_vol
5275; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5276; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5277; GCN2-NEXT:    flat_store_dword v[0:1], v2
5278; GCN2-NEXT:    s_endpgm
5279;
5280; GCN3-LABEL: atomic_load_i32_offset:
5281; GCN3:       ; %bb.0: ; %entry
5282; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5283; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5284; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5285; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5286; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16 glc
5287; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5288; GCN3-NEXT:    buffer_wbinvl1_vol
5289; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5290; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5291; GCN3-NEXT:    flat_store_dword v[0:1], v2
5292; GCN3-NEXT:    s_endpgm
5293entry:
5294  %gep = getelementptr i32, ptr %in, i32 4
5295  %val = load atomic i32, ptr %gep  seq_cst, align 4
5296  store i32 %val, ptr %out
5297  ret void
5298}
5299
; Base case: seq_cst atomic load (no syncscope given, align 4) of the i32 at
; %in, stored to %out. The checks expect the same sequence on all three
; targets: flat_load_dword with glc, s_waitcnt, buffer_wbinvl1_vol, then a
; flat_store_dword of v2 to the address built from s[2:3].
5300define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
5301; GCN1-LABEL: atomic_load_i32:
5302; GCN1:       ; %bb.0: ; %entry
5303; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5304; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5305; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5306; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5307; GCN1-NEXT:    flat_load_dword v2, v[0:1] glc
5308; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5309; GCN1-NEXT:    buffer_wbinvl1_vol
5310; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5311; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5312; GCN1-NEXT:    flat_store_dword v[0:1], v2
5313; GCN1-NEXT:    s_endpgm
5314;
5315; GCN2-LABEL: atomic_load_i32:
5316; GCN2:       ; %bb.0: ; %entry
5317; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5318; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5319; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5320; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5321; GCN2-NEXT:    flat_load_dword v2, v[0:1] glc
5322; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5323; GCN2-NEXT:    buffer_wbinvl1_vol
5324; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5325; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5326; GCN2-NEXT:    flat_store_dword v[0:1], v2
5327; GCN2-NEXT:    s_endpgm
5328;
5329; GCN3-LABEL: atomic_load_i32:
5330; GCN3:       ; %bb.0: ; %entry
5331; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5332; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5333; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5334; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5335; GCN3-NEXT:    flat_load_dword v2, v[0:1] glc
5336; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5337; GCN3-NEXT:    buffer_wbinvl1_vol
5338; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5339; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5340; GCN3-NEXT:    flat_store_dword v[0:1], v2
5341; GCN3-NEXT:    s_endpgm
5342entry:
5343  %val = load atomic i32, ptr %in seq_cst, align 4
5344  store i32 %val, ptr %out
5345  ret void
5346}
5347
; seq_cst atomic load (no syncscope given, align 4) of the i32 at
; %in[%index + 4], stored to %out. The checks expect %index to be scaled with
; s_lshl_b64 by 2 and added to %in, then flat_load_dword with glc. The extra 16
; bytes are a separate scalar add on bonaire and tonga and offset:16 on gfx900.
5348define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) {
5349; GCN1-LABEL: atomic_load_i32_addr64_offset:
5350; GCN1:       ; %bb.0: ; %entry
5351; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
5352; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5353; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5354; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5355; GCN1-NEXT:    s_add_u32 s0, s0, s4
5356; GCN1-NEXT:    s_addc_u32 s1, s1, s5
5357; GCN1-NEXT:    s_add_u32 s0, s0, 16
5358; GCN1-NEXT:    s_addc_u32 s1, s1, 0
5359; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5360; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5361; GCN1-NEXT:    flat_load_dword v2, v[0:1] glc
5362; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5363; GCN1-NEXT:    buffer_wbinvl1_vol
5364; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5365; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5366; GCN1-NEXT:    flat_store_dword v[0:1], v2
5367; GCN1-NEXT:    s_endpgm
5368;
5369; GCN2-LABEL: atomic_load_i32_addr64_offset:
5370; GCN2:       ; %bb.0: ; %entry
5371; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5372; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5373; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5374; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5375; GCN2-NEXT:    s_add_u32 s0, s0, s4
5376; GCN2-NEXT:    s_addc_u32 s1, s1, s5
5377; GCN2-NEXT:    s_add_u32 s0, s0, 16
5378; GCN2-NEXT:    s_addc_u32 s1, s1, 0
5379; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5380; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5381; GCN2-NEXT:    flat_load_dword v2, v[0:1] glc
5382; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5383; GCN2-NEXT:    buffer_wbinvl1_vol
5384; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5385; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5386; GCN2-NEXT:    flat_store_dword v[0:1], v2
5387; GCN2-NEXT:    s_endpgm
5388;
5389; GCN3-LABEL: atomic_load_i32_addr64_offset:
5390; GCN3:       ; %bb.0: ; %entry
5391; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5392; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5393; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5394; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5395; GCN3-NEXT:    s_add_u32 s0, s0, s4
5396; GCN3-NEXT:    s_addc_u32 s1, s1, s5
5397; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5398; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5399; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16 glc
5400; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5401; GCN3-NEXT:    buffer_wbinvl1_vol
5402; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5403; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5404; GCN3-NEXT:    flat_store_dword v[0:1], v2
5405; GCN3-NEXT:    s_endpgm
5406entry:
5407  %ptr = getelementptr i32, ptr %in, i64 %index
5408  %gep = getelementptr i32, ptr %ptr, i32 4
5409  %val = load atomic i32, ptr %gep seq_cst, align 4
5410  store i32 %val, ptr %out
5411  ret void
5412}
5413
; seq_cst atomic load (no syncscope given, align 4) of the i32 at %in[%index]
; with no constant offset, stored to %out. The checks expect %index to be
; scaled with s_lshl_b64 by 2 and added to %in, then flat_load_dword with glc,
; s_waitcnt and buffer_wbinvl1_vol before the store, on all three targets.
5414define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) {
5415; GCN1-LABEL: atomic_load_i32_addr64:
5416; GCN1:       ; %bb.0: ; %entry
5417; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
5418; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5419; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5420; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5421; GCN1-NEXT:    s_add_u32 s0, s0, s4
5422; GCN1-NEXT:    s_addc_u32 s1, s1, s5
5423; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5424; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5425; GCN1-NEXT:    flat_load_dword v2, v[0:1] glc
5426; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5427; GCN1-NEXT:    buffer_wbinvl1_vol
5428; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5429; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5430; GCN1-NEXT:    flat_store_dword v[0:1], v2
5431; GCN1-NEXT:    s_endpgm
5432;
5433; GCN2-LABEL: atomic_load_i32_addr64:
5434; GCN2:       ; %bb.0: ; %entry
5435; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5436; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5437; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5438; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5439; GCN2-NEXT:    s_add_u32 s0, s0, s4
5440; GCN2-NEXT:    s_addc_u32 s1, s1, s5
5441; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5442; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5443; GCN2-NEXT:    flat_load_dword v2, v[0:1] glc
5444; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5445; GCN2-NEXT:    buffer_wbinvl1_vol
5446; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5447; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5448; GCN2-NEXT:    flat_store_dword v[0:1], v2
5449; GCN2-NEXT:    s_endpgm
5450;
5451; GCN3-LABEL: atomic_load_i32_addr64:
5452; GCN3:       ; %bb.0: ; %entry
5453; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5454; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5455; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5456; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5457; GCN3-NEXT:    s_add_u32 s0, s0, s4
5458; GCN3-NEXT:    s_addc_u32 s1, s1, s5
5459; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5460; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5461; GCN3-NEXT:    flat_load_dword v2, v[0:1] glc
5462; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5463; GCN3-NEXT:    buffer_wbinvl1_vol
5464; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5465; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5466; GCN3-NEXT:    flat_store_dword v[0:1], v2
5467; GCN3-NEXT:    s_endpgm
5468entry:
5469  %ptr = getelementptr i32, ptr %in, i64 %index
5470  %val = load atomic i32, ptr %ptr seq_cst, align 4
5471  store i32 %val, ptr %out
5472  ret void
5473}
5474
; Test: seq_cst atomic i32 store of kernel argument %in to %out plus 4 i32
; elements (16 bytes).
; Expected: the GCN1 and GCN2 checks add the 16 with s_add_u32/s_addc_u32
; before a plain flat_store_dword; the GCN3 checks carry it as offset:16 on
; the flat_store_dword itself. No instruction follows the store except
; s_endpgm.
5475define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) {
5476; GCN1-LABEL: atomic_store_i32_offset:
5477; GCN1:       ; %bb.0: ; %entry
5478; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
5479; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
5480; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5481; GCN1-NEXT:    s_add_u32 s0, s0, 16
5482; GCN1-NEXT:    s_addc_u32 s1, s1, 0
5483; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5484; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5485; GCN1-NEXT:    v_mov_b32_e32 v2, s2
5486; GCN1-NEXT:    flat_store_dword v[0:1], v2
5487; GCN1-NEXT:    s_endpgm
5488;
5489; GCN2-LABEL: atomic_store_i32_offset:
5490; GCN2:       ; %bb.0: ; %entry
5491; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
5492; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
5493; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5494; GCN2-NEXT:    s_add_u32 s0, s0, 16
5495; GCN2-NEXT:    s_addc_u32 s1, s1, 0
5496; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5497; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5498; GCN2-NEXT:    v_mov_b32_e32 v2, s2
5499; GCN2-NEXT:    flat_store_dword v[0:1], v2
5500; GCN2-NEXT:    s_endpgm
5501;
5502; GCN3-LABEL: atomic_store_i32_offset:
5503; GCN3:       ; %bb.0: ; %entry
5504; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
5505; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
5506; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5507; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5508; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5509; GCN3-NEXT:    v_mov_b32_e32 v2, s2
5510; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
5511; GCN3-NEXT:    s_endpgm
5512entry:
5513  %gep = getelementptr i32, ptr %out, i32 4
5514  store atomic i32 %in, ptr %gep  seq_cst, align 4
5515  ret void
5516}
5517
; Test: seq_cst atomic i32 store of kernel argument %in directly to %out
; (no index, no constant offset).
; Expected: apart from the kernel-argument load offsets, the GCN1, GCN2 and
; GCN3 checks are the same sequence: load the arguments into SGPRs, copy them
; into v0-v2 and issue a single flat_store_dword before s_endpgm.
5518define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) {
5519; GCN1-LABEL: atomic_store_i32:
5520; GCN1:       ; %bb.0: ; %entry
5521; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
5522; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
5523; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5524; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5525; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5526; GCN1-NEXT:    v_mov_b32_e32 v2, s2
5527; GCN1-NEXT:    flat_store_dword v[0:1], v2
5528; GCN1-NEXT:    s_endpgm
5529;
5530; GCN2-LABEL: atomic_store_i32:
5531; GCN2:       ; %bb.0: ; %entry
5532; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
5533; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
5534; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5535; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5536; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5537; GCN2-NEXT:    v_mov_b32_e32 v2, s2
5538; GCN2-NEXT:    flat_store_dword v[0:1], v2
5539; GCN2-NEXT:    s_endpgm
5540;
5541; GCN3-LABEL: atomic_store_i32:
5542; GCN3:       ; %bb.0: ; %entry
5543; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
5544; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
5545; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5546; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5547; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5548; GCN3-NEXT:    v_mov_b32_e32 v2, s2
5549; GCN3-NEXT:    flat_store_dword v[0:1], v2
5550; GCN3-NEXT:    s_endpgm
5551entry:
5552  store atomic i32 %in, ptr %out seq_cst, align 4
5553  ret void
5554}
5555
; Test: seq_cst atomic i32 store of %in to %out + %index elements + 4 further
; i32 elements (16 bytes).
; Expected: every target scales the i64 index with s_lshl_b64 by 2 and adds it
; to the base in SGPRs. The GCN1 and GCN2 checks then add the 16 with a second
; s_add_u32/s_addc_u32 pair; the GCN3 checks instead put offset:16 on the
; flat_store_dword.
5556define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) {
5557; GCN1-LABEL: atomic_store_i32_addr64_offset:
5558; GCN1:       ; %bb.0: ; %entry
5559; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
5560; GCN1-NEXT:    s_load_dword s4, s[4:5], 0x9
5561; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5562; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
5563; GCN1-NEXT:    s_add_u32 s0, s0, s2
5564; GCN1-NEXT:    s_addc_u32 s1, s1, s3
5565; GCN1-NEXT:    s_add_u32 s0, s0, 16
5566; GCN1-NEXT:    s_addc_u32 s1, s1, 0
5567; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5568; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5569; GCN1-NEXT:    v_mov_b32_e32 v2, s4
5570; GCN1-NEXT:    flat_store_dword v[0:1], v2
5571; GCN1-NEXT:    s_endpgm
5572;
5573; GCN2-LABEL: atomic_store_i32_addr64_offset:
5574; GCN2:       ; %bb.0: ; %entry
5575; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
5576; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x24
5577; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5578; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
5579; GCN2-NEXT:    s_add_u32 s0, s0, s2
5580; GCN2-NEXT:    s_addc_u32 s1, s1, s3
5581; GCN2-NEXT:    s_add_u32 s0, s0, 16
5582; GCN2-NEXT:    s_addc_u32 s1, s1, 0
5583; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5584; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5585; GCN2-NEXT:    v_mov_b32_e32 v2, s4
5586; GCN2-NEXT:    flat_store_dword v[0:1], v2
5587; GCN2-NEXT:    s_endpgm
5588;
5589; GCN3-LABEL: atomic_store_i32_addr64_offset:
5590; GCN3:       ; %bb.0: ; %entry
5591; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
5592; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x24
5593; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5594; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
5595; GCN3-NEXT:    s_add_u32 s0, s0, s2
5596; GCN3-NEXT:    s_addc_u32 s1, s1, s3
5597; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5598; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5599; GCN3-NEXT:    v_mov_b32_e32 v2, s6
5600; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
5601; GCN3-NEXT:    s_endpgm
5602entry:
5603  %ptr = getelementptr i32, ptr %out, i64 %index
5604  %gep = getelementptr i32, ptr %ptr, i32 4
5605  store atomic i32 %in, ptr %gep seq_cst, align 4
5606  ret void
5607}
5608
; Test: seq_cst atomic i32 store of %in to %out + %index elements (i64 index,
; no constant offset).
; Expected on all three targets: the index is scaled with s_lshl_b64 by 2 and
; added to the base with s_add_u32/s_addc_u32, then a single flat_store_dword
; without an offset field is issued before s_endpgm.
5609define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) {
5610; GCN1-LABEL: atomic_store_i32_addr64:
5611; GCN1:       ; %bb.0: ; %entry
5612; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
5613; GCN1-NEXT:    s_load_dword s4, s[4:5], 0x9
5614; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5615; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
5616; GCN1-NEXT:    s_add_u32 s0, s0, s2
5617; GCN1-NEXT:    s_addc_u32 s1, s1, s3
5618; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5619; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5620; GCN1-NEXT:    v_mov_b32_e32 v2, s4
5621; GCN1-NEXT:    flat_store_dword v[0:1], v2
5622; GCN1-NEXT:    s_endpgm
5623;
5624; GCN2-LABEL: atomic_store_i32_addr64:
5625; GCN2:       ; %bb.0: ; %entry
5626; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
5627; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x24
5628; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5629; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
5630; GCN2-NEXT:    s_add_u32 s0, s0, s2
5631; GCN2-NEXT:    s_addc_u32 s1, s1, s3
5632; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5633; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5634; GCN2-NEXT:    v_mov_b32_e32 v2, s4
5635; GCN2-NEXT:    flat_store_dword v[0:1], v2
5636; GCN2-NEXT:    s_endpgm
5637;
5638; GCN3-LABEL: atomic_store_i32_addr64:
5639; GCN3:       ; %bb.0: ; %entry
5640; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
5641; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x24
5642; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5643; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
5644; GCN3-NEXT:    s_add_u32 s0, s0, s2
5645; GCN3-NEXT:    s_addc_u32 s1, s1, s3
5646; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5647; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5648; GCN3-NEXT:    v_mov_b32_e32 v2, s6
5649; GCN3-NEXT:    flat_store_dword v[0:1], v2
5650; GCN3-NEXT:    s_endpgm
5651entry:
5652  %ptr = getelementptr i32, ptr %out, i64 %index
5653  store atomic i32 %in, ptr %ptr seq_cst, align 4
5654  ret void
5655}
5656
; Test: seq_cst atomic float load from %in plus 4 float elements (16 bytes);
; the loaded value is written to %out with a non-atomic store.
; Expected: the GCN1 and GCN2 checks add the 16 with s_add_u32/s_addc_u32 and
; use flat_load_dword with glc; the GCN3 checks use flat_load_dword with
; offset:16 and glc. In all three the load is followed by s_waitcnt and
; buffer_wbinvl1_vol before the flat_store_dword of the result.
5657define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
5658; GCN1-LABEL: atomic_load_f32_offset:
5659; GCN1:       ; %bb.0: ; %entry
5660; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5661; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5662; GCN1-NEXT:    s_add_u32 s0, s0, 16
5663; GCN1-NEXT:    s_addc_u32 s1, s1, 0
5664; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5665; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5666; GCN1-NEXT:    flat_load_dword v2, v[0:1] glc
5667; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5668; GCN1-NEXT:    buffer_wbinvl1_vol
5669; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5670; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5671; GCN1-NEXT:    flat_store_dword v[0:1], v2
5672; GCN1-NEXT:    s_endpgm
5673;
5674; GCN2-LABEL: atomic_load_f32_offset:
5675; GCN2:       ; %bb.0: ; %entry
5676; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5677; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5678; GCN2-NEXT:    s_add_u32 s0, s0, 16
5679; GCN2-NEXT:    s_addc_u32 s1, s1, 0
5680; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5681; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5682; GCN2-NEXT:    flat_load_dword v2, v[0:1] glc
5683; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5684; GCN2-NEXT:    buffer_wbinvl1_vol
5685; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5686; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5687; GCN2-NEXT:    flat_store_dword v[0:1], v2
5688; GCN2-NEXT:    s_endpgm
5689;
5690; GCN3-LABEL: atomic_load_f32_offset:
5691; GCN3:       ; %bb.0: ; %entry
5692; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5693; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5694; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5695; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5696; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16 glc
5697; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5698; GCN3-NEXT:    buffer_wbinvl1_vol
5699; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5700; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5701; GCN3-NEXT:    flat_store_dword v[0:1], v2
5702; GCN3-NEXT:    s_endpgm
5703entry:
5704  %gep = getelementptr float, ptr %in, i32 4
5705  %val = load atomic float, ptr %gep  seq_cst, align 4
5706  store float %val, ptr %out
5707  ret void
5708}
5709
; Test: seq_cst atomic float load directly from %in (no index, no constant
; offset); the loaded value is written to %out with a non-atomic store.
; Expected: apart from the kernel-argument load offset, the GCN1, GCN2 and
; GCN3 checks are the same sequence: flat_load_dword with glc, then s_waitcnt
; and buffer_wbinvl1_vol, then flat_store_dword of the result.
5710define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
5711; GCN1-LABEL: atomic_load_f32:
5712; GCN1:       ; %bb.0: ; %entry
5713; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5714; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5715; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5716; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5717; GCN1-NEXT:    flat_load_dword v2, v[0:1] glc
5718; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5719; GCN1-NEXT:    buffer_wbinvl1_vol
5720; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5721; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5722; GCN1-NEXT:    flat_store_dword v[0:1], v2
5723; GCN1-NEXT:    s_endpgm
5724;
5725; GCN2-LABEL: atomic_load_f32:
5726; GCN2:       ; %bb.0: ; %entry
5727; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5728; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5729; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5730; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5731; GCN2-NEXT:    flat_load_dword v2, v[0:1] glc
5732; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5733; GCN2-NEXT:    buffer_wbinvl1_vol
5734; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5735; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5736; GCN2-NEXT:    flat_store_dword v[0:1], v2
5737; GCN2-NEXT:    s_endpgm
5738;
5739; GCN3-LABEL: atomic_load_f32:
5740; GCN3:       ; %bb.0: ; %entry
5741; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5742; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5743; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5744; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5745; GCN3-NEXT:    flat_load_dword v2, v[0:1] glc
5746; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5747; GCN3-NEXT:    buffer_wbinvl1_vol
5748; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5749; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5750; GCN3-NEXT:    flat_store_dword v[0:1], v2
5751; GCN3-NEXT:    s_endpgm
5752entry:
5753  %val = load atomic float, ptr %in seq_cst, align 4
5754  store float %val, ptr %out
5755  ret void
5756}
5757
; Test: seq_cst atomic float load from %in + %index elements + 4 further float
; elements (16 bytes); the loaded value is written to %out with a non-atomic
; store.
; Expected: every target scales the i64 index with s_lshl_b64 by 2 and adds it
; to the base in SGPRs. The GCN1 and GCN2 checks add the 16 with a second
; s_add_u32/s_addc_u32 pair; the GCN3 checks put offset:16 on the
; flat_load_dword. The load carries glc and is followed by s_waitcnt and
; buffer_wbinvl1_vol on all three.
5758define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) {
5759; GCN1-LABEL: atomic_load_f32_addr64_offset:
5760; GCN1:       ; %bb.0: ; %entry
5761; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
5762; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5763; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5764; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5765; GCN1-NEXT:    s_add_u32 s0, s0, s4
5766; GCN1-NEXT:    s_addc_u32 s1, s1, s5
5767; GCN1-NEXT:    s_add_u32 s0, s0, 16
5768; GCN1-NEXT:    s_addc_u32 s1, s1, 0
5769; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5770; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5771; GCN1-NEXT:    flat_load_dword v2, v[0:1] glc
5772; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5773; GCN1-NEXT:    buffer_wbinvl1_vol
5774; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5775; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5776; GCN1-NEXT:    flat_store_dword v[0:1], v2
5777; GCN1-NEXT:    s_endpgm
5778;
5779; GCN2-LABEL: atomic_load_f32_addr64_offset:
5780; GCN2:       ; %bb.0: ; %entry
5781; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5782; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5783; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5784; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5785; GCN2-NEXT:    s_add_u32 s0, s0, s4
5786; GCN2-NEXT:    s_addc_u32 s1, s1, s5
5787; GCN2-NEXT:    s_add_u32 s0, s0, 16
5788; GCN2-NEXT:    s_addc_u32 s1, s1, 0
5789; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5790; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5791; GCN2-NEXT:    flat_load_dword v2, v[0:1] glc
5792; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5793; GCN2-NEXT:    buffer_wbinvl1_vol
5794; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5795; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5796; GCN2-NEXT:    flat_store_dword v[0:1], v2
5797; GCN2-NEXT:    s_endpgm
5798;
5799; GCN3-LABEL: atomic_load_f32_addr64_offset:
5800; GCN3:       ; %bb.0: ; %entry
5801; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5802; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5803; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5804; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5805; GCN3-NEXT:    s_add_u32 s0, s0, s4
5806; GCN3-NEXT:    s_addc_u32 s1, s1, s5
5807; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5808; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5809; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16 glc
5810; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5811; GCN3-NEXT:    buffer_wbinvl1_vol
5812; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5813; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5814; GCN3-NEXT:    flat_store_dword v[0:1], v2
5815; GCN3-NEXT:    s_endpgm
5816entry:
5817  %ptr = getelementptr float, ptr %in, i64 %index
5818  %gep = getelementptr float, ptr %ptr, i32 4
5819  %val = load atomic float, ptr %gep seq_cst, align 4
5820  store float %val, ptr %out
5821  ret void
5822}
5823
; Test: seq_cst atomic float load from %in + %index elements (i64 index, no
; constant offset); the loaded value is written to %out with a non-atomic
; store.
; Expected on all three targets: the index is scaled with s_lshl_b64 by 2 and
; added to the base in SGPRs, the load is a flat_load_dword with glc and no
; offset field, and it is followed by s_waitcnt and buffer_wbinvl1_vol.
5824define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) {
5825; GCN1-LABEL: atomic_load_f32_addr64:
5826; GCN1:       ; %bb.0: ; %entry
5827; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
5828; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5829; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5830; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5831; GCN1-NEXT:    s_add_u32 s0, s0, s4
5832; GCN1-NEXT:    s_addc_u32 s1, s1, s5
5833; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5834; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5835; GCN1-NEXT:    flat_load_dword v2, v[0:1] glc
5836; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5837; GCN1-NEXT:    buffer_wbinvl1_vol
5838; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5839; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5840; GCN1-NEXT:    flat_store_dword v[0:1], v2
5841; GCN1-NEXT:    s_endpgm
5842;
5843; GCN2-LABEL: atomic_load_f32_addr64:
5844; GCN2:       ; %bb.0: ; %entry
5845; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5846; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5847; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5848; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5849; GCN2-NEXT:    s_add_u32 s0, s0, s4
5850; GCN2-NEXT:    s_addc_u32 s1, s1, s5
5851; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5852; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5853; GCN2-NEXT:    flat_load_dword v2, v[0:1] glc
5854; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5855; GCN2-NEXT:    buffer_wbinvl1_vol
5856; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5857; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5858; GCN2-NEXT:    flat_store_dword v[0:1], v2
5859; GCN2-NEXT:    s_endpgm
5860;
5861; GCN3-LABEL: atomic_load_f32_addr64:
5862; GCN3:       ; %bb.0: ; %entry
5863; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5864; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5865; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5866; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
5867; GCN3-NEXT:    s_add_u32 s0, s0, s4
5868; GCN3-NEXT:    s_addc_u32 s1, s1, s5
5869; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5870; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5871; GCN3-NEXT:    flat_load_dword v2, v[0:1] glc
5872; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5873; GCN3-NEXT:    buffer_wbinvl1_vol
5874; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5875; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5876; GCN3-NEXT:    flat_store_dword v[0:1], v2
5877; GCN3-NEXT:    s_endpgm
5878entry:
5879  %ptr = getelementptr float, ptr %in, i64 %index
5880  %val = load atomic float, ptr %ptr seq_cst, align 4
5881  store float %val, ptr %out
5882  ret void
5883}
5884
; Test: seq_cst atomic float store of kernel argument %in to %out plus 4 float
; elements (16 bytes).
; Expected: the GCN1 and GCN2 checks add the 16 with s_add_u32/s_addc_u32
; before a plain flat_store_dword; the GCN3 checks carry it as offset:16 on
; the flat_store_dword. The value is moved from an SGPR into v2 and stored
; with the same dword store instruction the i32 variant of this test expects.
5885define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) {
5886; GCN1-LABEL: atomic_store_f32_offset:
5887; GCN1:       ; %bb.0: ; %entry
5888; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
5889; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
5890; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5891; GCN1-NEXT:    s_add_u32 s0, s0, 16
5892; GCN1-NEXT:    s_addc_u32 s1, s1, 0
5893; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5894; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5895; GCN1-NEXT:    v_mov_b32_e32 v2, s2
5896; GCN1-NEXT:    flat_store_dword v[0:1], v2
5897; GCN1-NEXT:    s_endpgm
5898;
5899; GCN2-LABEL: atomic_store_f32_offset:
5900; GCN2:       ; %bb.0: ; %entry
5901; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
5902; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
5903; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5904; GCN2-NEXT:    s_add_u32 s0, s0, 16
5905; GCN2-NEXT:    s_addc_u32 s1, s1, 0
5906; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5907; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5908; GCN2-NEXT:    v_mov_b32_e32 v2, s2
5909; GCN2-NEXT:    flat_store_dword v[0:1], v2
5910; GCN2-NEXT:    s_endpgm
5911;
5912; GCN3-LABEL: atomic_store_f32_offset:
5913; GCN3:       ; %bb.0: ; %entry
5914; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
5915; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
5916; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5917; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5918; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5919; GCN3-NEXT:    v_mov_b32_e32 v2, s2
5920; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
5921; GCN3-NEXT:    s_endpgm
5922entry:
5923  %gep = getelementptr float, ptr %out, i32 4
5924  store atomic float %in, ptr %gep  seq_cst, align 4
5925  ret void
5926}
5927
; Test: seq_cst atomic float store of kernel argument %in directly to %out
; (no index, no constant offset).
; Expected: apart from the kernel-argument load offsets, the GCN1, GCN2 and
; GCN3 checks are the same sequence: load the arguments into SGPRs, copy them
; into v0-v2 and issue a single flat_store_dword before s_endpgm.
5928define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) {
5929; GCN1-LABEL: atomic_store_f32:
5930; GCN1:       ; %bb.0: ; %entry
5931; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
5932; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
5933; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5934; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5935; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5936; GCN1-NEXT:    v_mov_b32_e32 v2, s2
5937; GCN1-NEXT:    flat_store_dword v[0:1], v2
5938; GCN1-NEXT:    s_endpgm
5939;
5940; GCN2-LABEL: atomic_store_f32:
5941; GCN2:       ; %bb.0: ; %entry
5942; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
5943; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
5944; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5945; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5946; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5947; GCN2-NEXT:    v_mov_b32_e32 v2, s2
5948; GCN2-NEXT:    flat_store_dword v[0:1], v2
5949; GCN2-NEXT:    s_endpgm
5950;
5951; GCN3-LABEL: atomic_store_f32:
5952; GCN3:       ; %bb.0: ; %entry
5953; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
5954; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
5955; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5956; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5957; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5958; GCN3-NEXT:    v_mov_b32_e32 v2, s2
5959; GCN3-NEXT:    flat_store_dword v[0:1], v2
5960; GCN3-NEXT:    s_endpgm
5961entry:
5962  store atomic float %in, ptr %out seq_cst, align 4
5963  ret void
5964}
5965
; Test: seq_cst atomic float store of %in to %out + %index elements + 4
; further float elements (16 bytes).
; Expected: every target scales the i64 index with s_lshl_b64 by 2 and adds it
; to the base in SGPRs. The GCN1 and GCN2 checks then add the 16 with a second
; s_add_u32/s_addc_u32 pair; the GCN3 checks instead put offset:16 on the
; flat_store_dword.
5966define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) {
5967; GCN1-LABEL: atomic_store_f32_addr64_offset:
5968; GCN1:       ; %bb.0: ; %entry
5969; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
5970; GCN1-NEXT:    s_load_dword s4, s[4:5], 0x9
5971; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5972; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
5973; GCN1-NEXT:    s_add_u32 s0, s0, s2
5974; GCN1-NEXT:    s_addc_u32 s1, s1, s3
5975; GCN1-NEXT:    s_add_u32 s0, s0, 16
5976; GCN1-NEXT:    s_addc_u32 s1, s1, 0
5977; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5978; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5979; GCN1-NEXT:    v_mov_b32_e32 v2, s4
5980; GCN1-NEXT:    flat_store_dword v[0:1], v2
5981; GCN1-NEXT:    s_endpgm
5982;
5983; GCN2-LABEL: atomic_store_f32_addr64_offset:
5984; GCN2:       ; %bb.0: ; %entry
5985; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
5986; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x24
5987; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5988; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
5989; GCN2-NEXT:    s_add_u32 s0, s0, s2
5990; GCN2-NEXT:    s_addc_u32 s1, s1, s3
5991; GCN2-NEXT:    s_add_u32 s0, s0, 16
5992; GCN2-NEXT:    s_addc_u32 s1, s1, 0
5993; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5994; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5995; GCN2-NEXT:    v_mov_b32_e32 v2, s4
5996; GCN2-NEXT:    flat_store_dword v[0:1], v2
5997; GCN2-NEXT:    s_endpgm
5998;
5999; GCN3-LABEL: atomic_store_f32_addr64_offset:
6000; GCN3:       ; %bb.0: ; %entry
6001; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
6002; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x24
6003; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6004; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
6005; GCN3-NEXT:    s_add_u32 s0, s0, s2
6006; GCN3-NEXT:    s_addc_u32 s1, s1, s3
6007; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6008; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6009; GCN3-NEXT:    v_mov_b32_e32 v2, s6
6010; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
6011; GCN3-NEXT:    s_endpgm
6012entry:
6013  %ptr = getelementptr float, ptr %out, i64 %index
6014  %gep = getelementptr float, ptr %ptr, i32 4
6015  store atomic float %in, ptr %gep seq_cst, align 4
6016  ret void
6017}
6018
; Test: seq_cst atomic float store of %in to %out + %index elements (i64
; index, no constant offset).
; Expected on all three targets: the index is scaled with s_lshl_b64 by 2 and
; added to the base with s_add_u32/s_addc_u32, then a single flat_store_dword
; without an offset field is issued before s_endpgm.
6019define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) {
6020; GCN1-LABEL: atomic_store_f32_addr64:
6021; GCN1:       ; %bb.0: ; %entry
6022; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
6023; GCN1-NEXT:    s_load_dword s4, s[4:5], 0x9
6024; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6025; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
6026; GCN1-NEXT:    s_add_u32 s0, s0, s2
6027; GCN1-NEXT:    s_addc_u32 s1, s1, s3
6028; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6029; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6030; GCN1-NEXT:    v_mov_b32_e32 v2, s4
6031; GCN1-NEXT:    flat_store_dword v[0:1], v2
6032; GCN1-NEXT:    s_endpgm
6033;
6034; GCN2-LABEL: atomic_store_f32_addr64:
6035; GCN2:       ; %bb.0: ; %entry
6036; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
6037; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x24
6038; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6039; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
6040; GCN2-NEXT:    s_add_u32 s0, s0, s2
6041; GCN2-NEXT:    s_addc_u32 s1, s1, s3
6042; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6043; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6044; GCN2-NEXT:    v_mov_b32_e32 v2, s4
6045; GCN2-NEXT:    flat_store_dword v[0:1], v2
6046; GCN2-NEXT:    s_endpgm
6047;
6048; GCN3-LABEL: atomic_store_f32_addr64:
6049; GCN3:       ; %bb.0: ; %entry
6050; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
6051; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x24
6052; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6053; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
6054; GCN3-NEXT:    s_add_u32 s0, s0, s2
6055; GCN3-NEXT:    s_addc_u32 s1, s1, s3
6056; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6057; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6058; GCN3-NEXT:    v_mov_b32_e32 v2, s6
6059; GCN3-NEXT:    flat_store_dword v[0:1], v2
6060; GCN3-NEXT:    s_endpgm
6061entry:
6062  %ptr = getelementptr float, ptr %out, i64 %index
6063  store atomic float %in, ptr %ptr seq_cst, align 4
6064  ret void
6065}
6066
; Test: seq_cst atomic i8 load (align 1) from %in plus a constant 16 bytes;
; the loaded value is written to %out with a non-atomic i8 store.
; Expected: the GCN1 and GCN2 checks add the 16 with s_add_u32/s_addc_u32 and
; use flat_load_ubyte with glc; the GCN3 checks use flat_load_ubyte with
; offset:16 and glc. In all three the load is followed by s_waitcnt and
; buffer_wbinvl1_vol, and the result is stored with flat_store_byte.
6067define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
6068; GCN1-LABEL: atomic_load_i8_offset:
6069; GCN1:       ; %bb.0: ; %entry
6070; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6071; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6072; GCN1-NEXT:    s_add_u32 s0, s0, 16
6073; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6074; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6075; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6076; GCN1-NEXT:    flat_load_ubyte v2, v[0:1] glc
6077; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6078; GCN1-NEXT:    buffer_wbinvl1_vol
6079; GCN1-NEXT:    v_mov_b32_e32 v0, s2
6080; GCN1-NEXT:    v_mov_b32_e32 v1, s3
6081; GCN1-NEXT:    flat_store_byte v[0:1], v2
6082; GCN1-NEXT:    s_endpgm
6083;
6084; GCN2-LABEL: atomic_load_i8_offset:
6085; GCN2:       ; %bb.0: ; %entry
6086; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6087; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6088; GCN2-NEXT:    s_add_u32 s0, s0, 16
6089; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6090; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6091; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6092; GCN2-NEXT:    flat_load_ubyte v2, v[0:1] glc
6093; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6094; GCN2-NEXT:    buffer_wbinvl1_vol
6095; GCN2-NEXT:    v_mov_b32_e32 v0, s2
6096; GCN2-NEXT:    v_mov_b32_e32 v1, s3
6097; GCN2-NEXT:    flat_store_byte v[0:1], v2
6098; GCN2-NEXT:    s_endpgm
6099;
6100; GCN3-LABEL: atomic_load_i8_offset:
6101; GCN3:       ; %bb.0: ; %entry
6102; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6103; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6104; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6105; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6106; GCN3-NEXT:    flat_load_ubyte v2, v[0:1] offset:16 glc
6107; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6108; GCN3-NEXT:    buffer_wbinvl1_vol
6109; GCN3-NEXT:    v_mov_b32_e32 v0, s2
6110; GCN3-NEXT:    v_mov_b32_e32 v1, s3
6111; GCN3-NEXT:    flat_store_byte v[0:1], v2
6112; GCN3-NEXT:    s_endpgm
6113entry:
6114  %gep = getelementptr i8, ptr %in, i64 16
6115  %val = load atomic i8, ptr %gep  seq_cst, align 1
6116  store i8 %val, ptr %out
6117  ret void
6118}
6119
; Test: seq_cst atomic i8 load (align 1) directly from %in (no index, no
; constant offset); the loaded value is written to %out with a non-atomic i8
; store.
; Expected: apart from the kernel-argument load offset, the GCN1, GCN2 and
; GCN3 checks are the same sequence: flat_load_ubyte with glc, then s_waitcnt
; and buffer_wbinvl1_vol, then flat_store_byte of the result.
6120define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
6121; GCN1-LABEL: atomic_load_i8:
6122; GCN1:       ; %bb.0: ; %entry
6123; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6124; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6125; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6126; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6127; GCN1-NEXT:    flat_load_ubyte v2, v[0:1] glc
6128; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6129; GCN1-NEXT:    buffer_wbinvl1_vol
6130; GCN1-NEXT:    v_mov_b32_e32 v0, s2
6131; GCN1-NEXT:    v_mov_b32_e32 v1, s3
6132; GCN1-NEXT:    flat_store_byte v[0:1], v2
6133; GCN1-NEXT:    s_endpgm
6134;
6135; GCN2-LABEL: atomic_load_i8:
6136; GCN2:       ; %bb.0: ; %entry
6137; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6138; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6139; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6140; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6141; GCN2-NEXT:    flat_load_ubyte v2, v[0:1] glc
6142; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6143; GCN2-NEXT:    buffer_wbinvl1_vol
6144; GCN2-NEXT:    v_mov_b32_e32 v0, s2
6145; GCN2-NEXT:    v_mov_b32_e32 v1, s3
6146; GCN2-NEXT:    flat_store_byte v[0:1], v2
6147; GCN2-NEXT:    s_endpgm
6148;
6149; GCN3-LABEL: atomic_load_i8:
6150; GCN3:       ; %bb.0: ; %entry
6151; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6152; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6153; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6154; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6155; GCN3-NEXT:    flat_load_ubyte v2, v[0:1] glc
6156; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6157; GCN3-NEXT:    buffer_wbinvl1_vol
6158; GCN3-NEXT:    v_mov_b32_e32 v0, s2
6159; GCN3-NEXT:    v_mov_b32_e32 v1, s3
6160; GCN3-NEXT:    flat_store_byte v[0:1], v2
6161; GCN3-NEXT:    s_endpgm
6162entry:
6163  %val = load atomic i8, ptr %in seq_cst, align 1
6164  store i8 %val, ptr %out
6165  ret void
6166}
6167
; Test: seq_cst atomic i8 load (align 1) from %in + %index bytes + a constant
; 16 bytes; the loaded value is written to %out with a non-atomic i8 store.
; Expected: because the element type is i8, no target's checks contain a
; shift; the i64 index is added to the base directly with
; s_add_u32/s_addc_u32. The GCN1 and GCN2 checks add the 16 with a second add
; pair; the GCN3 checks put offset:16 on the flat_load_ubyte. The load carries
; glc and is followed by s_waitcnt and buffer_wbinvl1_vol on all three.
6168define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) {
6169; GCN1-LABEL: atomic_load_i8_addr64_offset:
6170; GCN1:       ; %bb.0: ; %entry
6171; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6172; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
6173; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6174; GCN1-NEXT:    s_add_u32 s0, s0, s4
6175; GCN1-NEXT:    s_addc_u32 s1, s1, s5
6176; GCN1-NEXT:    s_add_u32 s0, s0, 16
6177; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6178; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6179; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6180; GCN1-NEXT:    flat_load_ubyte v2, v[0:1] glc
6181; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6182; GCN1-NEXT:    buffer_wbinvl1_vol
6183; GCN1-NEXT:    v_mov_b32_e32 v0, s2
6184; GCN1-NEXT:    v_mov_b32_e32 v1, s3
6185; GCN1-NEXT:    flat_store_byte v[0:1], v2
6186; GCN1-NEXT:    s_endpgm
6187;
6188; GCN2-LABEL: atomic_load_i8_addr64_offset:
6189; GCN2:       ; %bb.0: ; %entry
6190; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6191; GCN2-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
6192; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6193; GCN2-NEXT:    s_add_u32 s0, s0, s4
6194; GCN2-NEXT:    s_addc_u32 s1, s1, s5
6195; GCN2-NEXT:    s_add_u32 s0, s0, 16
6196; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6197; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6198; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6199; GCN2-NEXT:    flat_load_ubyte v2, v[0:1] glc
6200; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6201; GCN2-NEXT:    buffer_wbinvl1_vol
6202; GCN2-NEXT:    v_mov_b32_e32 v0, s2
6203; GCN2-NEXT:    v_mov_b32_e32 v1, s3
6204; GCN2-NEXT:    flat_store_byte v[0:1], v2
6205; GCN2-NEXT:    s_endpgm
6206;
6207; GCN3-LABEL: atomic_load_i8_addr64_offset:
6208; GCN3:       ; %bb.0: ; %entry
6209; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6210; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6211; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6212; GCN3-NEXT:    s_add_u32 s0, s0, s6
6213; GCN3-NEXT:    s_addc_u32 s1, s1, s7
6214; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6215; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6216; GCN3-NEXT:    flat_load_ubyte v2, v[0:1] offset:16 glc
6217; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6218; GCN3-NEXT:    buffer_wbinvl1_vol
6219; GCN3-NEXT:    v_mov_b32_e32 v0, s2
6220; GCN3-NEXT:    v_mov_b32_e32 v1, s3
6221; GCN3-NEXT:    flat_store_byte v[0:1], v2
6222; GCN3-NEXT:    s_endpgm
6223entry:
6224  %ptr = getelementptr i8, ptr %in, i64 %index
6225  %gep = getelementptr i8, ptr %ptr, i64 16
6226  %val = load atomic i8, ptr %gep seq_cst, align 1
6227  store i8 %val, ptr %out
6228  ret void
6229}
6230
; Test: seq_cst atomic i8 store (align 1) of kernel argument %in to %out plus
; a constant 16 bytes.
; Expected: the i8 argument is fetched with a full s_load_dword and moved into
; v2. The GCN1 and GCN2 checks add the 16 with s_add_u32/s_addc_u32 before a
; plain flat_store_byte; the GCN3 checks carry it as offset:16 on the
; flat_store_byte.
6231define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) {
6232; GCN1-LABEL: atomic_store_i8_offset:
6233; GCN1:       ; %bb.0: ; %entry
6234; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
6235; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
6236; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6237; GCN1-NEXT:    s_add_u32 s0, s0, 16
6238; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6239; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6240; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6241; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6242; GCN1-NEXT:    flat_store_byte v[0:1], v2
6243; GCN1-NEXT:    s_endpgm
6244;
6245; GCN2-LABEL: atomic_store_i8_offset:
6246; GCN2:       ; %bb.0: ; %entry
6247; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6248; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
6249; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6250; GCN2-NEXT:    s_add_u32 s0, s0, 16
6251; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6252; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6253; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6254; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6255; GCN2-NEXT:    flat_store_byte v[0:1], v2
6256; GCN2-NEXT:    s_endpgm
6257;
6258; GCN3-LABEL: atomic_store_i8_offset:
6259; GCN3:       ; %bb.0: ; %entry
6260; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6261; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
6262; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6263; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6264; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6265; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6266; GCN3-NEXT:    flat_store_byte v[0:1], v2 offset:16
6267; GCN3-NEXT:    s_endpgm
6268entry:
6269  %gep = getelementptr i8, ptr %out, i64 16
6270  store atomic i8 %in, ptr %gep  seq_cst, align 1
6271  ret void
6272}
6273
; Seq_cst atomic i8 store of %in directly to %out (no address arithmetic).
; All three check prefixes expect a plain flat_store_byte with no offset.
6274define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) {
6275; GCN1-LABEL: atomic_store_i8:
6276; GCN1:       ; %bb.0: ; %entry
6277; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
6278; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
6279; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6280; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6281; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6282; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6283; GCN1-NEXT:    flat_store_byte v[0:1], v2
6284; GCN1-NEXT:    s_endpgm
6285;
6286; GCN2-LABEL: atomic_store_i8:
6287; GCN2:       ; %bb.0: ; %entry
6288; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6289; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
6290; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6291; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6292; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6293; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6294; GCN2-NEXT:    flat_store_byte v[0:1], v2
6295; GCN2-NEXT:    s_endpgm
6296;
6297; GCN3-LABEL: atomic_store_i8:
6298; GCN3:       ; %bb.0: ; %entry
6299; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6300; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
6301; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6302; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6303; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6304; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6305; GCN3-NEXT:    flat_store_byte v[0:1], v2
6306; GCN3-NEXT:    s_endpgm
6307entry:
6308  store atomic i8 %in, ptr %out seq_cst, align 1
6309  ret void
6310}
6311
; Seq_cst atomic i8 store of %in to %out + %index + 16 bytes.
; Every prefix expects %index to be added with s_add_u32/s_addc_u32 (no shift,
; since the element is one byte). GCN1/GCN2 add the constant 16 with a second
; scalar add pair; GCN3 checks expect it in the offset:16 field instead.
6312define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) {
6313; GCN1-LABEL: atomic_store_i8_addr64_offset:
6314; GCN1:       ; %bb.0: ; %entry
6315; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
6316; GCN1-NEXT:    s_load_dword s4, s[4:5], 0x9
6317; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6318; GCN1-NEXT:    s_add_u32 s0, s0, s2
6319; GCN1-NEXT:    s_addc_u32 s1, s1, s3
6320; GCN1-NEXT:    s_add_u32 s0, s0, 16
6321; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6322; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6323; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6324; GCN1-NEXT:    v_mov_b32_e32 v2, s4
6325; GCN1-NEXT:    flat_store_byte v[0:1], v2
6326; GCN1-NEXT:    s_endpgm
6327;
6328; GCN2-LABEL: atomic_store_i8_addr64_offset:
6329; GCN2:       ; %bb.0: ; %entry
6330; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
6331; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x24
6332; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6333; GCN2-NEXT:    s_add_u32 s0, s0, s2
6334; GCN2-NEXT:    s_addc_u32 s1, s1, s3
6335; GCN2-NEXT:    s_add_u32 s0, s0, 16
6336; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6337; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6338; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6339; GCN2-NEXT:    v_mov_b32_e32 v2, s4
6340; GCN2-NEXT:    flat_store_byte v[0:1], v2
6341; GCN2-NEXT:    s_endpgm
6342;
6343; GCN3-LABEL: atomic_store_i8_addr64_offset:
6344; GCN3:       ; %bb.0: ; %entry
6345; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
6346; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x24
6347; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6348; GCN3-NEXT:    s_add_u32 s0, s0, s2
6349; GCN3-NEXT:    s_addc_u32 s1, s1, s3
6350; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6351; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6352; GCN3-NEXT:    v_mov_b32_e32 v2, s6
6353; GCN3-NEXT:    flat_store_byte v[0:1], v2 offset:16
6354; GCN3-NEXT:    s_endpgm
6355entry:
6356  %ptr = getelementptr i8, ptr %out, i64 %index
6357  %gep = getelementptr i8, ptr %ptr, i64 16
6358  store atomic i8 %in, ptr %gep seq_cst, align 1
6359  ret void
6360}
6361
; Seq_cst atomic i16 load from %in + 16 bytes (8 x i16); the loaded value is
; written to %out with a plain store.
; Every prefix expects a glc flat_load_ushort followed by
; s_waitcnt vmcnt(0) lgkmcnt(0) and buffer_wbinvl1_vol before the store.
; GCN1/GCN2 add the 16 with s_add_u32/s_addc_u32; GCN3 uses offset:16.
6362define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
6363; GCN1-LABEL: atomic_load_i16_offset:
6364; GCN1:       ; %bb.0: ; %entry
6365; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6366; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6367; GCN1-NEXT:    s_add_u32 s0, s0, 16
6368; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6369; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6370; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6371; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
6372; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6373; GCN1-NEXT:    buffer_wbinvl1_vol
6374; GCN1-NEXT:    v_mov_b32_e32 v0, s2
6375; GCN1-NEXT:    v_mov_b32_e32 v1, s3
6376; GCN1-NEXT:    flat_store_short v[0:1], v2
6377; GCN1-NEXT:    s_endpgm
6378;
6379; GCN2-LABEL: atomic_load_i16_offset:
6380; GCN2:       ; %bb.0: ; %entry
6381; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6382; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6383; GCN2-NEXT:    s_add_u32 s0, s0, 16
6384; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6385; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6386; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6387; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
6388; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6389; GCN2-NEXT:    buffer_wbinvl1_vol
6390; GCN2-NEXT:    v_mov_b32_e32 v0, s2
6391; GCN2-NEXT:    v_mov_b32_e32 v1, s3
6392; GCN2-NEXT:    flat_store_short v[0:1], v2
6393; GCN2-NEXT:    s_endpgm
6394;
6395; GCN3-LABEL: atomic_load_i16_offset:
6396; GCN3:       ; %bb.0: ; %entry
6397; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6398; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6399; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6400; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6401; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
6402; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6403; GCN3-NEXT:    buffer_wbinvl1_vol
6404; GCN3-NEXT:    v_mov_b32_e32 v0, s2
6405; GCN3-NEXT:    v_mov_b32_e32 v1, s3
6406; GCN3-NEXT:    flat_store_short v[0:1], v2
6407; GCN3-NEXT:    s_endpgm
6408entry:
6409  %gep = getelementptr i16, ptr %in, i64 8
6410  %val = load atomic i16, ptr %gep  seq_cst, align 2
6411  store i16 %val, ptr %out
6412  ret void
6413}
6414
; Seq_cst atomic i16 load directly from %in; the loaded value is written to
; %out with a plain store.
; Every prefix expects a glc flat_load_ushort with no offset, followed by
; s_waitcnt vmcnt(0) lgkmcnt(0) and buffer_wbinvl1_vol before the store.
6415define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
6416; GCN1-LABEL: atomic_load_i16:
6417; GCN1:       ; %bb.0: ; %entry
6418; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6419; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6420; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6421; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6422; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
6423; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6424; GCN1-NEXT:    buffer_wbinvl1_vol
6425; GCN1-NEXT:    v_mov_b32_e32 v0, s2
6426; GCN1-NEXT:    v_mov_b32_e32 v1, s3
6427; GCN1-NEXT:    flat_store_short v[0:1], v2
6428; GCN1-NEXT:    s_endpgm
6429;
6430; GCN2-LABEL: atomic_load_i16:
6431; GCN2:       ; %bb.0: ; %entry
6432; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6433; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6434; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6435; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6436; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
6437; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6438; GCN2-NEXT:    buffer_wbinvl1_vol
6439; GCN2-NEXT:    v_mov_b32_e32 v0, s2
6440; GCN2-NEXT:    v_mov_b32_e32 v1, s3
6441; GCN2-NEXT:    flat_store_short v[0:1], v2
6442; GCN2-NEXT:    s_endpgm
6443;
6444; GCN3-LABEL: atomic_load_i16:
6445; GCN3:       ; %bb.0: ; %entry
6446; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6447; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6448; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6449; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6450; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
6451; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6452; GCN3-NEXT:    buffer_wbinvl1_vol
6453; GCN3-NEXT:    v_mov_b32_e32 v0, s2
6454; GCN3-NEXT:    v_mov_b32_e32 v1, s3
6455; GCN3-NEXT:    flat_store_short v[0:1], v2
6456; GCN3-NEXT:    s_endpgm
6457entry:
6458  %val = load atomic i16, ptr %in seq_cst, align 2
6459  store i16 %val, ptr %out
6460  ret void
6461}
6462
; Seq_cst atomic i16 load from %in + %index * 2 + 16 bytes; the loaded value
; is written to %out with a plain store.
; Every prefix expects %index to be scaled with s_lshl_b64 ..., 1 and added to
; the base with s_add_u32/s_addc_u32. GCN1/GCN2 add the constant 16 with a
; second scalar add pair; GCN3 checks expect it in the offset:16 field.
6463define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) {
6464; GCN1-LABEL: atomic_load_i16_addr64_offset:
6465; GCN1:       ; %bb.0: ; %entry
6466; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
6467; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6468; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6469; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 1
6470; GCN1-NEXT:    s_add_u32 s0, s0, s4
6471; GCN1-NEXT:    s_addc_u32 s1, s1, s5
6472; GCN1-NEXT:    s_add_u32 s0, s0, 16
6473; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6474; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6475; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6476; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
6477; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6478; GCN1-NEXT:    buffer_wbinvl1_vol
6479; GCN1-NEXT:    v_mov_b32_e32 v0, s2
6480; GCN1-NEXT:    v_mov_b32_e32 v1, s3
6481; GCN1-NEXT:    flat_store_short v[0:1], v2
6482; GCN1-NEXT:    s_endpgm
6483;
6484; GCN2-LABEL: atomic_load_i16_addr64_offset:
6485; GCN2:       ; %bb.0: ; %entry
6486; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6487; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6488; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6489; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 1
6490; GCN2-NEXT:    s_add_u32 s0, s0, s4
6491; GCN2-NEXT:    s_addc_u32 s1, s1, s5
6492; GCN2-NEXT:    s_add_u32 s0, s0, 16
6493; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6494; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6495; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6496; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
6497; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6498; GCN2-NEXT:    buffer_wbinvl1_vol
6499; GCN2-NEXT:    v_mov_b32_e32 v0, s2
6500; GCN2-NEXT:    v_mov_b32_e32 v1, s3
6501; GCN2-NEXT:    flat_store_short v[0:1], v2
6502; GCN2-NEXT:    s_endpgm
6503;
6504; GCN3-LABEL: atomic_load_i16_addr64_offset:
6505; GCN3:       ; %bb.0: ; %entry
6506; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6507; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6508; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6509; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 1
6510; GCN3-NEXT:    s_add_u32 s0, s0, s4
6511; GCN3-NEXT:    s_addc_u32 s1, s1, s5
6512; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6513; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6514; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
6515; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6516; GCN3-NEXT:    buffer_wbinvl1_vol
6517; GCN3-NEXT:    v_mov_b32_e32 v0, s2
6518; GCN3-NEXT:    v_mov_b32_e32 v1, s3
6519; GCN3-NEXT:    flat_store_short v[0:1], v2
6520; GCN3-NEXT:    s_endpgm
6521entry:
6522  %ptr = getelementptr i16, ptr %in, i64 %index
6523  %gep = getelementptr i16, ptr %ptr, i64 8
6524  %val = load atomic i16, ptr %gep seq_cst, align 2
6525  store i16 %val, ptr %out
6526  ret void
6527}
6528
; Seq_cst atomic i16 store of %in to %out + 16 bytes (8 x i16).
; GCN1/GCN2 checks expect the 16 to be added with s_add_u32/s_addc_u32 before
; flat_store_short; GCN3 checks expect it in the instruction's offset:16 field.
6529define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) {
6530; GCN1-LABEL: atomic_store_i16_offset:
6531; GCN1:       ; %bb.0: ; %entry
6532; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
6533; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
6534; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6535; GCN1-NEXT:    s_add_u32 s0, s0, 16
6536; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6537; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6538; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6539; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6540; GCN1-NEXT:    flat_store_short v[0:1], v2
6541; GCN1-NEXT:    s_endpgm
6542;
6543; GCN2-LABEL: atomic_store_i16_offset:
6544; GCN2:       ; %bb.0: ; %entry
6545; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6546; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
6547; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6548; GCN2-NEXT:    s_add_u32 s0, s0, 16
6549; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6550; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6551; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6552; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6553; GCN2-NEXT:    flat_store_short v[0:1], v2
6554; GCN2-NEXT:    s_endpgm
6555;
6556; GCN3-LABEL: atomic_store_i16_offset:
6557; GCN3:       ; %bb.0: ; %entry
6558; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6559; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
6560; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6561; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6562; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6563; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6564; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
6565; GCN3-NEXT:    s_endpgm
6566entry:
6567  %gep = getelementptr i16, ptr %out, i64 8
6568  store atomic i16 %in, ptr %gep  seq_cst, align 2
6569  ret void
6570}
6571
; Seq_cst atomic i16 store of %in directly to %out (no address arithmetic).
; All three check prefixes expect a plain flat_store_short with no offset.
6572define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) {
6573; GCN1-LABEL: atomic_store_i16:
6574; GCN1:       ; %bb.0: ; %entry
6575; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
6576; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
6577; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6578; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6579; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6580; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6581; GCN1-NEXT:    flat_store_short v[0:1], v2
6582; GCN1-NEXT:    s_endpgm
6583;
6584; GCN2-LABEL: atomic_store_i16:
6585; GCN2:       ; %bb.0: ; %entry
6586; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6587; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
6588; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6589; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6590; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6591; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6592; GCN2-NEXT:    flat_store_short v[0:1], v2
6593; GCN2-NEXT:    s_endpgm
6594;
6595; GCN3-LABEL: atomic_store_i16:
6596; GCN3:       ; %bb.0: ; %entry
6597; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6598; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
6599; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6600; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6601; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6602; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6603; GCN3-NEXT:    flat_store_short v[0:1], v2
6604; GCN3-NEXT:    s_endpgm
6605entry:
6606  store atomic i16 %in, ptr %out seq_cst, align 2
6607  ret void
6608}
6609
; Seq_cst atomic i16 store of %in to %out + %index * 2 + 16 bytes.
; Every prefix expects %index to be scaled with s_lshl_b64 ..., 1 and added to
; the base with s_add_u32/s_addc_u32. GCN1/GCN2 add the constant 16 with a
; second scalar add pair; GCN3 checks expect it in the offset:16 field.
6610define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) {
6611; GCN1-LABEL: atomic_store_i16_addr64_offset:
6612; GCN1:       ; %bb.0: ; %entry
6613; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
6614; GCN1-NEXT:    s_load_dword s4, s[4:5], 0x9
6615; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6616; GCN1-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
6617; GCN1-NEXT:    s_add_u32 s0, s0, s2
6618; GCN1-NEXT:    s_addc_u32 s1, s1, s3
6619; GCN1-NEXT:    s_add_u32 s0, s0, 16
6620; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6621; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6622; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6623; GCN1-NEXT:    v_mov_b32_e32 v2, s4
6624; GCN1-NEXT:    flat_store_short v[0:1], v2
6625; GCN1-NEXT:    s_endpgm
6626;
6627; GCN2-LABEL: atomic_store_i16_addr64_offset:
6628; GCN2:       ; %bb.0: ; %entry
6629; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
6630; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x24
6631; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6632; GCN2-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
6633; GCN2-NEXT:    s_add_u32 s0, s0, s2
6634; GCN2-NEXT:    s_addc_u32 s1, s1, s3
6635; GCN2-NEXT:    s_add_u32 s0, s0, 16
6636; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6637; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6638; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6639; GCN2-NEXT:    v_mov_b32_e32 v2, s4
6640; GCN2-NEXT:    flat_store_short v[0:1], v2
6641; GCN2-NEXT:    s_endpgm
6642;
6643; GCN3-LABEL: atomic_store_i16_addr64_offset:
6644; GCN3:       ; %bb.0: ; %entry
6645; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
6646; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x24
6647; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6648; GCN3-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
6649; GCN3-NEXT:    s_add_u32 s0, s0, s2
6650; GCN3-NEXT:    s_addc_u32 s1, s1, s3
6651; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6652; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6653; GCN3-NEXT:    v_mov_b32_e32 v2, s6
6654; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
6655; GCN3-NEXT:    s_endpgm
6656entry:
6657  %ptr = getelementptr i16, ptr %out, i64 %index
6658  %gep = getelementptr i16, ptr %ptr, i64 8
6659  store atomic i16 %in, ptr %gep seq_cst, align 2
6660  ret void
6661}
6662
; Seq_cst atomic half store of %in to %out + 16 bytes (8 x half).
; The expected code matches the i16 store form: GCN1/GCN2 add the 16 with
; s_add_u32/s_addc_u32 before flat_store_short; GCN3 uses offset:16.
6663define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) {
6664; GCN1-LABEL: atomic_store_f16_offset:
6665; GCN1:       ; %bb.0: ; %entry
6666; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
6667; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
6668; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6669; GCN1-NEXT:    s_add_u32 s0, s0, 16
6670; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6671; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6672; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6673; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6674; GCN1-NEXT:    flat_store_short v[0:1], v2
6675; GCN1-NEXT:    s_endpgm
6676;
6677; GCN2-LABEL: atomic_store_f16_offset:
6678; GCN2:       ; %bb.0: ; %entry
6679; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6680; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
6681; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6682; GCN2-NEXT:    s_add_u32 s0, s0, 16
6683; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6684; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6685; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6686; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6687; GCN2-NEXT:    flat_store_short v[0:1], v2
6688; GCN2-NEXT:    s_endpgm
6689;
6690; GCN3-LABEL: atomic_store_f16_offset:
6691; GCN3:       ; %bb.0: ; %entry
6692; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6693; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
6694; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6695; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6696; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6697; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6698; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
6699; GCN3-NEXT:    s_endpgm
6700entry:
6701  %gep = getelementptr half, ptr %out, i64 8
6702  store atomic half %in, ptr %gep  seq_cst, align 2
6703  ret void
6704}
6705
; Seq_cst atomic half store of %in directly to %out (no address arithmetic).
; All three check prefixes expect a plain flat_store_short with no offset.
6706define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) {
6707; GCN1-LABEL: atomic_store_f16:
6708; GCN1:       ; %bb.0: ; %entry
6709; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
6710; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
6711; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6712; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6713; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6714; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6715; GCN1-NEXT:    flat_store_short v[0:1], v2
6716; GCN1-NEXT:    s_endpgm
6717;
6718; GCN2-LABEL: atomic_store_f16:
6719; GCN2:       ; %bb.0: ; %entry
6720; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6721; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
6722; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6723; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6724; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6725; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6726; GCN2-NEXT:    flat_store_short v[0:1], v2
6727; GCN2-NEXT:    s_endpgm
6728;
6729; GCN3-LABEL: atomic_store_f16:
6730; GCN3:       ; %bb.0: ; %entry
6731; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6732; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
6733; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6734; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6735; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6736; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6737; GCN3-NEXT:    flat_store_short v[0:1], v2
6738; GCN3-NEXT:    s_endpgm
6739entry:
6740  store atomic half %in, ptr %out seq_cst, align 2
6741  ret void
6742}
6743
; Seq_cst atomic bfloat store of %in to %out + 16 bytes (8 x bfloat).
; The store must go through %gep; storing to %out leaves the GEP dead and
; makes this test identical to @atomic_store_bf16.
; GCN1/GCN2 checks expect the 16 to be added with s_add_u32/s_addc_u32 before
; flat_store_short; GCN3 checks expect it in the instruction's offset:16 field.
; NOTE(review): the offset CHECK lines follow the pattern of
; @atomic_store_f16_offset; regenerate with utils/update_llc_test_checks.py to
; confirm them.
6744define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) {
6745; GCN1-LABEL: atomic_store_bf16_offset:
6746; GCN1:       ; %bb.0:
6747; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
6748; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
6749; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6750; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6751; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6752; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6753; GCN1-NEXT:    flat_store_short v[0:1], v2
6754; GCN1-NEXT:    s_endpgm
6755;
6756; GCN2-LABEL: atomic_store_bf16_offset:
6757; GCN2:       ; %bb.0:
6758; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6759; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
6760; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6761; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6762; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6763; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6764; GCN2-NEXT:    flat_store_short v[0:1], v2
6765; GCN2-NEXT:    s_endpgm
6766;
6767; GCN3-LABEL: atomic_store_bf16_offset:
6768; GCN3:       ; %bb.0:
6769; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6770; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
6771; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6772; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6773; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6774; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6775; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
6776; GCN3-NEXT:    s_endpgm
6777  %gep = getelementptr bfloat, ptr %out, i64 8
6778  store atomic bfloat %in, ptr %gep seq_cst, align 2
6779  ret void
6780}
6781
; Seq_cst atomic bfloat store of %in directly to %out (no address arithmetic).
; All three check prefixes expect a plain flat_store_short with no offset.
; The function has no named entry block, so the checks match "; %bb.0:"
; without a trailing block-name comment.
6782define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) {
6783; GCN1-LABEL: atomic_store_bf16:
6784; GCN1:       ; %bb.0:
6785; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xb
6786; GCN1-NEXT:    s_load_dword s2, s[4:5], 0x9
6787; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6788; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6789; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6790; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6791; GCN1-NEXT:    flat_store_short v[0:1], v2
6792; GCN1-NEXT:    s_endpgm
6793;
6794; GCN2-LABEL: atomic_store_bf16:
6795; GCN2:       ; %bb.0:
6796; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6797; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x24
6798; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6799; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6800; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6801; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6802; GCN2-NEXT:    flat_store_short v[0:1], v2
6803; GCN2-NEXT:    s_endpgm
6804;
6805; GCN3-LABEL: atomic_store_bf16:
6806; GCN3:       ; %bb.0:
6807; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
6808; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x24
6809; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6810; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6811; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6812; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6813; GCN3-NEXT:    flat_store_short v[0:1], v2
6814; GCN3-NEXT:    s_endpgm
6815  store atomic bfloat %in, ptr %out seq_cst, align 2
6816  ret void
6817}
6818
; Agent-scope seq_cst volatile atomicrmw uinc_wrap on %out + 16 bytes
; (4 x i32); the result %val is unused.
; Every prefix expects the non-returning flat_atomic_inc form followed by
; s_waitcnt vmcnt(0) lgkmcnt(0) and buffer_wbinvl1_vol.
; GCN1/GCN2 add the 16 with s_add_u32/s_addc_u32; GCN3 uses offset:16.
6819define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
6820; GCN1-LABEL: atomic_inc_i32_offset:
6821; GCN1:       ; %bb.0: ; %entry
6822; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
6823; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
6824; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6825; GCN1-NEXT:    s_add_u32 s0, s0, 16
6826; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6827; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6828; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6829; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6830; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
6831; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6832; GCN1-NEXT:    buffer_wbinvl1_vol
6833; GCN1-NEXT:    s_endpgm
6834;
6835; GCN2-LABEL: atomic_inc_i32_offset:
6836; GCN2:       ; %bb.0: ; %entry
6837; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6838; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
6839; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6840; GCN2-NEXT:    s_add_u32 s0, s0, 16
6841; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6842; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6843; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6844; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6845; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
6846; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6847; GCN2-NEXT:    buffer_wbinvl1_vol
6848; GCN2-NEXT:    s_endpgm
6849;
6850; GCN3-LABEL: atomic_inc_i32_offset:
6851; GCN3:       ; %bb.0: ; %entry
6852; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6853; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
6854; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6855; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6856; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6857; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6858; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
6859; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6860; GCN3-NEXT:    buffer_wbinvl1_vol
6861; GCN3-NEXT:    s_endpgm
6862entry:
6863  %gep = getelementptr i32, ptr %out, i32 4
6864  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
6865  ret void
6866}
6867
; Agent-scope seq_cst volatile atomicrmw uinc_wrap on %out + 4092 bytes
; (1023 x i32 = 0xffc); the result %val is unused.
; GCN1/GCN2 checks expect 0xffc to be added with s_add_u32/s_addc_u32;
; GCN3 checks expect the whole constant in the offset:4092 field.
; NOTE(review): the name suggests 4092 is the largest dword-aligned offset the
; GCN3 encoding accepts - confirm against the ISA documentation.
6868define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) {
6869; GCN1-LABEL: atomic_inc_i32_max_offset:
6870; GCN1:       ; %bb.0: ; %entry
6871; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
6872; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
6873; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6874; GCN1-NEXT:    s_add_u32 s0, s0, 0xffc
6875; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6876; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6877; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6878; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6879; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
6880; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6881; GCN1-NEXT:    buffer_wbinvl1_vol
6882; GCN1-NEXT:    s_endpgm
6883;
6884; GCN2-LABEL: atomic_inc_i32_max_offset:
6885; GCN2:       ; %bb.0: ; %entry
6886; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6887; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
6888; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6889; GCN2-NEXT:    s_add_u32 s0, s0, 0xffc
6890; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6891; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6892; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6893; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6894; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
6895; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6896; GCN2-NEXT:    buffer_wbinvl1_vol
6897; GCN2-NEXT:    s_endpgm
6898;
6899; GCN3-LABEL: atomic_inc_i32_max_offset:
6900; GCN3:       ; %bb.0: ; %entry
6901; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6902; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
6903; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6904; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6905; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6906; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6907; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:4092
6908; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6909; GCN3-NEXT:    buffer_wbinvl1_vol
6910; GCN3-NEXT:    s_endpgm
6911entry:
6912  %gep = getelementptr i32, ptr %out, i32 1023
6913  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
6914  ret void
6915}
6916
; Agent-scope seq_cst volatile atomicrmw uinc_wrap on %out + 4096 bytes
; (1024 x i32 = 0x1000); the result %val is unused.
; Unlike @atomic_inc_i32_max_offset, the GCN3 checks here expect no offset
; field: 0x1000 is added to the address with v_add_co_u32/v_addc_co_u32.
; GCN1/GCN2 checks expect the usual s_add_u32/s_addc_u32 pair.
; NOTE(review): presumably 0x1000 is one step past what the GCN3 offset field
; can hold - confirm against the ISA documentation.
6917define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) {
6918; GCN1-LABEL: atomic_inc_i32_max_offset_p1:
6919; GCN1:       ; %bb.0: ; %entry
6920; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
6921; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
6922; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6923; GCN1-NEXT:    s_add_u32 s0, s0, 0x1000
6924; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6925; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6926; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6927; GCN1-NEXT:    v_mov_b32_e32 v2, s2
6928; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
6929; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6930; GCN1-NEXT:    buffer_wbinvl1_vol
6931; GCN1-NEXT:    s_endpgm
6932;
6933; GCN2-LABEL: atomic_inc_i32_max_offset_p1:
6934; GCN2:       ; %bb.0: ; %entry
6935; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6936; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
6937; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6938; GCN2-NEXT:    s_add_u32 s0, s0, 0x1000
6939; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6940; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6941; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6942; GCN2-NEXT:    v_mov_b32_e32 v2, s2
6943; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
6944; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6945; GCN2-NEXT:    buffer_wbinvl1_vol
6946; GCN2-NEXT:    s_endpgm
6947;
6948; GCN3-LABEL: atomic_inc_i32_max_offset_p1:
6949; GCN3:       ; %bb.0: ; %entry
6950; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6951; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
6952; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6953; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6954; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6955; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
6956; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
6957; GCN3-NEXT:    v_mov_b32_e32 v2, s2
6958; GCN3-NEXT:    flat_atomic_inc v[0:1], v2
6959; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6960; GCN3-NEXT:    buffer_wbinvl1_vol
6961; GCN3-NEXT:    s_endpgm
6962entry:
6963  %gep = getelementptr i32, ptr %out, i32 1024
6964  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
6965  ret void
6966}
6967
; Agent-scope seq_cst volatile atomicrmw uinc_wrap on %out + 16 bytes
; (4 x i32); the returned value is stored to %out2.
; Every prefix expects the returning form (flat_atomic_inc v2, ... glc)
; followed by s_waitcnt vmcnt(0) lgkmcnt(0) and buffer_wbinvl1_vol, then a
; flat_store_dword of the result.
; GCN1/GCN2 add the 16 with s_add_u32/s_addc_u32; GCN3 uses offset:16.
6968define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
6969; GCN1-LABEL: atomic_inc_i32_ret_offset:
6970; GCN1:       ; %bb.0: ; %entry
6971; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6972; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
6973; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6974; GCN1-NEXT:    s_add_u32 s0, s0, 16
6975; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6976; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6977; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6978; GCN1-NEXT:    v_mov_b32_e32 v2, s4
6979; GCN1-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
6980; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6981; GCN1-NEXT:    buffer_wbinvl1_vol
6982; GCN1-NEXT:    v_mov_b32_e32 v0, s2
6983; GCN1-NEXT:    v_mov_b32_e32 v1, s3
6984; GCN1-NEXT:    flat_store_dword v[0:1], v2
6985; GCN1-NEXT:    s_endpgm
6986;
6987; GCN2-LABEL: atomic_inc_i32_ret_offset:
6988; GCN2:       ; %bb.0: ; %entry
6989; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6990; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
6991; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6992; GCN2-NEXT:    s_add_u32 s0, s0, 16
6993; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6994; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6995; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6996; GCN2-NEXT:    v_mov_b32_e32 v2, s4
6997; GCN2-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
6998; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6999; GCN2-NEXT:    buffer_wbinvl1_vol
7000; GCN2-NEXT:    v_mov_b32_e32 v0, s2
7001; GCN2-NEXT:    v_mov_b32_e32 v1, s3
7002; GCN2-NEXT:    flat_store_dword v[0:1], v2
7003; GCN2-NEXT:    s_endpgm
7004;
7005; GCN3-LABEL: atomic_inc_i32_ret_offset:
7006; GCN3:       ; %bb.0: ; %entry
7007; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7008; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
7009; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7010; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7011; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7012; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7013; GCN3-NEXT:    flat_atomic_inc v2, v[0:1], v2 offset:16 glc
7014; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7015; GCN3-NEXT:    buffer_wbinvl1_vol
7016; GCN3-NEXT:    v_mov_b32_e32 v0, s2
7017; GCN3-NEXT:    v_mov_b32_e32 v1, s3
7018; GCN3-NEXT:    flat_store_dword v[0:1], v2
7019; GCN3-NEXT:    s_endpgm
7020entry:
7021  %gep = getelementptr i32, ptr %out, i32 4
7022  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7023  store i32 %val, ptr %out2
7024  ret void
7025}
7026
; Agent-scope seq_cst volatile atomicrmw uinc_wrap on
; %out + %index * 4 + 16 bytes; the result %val is unused.
; Every prefix expects %index to be scaled with s_lshl_b64 ..., 2 and added to
; the base with s_add_u32/s_addc_u32. GCN1/GCN2 add the constant 16 with a
; second scalar add pair; GCN3 checks expect it in the offset:16 field.
; NOTE(review): "incr64" in the name looks like it corresponds to the "addr64"
; suffix used by the load/store tests in this file - confirm before renaming.
7027define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) {
7028; GCN1-LABEL: atomic_inc_i32_incr64_offset:
7029; GCN1:       ; %bb.0: ; %entry
7030; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
7031; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
7032; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
7033; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7034; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7035; GCN1-NEXT:    s_add_u32 s0, s2, s0
7036; GCN1-NEXT:    s_addc_u32 s1, s3, s1
7037; GCN1-NEXT:    s_add_u32 s0, s0, 16
7038; GCN1-NEXT:    s_addc_u32 s1, s1, 0
7039; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7040; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7041; GCN1-NEXT:    v_mov_b32_e32 v2, s4
7042; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
7043; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7044; GCN1-NEXT:    buffer_wbinvl1_vol
7045; GCN1-NEXT:    s_endpgm
7046;
7047; GCN2-LABEL: atomic_inc_i32_incr64_offset:
7048; GCN2:       ; %bb.0: ; %entry
7049; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
7050; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
7051; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
7052; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7053; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7054; GCN2-NEXT:    s_add_u32 s0, s2, s0
7055; GCN2-NEXT:    s_addc_u32 s1, s3, s1
7056; GCN2-NEXT:    s_add_u32 s0, s0, 16
7057; GCN2-NEXT:    s_addc_u32 s1, s1, 0
7058; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7059; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7060; GCN2-NEXT:    v_mov_b32_e32 v2, s4
7061; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
7062; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7063; GCN2-NEXT:    buffer_wbinvl1_vol
7064; GCN2-NEXT:    s_endpgm
7065;
7066; GCN3-LABEL: atomic_inc_i32_incr64_offset:
7067; GCN3:       ; %bb.0: ; %entry
7068; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
7069; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
7070; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
7071; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7072; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7073; GCN3-NEXT:    s_add_u32 s0, s2, s0
7074; GCN3-NEXT:    s_addc_u32 s1, s3, s1
7075; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7076; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7077; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7078; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
7079; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7080; GCN3-NEXT:    buffer_wbinvl1_vol
7081; GCN3-NEXT:    s_endpgm
7082entry:
7083  %ptr = getelementptr i32, ptr %out, i64 %index
7084  %gep = getelementptr i32, ptr %ptr, i32 4
7085  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7086  ret void
7087}
7088
; Returning uinc_wrap atomicrmw (agent scope, seq_cst) on the i32 located at
; %out + %index * 4 + 16 bytes; the value it returns is stored to %out2.
; Because %val is used, the expected flat_atomic_inc writes v2 and carries the
; glc bit, and is followed by a flat_store_dword of v2 to the address held in
; s[2:3].
; The i64 index is expected to be scaled and added with scalar instructions on
; all three targets. The constant 16-byte step is expected as extra
; s_add_u32/s_addc_u32 on bonaire and tonga, and as "offset:16" on gfx900.
7089define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
7090; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset:
7091; GCN1:       ; %bb.0: ; %entry
7092; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
7093; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7094; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
7095; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7096; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7097; GCN1-NEXT:    s_add_u32 s0, s0, s4
7098; GCN1-NEXT:    s_addc_u32 s1, s1, s5
7099; GCN1-NEXT:    s_add_u32 s0, s0, 16
7100; GCN1-NEXT:    s_addc_u32 s1, s1, 0
7101; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7102; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7103; GCN1-NEXT:    v_mov_b32_e32 v2, s8
7104; GCN1-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
7105; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7106; GCN1-NEXT:    buffer_wbinvl1_vol
7107; GCN1-NEXT:    v_mov_b32_e32 v0, s2
7108; GCN1-NEXT:    v_mov_b32_e32 v1, s3
7109; GCN1-NEXT:    flat_store_dword v[0:1], v2
7110; GCN1-NEXT:    s_endpgm
7111;
7112; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset:
7113; GCN2:       ; %bb.0: ; %entry
7114; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
7115; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7116; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
7117; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7118; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7119; GCN2-NEXT:    s_add_u32 s0, s0, s4
7120; GCN2-NEXT:    s_addc_u32 s1, s1, s5
7121; GCN2-NEXT:    s_add_u32 s0, s0, 16
7122; GCN2-NEXT:    s_addc_u32 s1, s1, 0
7123; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7124; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7125; GCN2-NEXT:    v_mov_b32_e32 v2, s8
7126; GCN2-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
7127; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7128; GCN2-NEXT:    buffer_wbinvl1_vol
7129; GCN2-NEXT:    v_mov_b32_e32 v0, s2
7130; GCN2-NEXT:    v_mov_b32_e32 v1, s3
7131; GCN2-NEXT:    flat_store_dword v[0:1], v2
7132; GCN2-NEXT:    s_endpgm
7133;
7134; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset:
7135; GCN3:       ; %bb.0: ; %entry
7136; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
7137; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7138; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
7139; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7140; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7141; GCN3-NEXT:    s_add_u32 s0, s0, s4
7142; GCN3-NEXT:    s_addc_u32 s1, s1, s5
7143; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7144; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7145; GCN3-NEXT:    v_mov_b32_e32 v2, s8
7146; GCN3-NEXT:    flat_atomic_inc v2, v[0:1], v2 offset:16 glc
7147; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7148; GCN3-NEXT:    buffer_wbinvl1_vol
7149; GCN3-NEXT:    v_mov_b32_e32 v0, s2
7150; GCN3-NEXT:    v_mov_b32_e32 v1, s3
7151; GCN3-NEXT:    flat_store_dword v[0:1], v2
7152; GCN3-NEXT:    s_endpgm
7153entry:
  ; Variable part of the address: %index i32 elements past %out.
7154  %ptr = getelementptr i32, ptr %out, i64 %index
  ; Constant part of the address: 4 i32 elements = 16 bytes past %ptr.
7155  %gep = getelementptr i32, ptr %ptr, i32 4
7156  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ; Using %val is what makes this the returning variant of the test.
7157  store i32 %val, ptr %out2
7158  ret void
7159}
7160
; Non-returning uinc_wrap atomicrmw (agent scope, seq_cst) directly on %out,
; with no address arithmetic.
; %val has no users, so the expected flat_atomic_inc has no destination
; register and no glc bit.
; The expected instruction sequence is the same on bonaire, tonga and gfx900;
; only the s_load kernarg offsets differ (0x9/0xb on bonaire, 0x24/0x2c on the
; other two).
7161define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) {
7162; GCN1-LABEL: atomic_inc_i32:
7163; GCN1:       ; %bb.0: ; %entry
7164; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7165; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
7166; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7167; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7168; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7169; GCN1-NEXT:    v_mov_b32_e32 v2, s2
7170; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
7171; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7172; GCN1-NEXT:    buffer_wbinvl1_vol
7173; GCN1-NEXT:    s_endpgm
7174;
7175; GCN2-LABEL: atomic_inc_i32:
7176; GCN2:       ; %bb.0: ; %entry
7177; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7178; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
7179; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7180; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7181; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7182; GCN2-NEXT:    v_mov_b32_e32 v2, s2
7183; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
7184; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7185; GCN2-NEXT:    buffer_wbinvl1_vol
7186; GCN2-NEXT:    s_endpgm
7187;
7188; GCN3-LABEL: atomic_inc_i32:
7189; GCN3:       ; %bb.0: ; %entry
7190; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7191; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
7192; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7193; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7194; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7195; GCN3-NEXT:    v_mov_b32_e32 v2, s2
7196; GCN3-NEXT:    flat_atomic_inc v[0:1], v2
7197; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7198; GCN3-NEXT:    buffer_wbinvl1_vol
7199; GCN3-NEXT:    s_endpgm
7200entry:
  ; %val has no users in this function.
7201  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
7202  ret void
7203}
7204
; Returning uinc_wrap atomicrmw (agent scope, seq_cst) directly on %out; the
; value it returns is stored to %out2.
; Because %val is used, the expected flat_atomic_inc writes v2 and carries the
; glc bit on bonaire, tonga and gfx900. The checks then expect the %out2
; address (s[2:3]) to be copied into v[0:1] for a flat_store_dword of v2.
7205define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) {
7206; GCN1-LABEL: atomic_inc_i32_ret:
7207; GCN1:       ; %bb.0: ; %entry
7208; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7209; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
7210; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7211; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7212; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7213; GCN1-NEXT:    v_mov_b32_e32 v2, s4
7214; GCN1-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
7215; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7216; GCN1-NEXT:    buffer_wbinvl1_vol
7217; GCN1-NEXT:    v_mov_b32_e32 v0, s2
7218; GCN1-NEXT:    v_mov_b32_e32 v1, s3
7219; GCN1-NEXT:    flat_store_dword v[0:1], v2
7220; GCN1-NEXT:    s_endpgm
7221;
7222; GCN2-LABEL: atomic_inc_i32_ret:
7223; GCN2:       ; %bb.0: ; %entry
7224; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7225; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
7226; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7227; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7228; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7229; GCN2-NEXT:    v_mov_b32_e32 v2, s4
7230; GCN2-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
7231; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7232; GCN2-NEXT:    buffer_wbinvl1_vol
7233; GCN2-NEXT:    v_mov_b32_e32 v0, s2
7234; GCN2-NEXT:    v_mov_b32_e32 v1, s3
7235; GCN2-NEXT:    flat_store_dword v[0:1], v2
7236; GCN2-NEXT:    s_endpgm
7237;
7238; GCN3-LABEL: atomic_inc_i32_ret:
7239; GCN3:       ; %bb.0: ; %entry
7240; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7241; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
7242; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7243; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7244; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7245; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7246; GCN3-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
7247; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7248; GCN3-NEXT:    buffer_wbinvl1_vol
7249; GCN3-NEXT:    v_mov_b32_e32 v0, s2
7250; GCN3-NEXT:    v_mov_b32_e32 v1, s3
7251; GCN3-NEXT:    flat_store_dword v[0:1], v2
7252; GCN3-NEXT:    s_endpgm
7253entry:
7254  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
  ; Using %val is what makes this the returning variant of the test.
7255  store i32 %val, ptr %out2
7256  ret void
7257}
7258
; Non-returning uinc_wrap atomicrmw (agent scope, seq_cst) on the i32 located
; at %out + %index * 4, with no additional constant offset.
; %val has no users, so the expected flat_atomic_inc has no destination
; register and no glc bit.
; On bonaire, tonga and gfx900 the checks expect the i64 index to be shifted
; left by 2 with s_lshl_b64 and added to the base with s_add_u32/s_addc_u32
; before the address is copied into v[0:1].
7259define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) {
7260; GCN1-LABEL: atomic_inc_i32_incr64:
7261; GCN1:       ; %bb.0: ; %entry
7262; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
7263; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
7264; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
7265; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7266; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7267; GCN1-NEXT:    s_add_u32 s0, s2, s0
7268; GCN1-NEXT:    s_addc_u32 s1, s3, s1
7269; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7270; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7271; GCN1-NEXT:    v_mov_b32_e32 v2, s4
7272; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
7273; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7274; GCN1-NEXT:    buffer_wbinvl1_vol
7275; GCN1-NEXT:    s_endpgm
7276;
7277; GCN2-LABEL: atomic_inc_i32_incr64:
7278; GCN2:       ; %bb.0: ; %entry
7279; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
7280; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
7281; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
7282; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7283; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7284; GCN2-NEXT:    s_add_u32 s0, s2, s0
7285; GCN2-NEXT:    s_addc_u32 s1, s3, s1
7286; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7287; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7288; GCN2-NEXT:    v_mov_b32_e32 v2, s4
7289; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
7290; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7291; GCN2-NEXT:    buffer_wbinvl1_vol
7292; GCN2-NEXT:    s_endpgm
7293;
7294; GCN3-LABEL: atomic_inc_i32_incr64:
7295; GCN3:       ; %bb.0: ; %entry
7296; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
7297; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
7298; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
7299; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7300; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7301; GCN3-NEXT:    s_add_u32 s0, s2, s0
7302; GCN3-NEXT:    s_addc_u32 s1, s3, s1
7303; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7304; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7305; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7306; GCN3-NEXT:    flat_atomic_inc v[0:1], v2
7307; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7308; GCN3-NEXT:    buffer_wbinvl1_vol
7309; GCN3-NEXT:    s_endpgm
7310entry:
  ; Address is %index i32 elements past %out.
7311  %ptr = getelementptr i32, ptr %out, i64 %index
7312  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
7313  ret void
7314}
7315
; Returning uinc_wrap atomicrmw (agent scope, seq_cst) on the i32 located at
; %out + %index * 4; the value it returns is stored to %out2.
; Because %val is used, the expected flat_atomic_inc writes v2 and carries the
; glc bit, and is followed by a flat_store_dword of v2 to the address held in
; s[2:3].
; The expected sequence is the same on bonaire, tonga and gfx900 apart from the
; s_load kernarg offsets: the index is scaled with s_lshl_b64 and added to the
; base with s_add_u32/s_addc_u32.
7316define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
7317; GCN1-LABEL: atomic_inc_i32_ret_incr64:
7318; GCN1:       ; %bb.0: ; %entry
7319; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
7320; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7321; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
7322; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7323; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7324; GCN1-NEXT:    s_add_u32 s0, s0, s4
7325; GCN1-NEXT:    s_addc_u32 s1, s1, s5
7326; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7327; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7328; GCN1-NEXT:    v_mov_b32_e32 v2, s8
7329; GCN1-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
7330; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7331; GCN1-NEXT:    buffer_wbinvl1_vol
7332; GCN1-NEXT:    v_mov_b32_e32 v0, s2
7333; GCN1-NEXT:    v_mov_b32_e32 v1, s3
7334; GCN1-NEXT:    flat_store_dword v[0:1], v2
7335; GCN1-NEXT:    s_endpgm
7336;
7337; GCN2-LABEL: atomic_inc_i32_ret_incr64:
7338; GCN2:       ; %bb.0: ; %entry
7339; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
7340; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7341; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
7342; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7343; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7344; GCN2-NEXT:    s_add_u32 s0, s0, s4
7345; GCN2-NEXT:    s_addc_u32 s1, s1, s5
7346; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7347; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7348; GCN2-NEXT:    v_mov_b32_e32 v2, s8
7349; GCN2-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
7350; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7351; GCN2-NEXT:    buffer_wbinvl1_vol
7352; GCN2-NEXT:    v_mov_b32_e32 v0, s2
7353; GCN2-NEXT:    v_mov_b32_e32 v1, s3
7354; GCN2-NEXT:    flat_store_dword v[0:1], v2
7355; GCN2-NEXT:    s_endpgm
7356;
7357; GCN3-LABEL: atomic_inc_i32_ret_incr64:
7358; GCN3:       ; %bb.0: ; %entry
7359; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
7360; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7361; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
7362; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7363; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7364; GCN3-NEXT:    s_add_u32 s0, s0, s4
7365; GCN3-NEXT:    s_addc_u32 s1, s1, s5
7366; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7367; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7368; GCN3-NEXT:    v_mov_b32_e32 v2, s8
7369; GCN3-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
7370; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7371; GCN3-NEXT:    buffer_wbinvl1_vol
7372; GCN3-NEXT:    v_mov_b32_e32 v0, s2
7373; GCN3-NEXT:    v_mov_b32_e32 v1, s3
7374; GCN3-NEXT:    flat_store_dword v[0:1], v2
7375; GCN3-NEXT:    s_endpgm
7376entry:
  ; Address is %index i32 elements past %out.
7377  %ptr = getelementptr i32, ptr %out, i64 %index
7378  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
  ; Using %val is what makes this the returning variant of the test.
7379  store i32 %val, ptr %out2
7380  ret void
7381}
7382
; Non-returning udec_wrap atomicrmw (agent scope, seq_cst) on the i32 located
; 16 bytes past %out.
; %val has no users, so the expected flat_atomic_dec has no destination
; register and no glc bit.
; On bonaire and tonga the checks expect the 16-byte offset to be added to the
; base with s_add_u32/s_addc_u32; on gfx900 they expect it as "offset:16" on
; the instruction, with no separate add.
7383define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) {
7384; GCN1-LABEL: atomic_dec_i32_offset:
7385; GCN1:       ; %bb.0: ; %entry
7386; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7387; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
7388; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7389; GCN1-NEXT:    s_add_u32 s0, s0, 16
7390; GCN1-NEXT:    s_addc_u32 s1, s1, 0
7391; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7392; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7393; GCN1-NEXT:    v_mov_b32_e32 v2, s2
7394; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7395; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7396; GCN1-NEXT:    buffer_wbinvl1_vol
7397; GCN1-NEXT:    s_endpgm
7398;
7399; GCN2-LABEL: atomic_dec_i32_offset:
7400; GCN2:       ; %bb.0: ; %entry
7401; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7402; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
7403; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7404; GCN2-NEXT:    s_add_u32 s0, s0, 16
7405; GCN2-NEXT:    s_addc_u32 s1, s1, 0
7406; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7407; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7408; GCN2-NEXT:    v_mov_b32_e32 v2, s2
7409; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7410; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7411; GCN2-NEXT:    buffer_wbinvl1_vol
7412; GCN2-NEXT:    s_endpgm
7413;
7414; GCN3-LABEL: atomic_dec_i32_offset:
7415; GCN3:       ; %bb.0: ; %entry
7416; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7417; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
7418; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7419; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7420; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7421; GCN3-NEXT:    v_mov_b32_e32 v2, s2
7422; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:16
7423; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7424; GCN3-NEXT:    buffer_wbinvl1_vol
7425; GCN3-NEXT:    s_endpgm
7426entry:
  ; 4 i32 elements = 16 bytes past %out.
7427  %gep = getelementptr i32, ptr %out, i32 4
7428  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7429  ret void
7430}
7431
; Non-returning udec_wrap atomicrmw (agent scope, seq_cst) on the i32 located
; 1023 elements (4092 bytes, 0xffc) past %out.
; On bonaire and tonga the checks expect 0xffc to be added to the base with
; s_add_u32/s_addc_u32; on gfx900 they expect it as "offset:4092" on the
; instruction.
; NOTE(review): going by the test name, 4092 is presumably the largest
; dword-aligned offset the gfx900 offset field accepts - confirm against the
; ISA documentation.
7432define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) {
7433; GCN1-LABEL: atomic_dec_i32_max_offset:
7434; GCN1:       ; %bb.0: ; %entry
7435; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7436; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
7437; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7438; GCN1-NEXT:    s_add_u32 s0, s0, 0xffc
7439; GCN1-NEXT:    s_addc_u32 s1, s1, 0
7440; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7441; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7442; GCN1-NEXT:    v_mov_b32_e32 v2, s2
7443; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7444; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7445; GCN1-NEXT:    buffer_wbinvl1_vol
7446; GCN1-NEXT:    s_endpgm
7447;
7448; GCN2-LABEL: atomic_dec_i32_max_offset:
7449; GCN2:       ; %bb.0: ; %entry
7450; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7451; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
7452; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7453; GCN2-NEXT:    s_add_u32 s0, s0, 0xffc
7454; GCN2-NEXT:    s_addc_u32 s1, s1, 0
7455; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7456; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7457; GCN2-NEXT:    v_mov_b32_e32 v2, s2
7458; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7459; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7460; GCN2-NEXT:    buffer_wbinvl1_vol
7461; GCN2-NEXT:    s_endpgm
7462;
7463; GCN3-LABEL: atomic_dec_i32_max_offset:
7464; GCN3:       ; %bb.0: ; %entry
7465; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7466; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
7467; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7468; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7469; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7470; GCN3-NEXT:    v_mov_b32_e32 v2, s2
7471; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:4092
7472; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7473; GCN3-NEXT:    buffer_wbinvl1_vol
7474; GCN3-NEXT:    s_endpgm
7475entry:
  ; 1023 i32 elements = 4092 bytes (0xffc) past %out.
7476  %gep = getelementptr i32, ptr %out, i32 1023
7477  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7478  ret void
7479}
7480
; Non-returning udec_wrap atomicrmw (agent scope, seq_cst) on the i32 located
; 1024 elements (4096 bytes, 0x1000) past %out, one element further than
; @atomic_dec_i32_max_offset.
; On bonaire and tonga the checks expect 0x1000 to be added to the base with
; s_add_u32/s_addc_u32. On gfx900 the expected flat_atomic_dec has no offset
; field; instead 0x1000 is added to the address already in v[0:1] with
; v_add_co_u32/v_addc_co_u32.
; NOTE(review): presumably 4096 no longer fits the gfx900 offset field, which
; is why it is not folded - confirm against the ISA documentation.
7481define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) {
7482; GCN1-LABEL: atomic_dec_i32_max_offset_p1:
7483; GCN1:       ; %bb.0: ; %entry
7484; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7485; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
7486; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7487; GCN1-NEXT:    s_add_u32 s0, s0, 0x1000
7488; GCN1-NEXT:    s_addc_u32 s1, s1, 0
7489; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7490; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7491; GCN1-NEXT:    v_mov_b32_e32 v2, s2
7492; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7493; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7494; GCN1-NEXT:    buffer_wbinvl1_vol
7495; GCN1-NEXT:    s_endpgm
7496;
7497; GCN2-LABEL: atomic_dec_i32_max_offset_p1:
7498; GCN2:       ; %bb.0: ; %entry
7499; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7500; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
7501; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7502; GCN2-NEXT:    s_add_u32 s0, s0, 0x1000
7503; GCN2-NEXT:    s_addc_u32 s1, s1, 0
7504; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7505; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7506; GCN2-NEXT:    v_mov_b32_e32 v2, s2
7507; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7508; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7509; GCN2-NEXT:    buffer_wbinvl1_vol
7510; GCN2-NEXT:    s_endpgm
7511;
7512; GCN3-LABEL: atomic_dec_i32_max_offset_p1:
7513; GCN3:       ; %bb.0: ; %entry
7514; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7515; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
7516; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7517; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7518; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7519; GCN3-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
7520; GCN3-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7521; GCN3-NEXT:    v_mov_b32_e32 v2, s2
7522; GCN3-NEXT:    flat_atomic_dec v[0:1], v2
7523; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7524; GCN3-NEXT:    buffer_wbinvl1_vol
7525; GCN3-NEXT:    s_endpgm
7526entry:
  ; 1024 i32 elements = 4096 bytes (0x1000) past %out.
7527  %gep = getelementptr i32, ptr %out, i32 1024
7528  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7529  ret void
7530}
7531
; Returning udec_wrap atomicrmw (agent scope, seq_cst) on the i32 located
; 16 bytes past %out; the value it returns is stored to %out2.
; Because %val is used, the expected flat_atomic_dec writes v2 and carries the
; glc bit, and is followed by a flat_store_dword of v2 to the address held in
; s[2:3].
; On bonaire and tonga the checks expect the 16-byte offset to be added to the
; base with s_add_u32/s_addc_u32; on gfx900 they expect "offset:16" on the
; instruction.
7532define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
7533; GCN1-LABEL: atomic_dec_i32_ret_offset:
7534; GCN1:       ; %bb.0: ; %entry
7535; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7536; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
7537; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7538; GCN1-NEXT:    s_add_u32 s0, s0, 16
7539; GCN1-NEXT:    s_addc_u32 s1, s1, 0
7540; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7541; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7542; GCN1-NEXT:    v_mov_b32_e32 v2, s4
7543; GCN1-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
7544; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7545; GCN1-NEXT:    buffer_wbinvl1_vol
7546; GCN1-NEXT:    v_mov_b32_e32 v0, s2
7547; GCN1-NEXT:    v_mov_b32_e32 v1, s3
7548; GCN1-NEXT:    flat_store_dword v[0:1], v2
7549; GCN1-NEXT:    s_endpgm
7550;
7551; GCN2-LABEL: atomic_dec_i32_ret_offset:
7552; GCN2:       ; %bb.0: ; %entry
7553; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7554; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
7555; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7556; GCN2-NEXT:    s_add_u32 s0, s0, 16
7557; GCN2-NEXT:    s_addc_u32 s1, s1, 0
7558; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7559; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7560; GCN2-NEXT:    v_mov_b32_e32 v2, s4
7561; GCN2-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
7562; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7563; GCN2-NEXT:    buffer_wbinvl1_vol
7564; GCN2-NEXT:    v_mov_b32_e32 v0, s2
7565; GCN2-NEXT:    v_mov_b32_e32 v1, s3
7566; GCN2-NEXT:    flat_store_dword v[0:1], v2
7567; GCN2-NEXT:    s_endpgm
7568;
7569; GCN3-LABEL: atomic_dec_i32_ret_offset:
7570; GCN3:       ; %bb.0: ; %entry
7571; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7572; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
7573; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7574; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7575; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7576; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7577; GCN3-NEXT:    flat_atomic_dec v2, v[0:1], v2 offset:16 glc
7578; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7579; GCN3-NEXT:    buffer_wbinvl1_vol
7580; GCN3-NEXT:    v_mov_b32_e32 v0, s2
7581; GCN3-NEXT:    v_mov_b32_e32 v1, s3
7582; GCN3-NEXT:    flat_store_dword v[0:1], v2
7583; GCN3-NEXT:    s_endpgm
7584entry:
  ; 4 i32 elements = 16 bytes past %out.
7585  %gep = getelementptr i32, ptr %out, i32 4
7586  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ; Using %val is what makes this the returning variant of the test.
7587  store i32 %val, ptr %out2
7588  ret void
7589}
7590
; Non-returning udec_wrap atomicrmw (agent scope, seq_cst) on the i32 located
; at %out + %index * 4 + 16 bytes.
; %val has no users, so the expected flat_atomic_dec has no destination
; register and no glc bit.
; On all three targets (bonaire, tonga, gfx900) the checks expect the i64 index
; to be scaled with s_lshl_b64 by 2 and added to the base with
; s_add_u32/s_addc_u32. The constant 16-byte step is expected as a second pair
; of scalar adds on bonaire and tonga, and as "offset:16" on the instruction
; itself on gfx900.
7591define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) {
7592; GCN1-LABEL: atomic_dec_i32_decr64_offset:
7593; GCN1:       ; %bb.0: ; %entry
7594; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
7595; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
7596; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
7597; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7598; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7599; GCN1-NEXT:    s_add_u32 s0, s2, s0
7600; GCN1-NEXT:    s_addc_u32 s1, s3, s1
7601; GCN1-NEXT:    s_add_u32 s0, s0, 16
7602; GCN1-NEXT:    s_addc_u32 s1, s1, 0
7603; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7604; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7605; GCN1-NEXT:    v_mov_b32_e32 v2, s4
7606; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7607; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7608; GCN1-NEXT:    buffer_wbinvl1_vol
7609; GCN1-NEXT:    s_endpgm
7610;
7611; GCN2-LABEL: atomic_dec_i32_decr64_offset:
7612; GCN2:       ; %bb.0: ; %entry
7613; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
7614; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
7615; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
7616; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7617; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7618; GCN2-NEXT:    s_add_u32 s0, s2, s0
7619; GCN2-NEXT:    s_addc_u32 s1, s3, s1
7620; GCN2-NEXT:    s_add_u32 s0, s0, 16
7621; GCN2-NEXT:    s_addc_u32 s1, s1, 0
7622; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7623; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7624; GCN2-NEXT:    v_mov_b32_e32 v2, s4
7625; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7626; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7627; GCN2-NEXT:    buffer_wbinvl1_vol
7628; GCN2-NEXT:    s_endpgm
7629;
7630; GCN3-LABEL: atomic_dec_i32_decr64_offset:
7631; GCN3:       ; %bb.0: ; %entry
7632; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
7633; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
7634; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
7635; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7636; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7637; GCN3-NEXT:    s_add_u32 s0, s2, s0
7638; GCN3-NEXT:    s_addc_u32 s1, s3, s1
7639; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7640; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7641; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7642; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:16
7643; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7644; GCN3-NEXT:    buffer_wbinvl1_vol
7645; GCN3-NEXT:    s_endpgm
7646entry:
  ; Variable part of the address: %index i32 elements past %out.
7647  %ptr = getelementptr i32, ptr %out, i64 %index
  ; Constant part of the address: 4 i32 elements = 16 bytes past %ptr.
7648  %gep = getelementptr i32, ptr %ptr, i32 4
7649  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7650  ret void
7651}
7652
; Returning udec_wrap atomicrmw (agent scope, seq_cst) on the i32 located at
; %out + %index * 4 + 16 bytes; the value it returns is stored to %out2.
; Because %val is used, the expected flat_atomic_dec writes v2 and carries the
; glc bit, and is followed by a flat_store_dword of v2 to the address held in
; s[2:3].
; The i64 index is expected to be scaled and added with scalar instructions on
; all three targets. The constant 16-byte step is expected as extra
; s_add_u32/s_addc_u32 on bonaire and tonga, and as "offset:16" on gfx900.
7653define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
7654; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset:
7655; GCN1:       ; %bb.0: ; %entry
7656; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
7657; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7658; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
7659; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7660; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7661; GCN1-NEXT:    s_add_u32 s0, s0, s4
7662; GCN1-NEXT:    s_addc_u32 s1, s1, s5
7663; GCN1-NEXT:    s_add_u32 s0, s0, 16
7664; GCN1-NEXT:    s_addc_u32 s1, s1, 0
7665; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7666; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7667; GCN1-NEXT:    v_mov_b32_e32 v2, s8
7668; GCN1-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
7669; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7670; GCN1-NEXT:    buffer_wbinvl1_vol
7671; GCN1-NEXT:    v_mov_b32_e32 v0, s2
7672; GCN1-NEXT:    v_mov_b32_e32 v1, s3
7673; GCN1-NEXT:    flat_store_dword v[0:1], v2
7674; GCN1-NEXT:    s_endpgm
7675;
7676; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset:
7677; GCN2:       ; %bb.0: ; %entry
7678; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
7679; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7680; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
7681; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7682; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7683; GCN2-NEXT:    s_add_u32 s0, s0, s4
7684; GCN2-NEXT:    s_addc_u32 s1, s1, s5
7685; GCN2-NEXT:    s_add_u32 s0, s0, 16
7686; GCN2-NEXT:    s_addc_u32 s1, s1, 0
7687; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7688; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7689; GCN2-NEXT:    v_mov_b32_e32 v2, s8
7690; GCN2-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
7691; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7692; GCN2-NEXT:    buffer_wbinvl1_vol
7693; GCN2-NEXT:    v_mov_b32_e32 v0, s2
7694; GCN2-NEXT:    v_mov_b32_e32 v1, s3
7695; GCN2-NEXT:    flat_store_dword v[0:1], v2
7696; GCN2-NEXT:    s_endpgm
7697;
7698; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset:
7699; GCN3:       ; %bb.0: ; %entry
7700; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
7701; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7702; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
7703; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7704; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7705; GCN3-NEXT:    s_add_u32 s0, s0, s4
7706; GCN3-NEXT:    s_addc_u32 s1, s1, s5
7707; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7708; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7709; GCN3-NEXT:    v_mov_b32_e32 v2, s8
7710; GCN3-NEXT:    flat_atomic_dec v2, v[0:1], v2 offset:16 glc
7711; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7712; GCN3-NEXT:    buffer_wbinvl1_vol
7713; GCN3-NEXT:    v_mov_b32_e32 v0, s2
7714; GCN3-NEXT:    v_mov_b32_e32 v1, s3
7715; GCN3-NEXT:    flat_store_dword v[0:1], v2
7716; GCN3-NEXT:    s_endpgm
7717entry:
  ; Variable part of the address: %index i32 elements past %out.
7718  %ptr = getelementptr i32, ptr %out, i64 %index
  ; Constant part of the address: 4 i32 elements = 16 bytes past %ptr.
7719  %gep = getelementptr i32, ptr %ptr, i32 4
7720  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ; Using %val is what makes this the returning variant of the test.
7721  store i32 %val, ptr %out2
7722  ret void
7723}
7724
; Non-returning udec_wrap atomicrmw (agent scope, seq_cst) directly on %out,
; with no address arithmetic.
; %val has no users, so the expected flat_atomic_dec has no destination
; register and no glc bit.
; The expected instruction sequence is the same on bonaire, tonga and gfx900;
; only the s_load kernarg offsets differ (0x9/0xb on bonaire, 0x24/0x2c on the
; other two).
7725define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) {
7726; GCN1-LABEL: atomic_dec_i32:
7727; GCN1:       ; %bb.0: ; %entry
7728; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7729; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
7730; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7731; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7732; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7733; GCN1-NEXT:    v_mov_b32_e32 v2, s2
7734; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7735; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7736; GCN1-NEXT:    buffer_wbinvl1_vol
7737; GCN1-NEXT:    s_endpgm
7738;
7739; GCN2-LABEL: atomic_dec_i32:
7740; GCN2:       ; %bb.0: ; %entry
7741; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7742; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
7743; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7744; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7745; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7746; GCN2-NEXT:    v_mov_b32_e32 v2, s2
7747; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7748; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7749; GCN2-NEXT:    buffer_wbinvl1_vol
7750; GCN2-NEXT:    s_endpgm
7751;
7752; GCN3-LABEL: atomic_dec_i32:
7753; GCN3:       ; %bb.0: ; %entry
7754; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7755; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
7756; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7757; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7758; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7759; GCN3-NEXT:    v_mov_b32_e32 v2, s2
7760; GCN3-NEXT:    flat_atomic_dec v[0:1], v2
7761; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7762; GCN3-NEXT:    buffer_wbinvl1_vol
7763; GCN3-NEXT:    s_endpgm
7764entry:
  ; %val has no users in this function.
7765  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
7766  ret void
7767}
7768
; Returning udec_wrap atomicrmw (agent scope, seq_cst) directly on %out; the
; value it returns is stored to %out2.
; Because %val is used, the expected flat_atomic_dec writes v2 and carries the
; glc bit on bonaire, tonga and gfx900. The checks then expect the %out2
; address (s[2:3]) to be copied into v[0:1] for a flat_store_dword of v2.
7769define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) {
7770; GCN1-LABEL: atomic_dec_i32_ret:
7771; GCN1:       ; %bb.0: ; %entry
7772; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7773; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xd
7774; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7775; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7776; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7777; GCN1-NEXT:    v_mov_b32_e32 v2, s4
7778; GCN1-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
7779; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7780; GCN1-NEXT:    buffer_wbinvl1_vol
7781; GCN1-NEXT:    v_mov_b32_e32 v0, s2
7782; GCN1-NEXT:    v_mov_b32_e32 v1, s3
7783; GCN1-NEXT:    flat_store_dword v[0:1], v2
7784; GCN1-NEXT:    s_endpgm
7785;
7786; GCN2-LABEL: atomic_dec_i32_ret:
7787; GCN2:       ; %bb.0: ; %entry
7788; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7789; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x34
7790; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7791; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7792; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7793; GCN2-NEXT:    v_mov_b32_e32 v2, s4
7794; GCN2-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
7795; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7796; GCN2-NEXT:    buffer_wbinvl1_vol
7797; GCN2-NEXT:    v_mov_b32_e32 v0, s2
7798; GCN2-NEXT:    v_mov_b32_e32 v1, s3
7799; GCN2-NEXT:    flat_store_dword v[0:1], v2
7800; GCN2-NEXT:    s_endpgm
7801;
7802; GCN3-LABEL: atomic_dec_i32_ret:
7803; GCN3:       ; %bb.0: ; %entry
7804; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7805; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x34
7806; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7807; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7808; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7809; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7810; GCN3-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
7811; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7812; GCN3-NEXT:    buffer_wbinvl1_vol
7813; GCN3-NEXT:    v_mov_b32_e32 v0, s2
7814; GCN3-NEXT:    v_mov_b32_e32 v1, s3
7815; GCN3-NEXT:    flat_store_dword v[0:1], v2
7816; GCN3-NEXT:    s_endpgm
7817entry:
7818  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
  ; Using %val is what makes this the returning variant of the test.
7819  store i32 %val, ptr %out2
7820  ret void
7821}
7822
; Test: udec_wrap atomicrmw on an i32 whose address is %out indexed by the
; 64-bit %index, with the atomic result left unused.
; Expected code on all three targets: the index is shifted left by 2 and added
; to the base with a scalar add/add-with-carry pair, then a flat_atomic_dec
; without the glc bit is issued, followed by a wait and buffer_wbinvl1_vol.
7823define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) {
7824; GCN1-LABEL: atomic_dec_i32_decr64:
7825; GCN1:       ; %bb.0: ; %entry
7826; GCN1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
7827; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x9
7828; GCN1-NEXT:    s_load_dword s4, s[4:5], 0xb
7829; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7830; GCN1-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7831; GCN1-NEXT:    s_add_u32 s0, s2, s0
7832; GCN1-NEXT:    s_addc_u32 s1, s3, s1
7833; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7834; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7835; GCN1-NEXT:    v_mov_b32_e32 v2, s4
7836; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7837; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7838; GCN1-NEXT:    buffer_wbinvl1_vol
7839; GCN1-NEXT:    s_endpgm
7840;
7841; GCN2-LABEL: atomic_dec_i32_decr64:
7842; GCN2:       ; %bb.0: ; %entry
7843; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
7844; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
7845; GCN2-NEXT:    s_load_dword s4, s[4:5], 0x2c
7846; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7847; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7848; GCN2-NEXT:    s_add_u32 s0, s2, s0
7849; GCN2-NEXT:    s_addc_u32 s1, s3, s1
7850; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7851; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7852; GCN2-NEXT:    v_mov_b32_e32 v2, s4
7853; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7854; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7855; GCN2-NEXT:    buffer_wbinvl1_vol
7856; GCN2-NEXT:    s_endpgm
7857;
7858; GCN3-LABEL: atomic_dec_i32_decr64:
7859; GCN3:       ; %bb.0: ; %entry
7860; GCN3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
7861; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
7862; GCN3-NEXT:    s_load_dword s6, s[4:5], 0x2c
7863; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7864; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
7865; GCN3-NEXT:    s_add_u32 s0, s2, s0
7866; GCN3-NEXT:    s_addc_u32 s1, s3, s1
7867; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7868; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7869; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7870; GCN3-NEXT:    flat_atomic_dec v[0:1], v2
7871; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7872; GCN3-NEXT:    buffer_wbinvl1_vol
7873; GCN3-NEXT:    s_endpgm
7874entry:
  ; Element address: %out indexed by %index in units of i32.
7875  %ptr = getelementptr i32, ptr %out, i64 %index
  ; %val is intentionally never used, so the checks expect the no-return form.
7876  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
7877  ret void
7878}
7879
; Test: udec_wrap atomicrmw on an i32 whose address is %out indexed by the
; 64-bit %index, with the atomic result stored to %out2.
; Expected code on all three targets: the index is shifted left by 2 and added
; to the base with a scalar add/add-with-carry pair, then a returning
; flat_atomic_dec (glc set) is issued and its result is written out with
; flat_store_dword.
7880define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
7881; GCN1-LABEL: atomic_dec_i32_ret_decr64:
7882; GCN1:       ; %bb.0: ; %entry
7883; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
7884; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7885; GCN1-NEXT:    s_load_dword s8, s[4:5], 0xd
7886; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7887; GCN1-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7888; GCN1-NEXT:    s_add_u32 s0, s0, s4
7889; GCN1-NEXT:    s_addc_u32 s1, s1, s5
7890; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7891; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7892; GCN1-NEXT:    v_mov_b32_e32 v2, s8
7893; GCN1-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
7894; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7895; GCN1-NEXT:    buffer_wbinvl1_vol
7896; GCN1-NEXT:    v_mov_b32_e32 v0, s2
7897; GCN1-NEXT:    v_mov_b32_e32 v1, s3
7898; GCN1-NEXT:    flat_store_dword v[0:1], v2
7899; GCN1-NEXT:    s_endpgm
7900;
7901; GCN2-LABEL: atomic_dec_i32_ret_decr64:
7902; GCN2:       ; %bb.0: ; %entry
7903; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
7904; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7905; GCN2-NEXT:    s_load_dword s8, s[4:5], 0x34
7906; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7907; GCN2-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7908; GCN2-NEXT:    s_add_u32 s0, s0, s4
7909; GCN2-NEXT:    s_addc_u32 s1, s1, s5
7910; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7911; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7912; GCN2-NEXT:    v_mov_b32_e32 v2, s8
7913; GCN2-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
7914; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7915; GCN2-NEXT:    buffer_wbinvl1_vol
7916; GCN2-NEXT:    v_mov_b32_e32 v0, s2
7917; GCN2-NEXT:    v_mov_b32_e32 v1, s3
7918; GCN2-NEXT:    flat_store_dword v[0:1], v2
7919; GCN2-NEXT:    s_endpgm
7920;
7921; GCN3-LABEL: atomic_dec_i32_ret_decr64:
7922; GCN3:       ; %bb.0: ; %entry
7923; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
7924; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7925; GCN3-NEXT:    s_load_dword s8, s[4:5], 0x34
7926; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7927; GCN3-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
7928; GCN3-NEXT:    s_add_u32 s0, s0, s4
7929; GCN3-NEXT:    s_addc_u32 s1, s1, s5
7930; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7931; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7932; GCN3-NEXT:    v_mov_b32_e32 v2, s8
7933; GCN3-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
7934; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7935; GCN3-NEXT:    buffer_wbinvl1_vol
7936; GCN3-NEXT:    v_mov_b32_e32 v0, s2
7937; GCN3-NEXT:    v_mov_b32_e32 v1, s3
7938; GCN3-NEXT:    flat_store_dword v[0:1], v2
7939; GCN3-NEXT:    s_endpgm
7940entry:
  ; Element address: %out indexed by %index in units of i32.
7941  %ptr = getelementptr i32, ptr %out, i64 %index
7942  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
  ; Storing %val keeps the atomic result live, so the checks expect the returning form.
7943  store i32 %val, ptr %out2
7944  ret void
7945}
7946
; Test: seq_cst atomic load of a half located 8 half elements past %in, with
; the loaded value stored to %out.
; Expected code: the load is a flat_load_ushort with glc and the store is a
; flat_store_short. The 16-byte displacement is applied with scalar adds in the
; GCN1 and GCN2 checks, and folded into the instruction as offset:16 in the
; GCN3 checks.
7947define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
7948; GCN1-LABEL: atomic_load_f16_offset:
7949; GCN1:       ; %bb.0:
7950; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7951; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7952; GCN1-NEXT:    s_add_u32 s0, s0, 16
7953; GCN1-NEXT:    s_addc_u32 s1, s1, 0
7954; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7955; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7956; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
7957; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7958; GCN1-NEXT:    buffer_wbinvl1_vol
7959; GCN1-NEXT:    v_mov_b32_e32 v0, s2
7960; GCN1-NEXT:    v_mov_b32_e32 v1, s3
7961; GCN1-NEXT:    flat_store_short v[0:1], v2
7962; GCN1-NEXT:    s_endpgm
7963;
7964; GCN2-LABEL: atomic_load_f16_offset:
7965; GCN2:       ; %bb.0:
7966; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7967; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7968; GCN2-NEXT:    s_add_u32 s0, s0, 16
7969; GCN2-NEXT:    s_addc_u32 s1, s1, 0
7970; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7971; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7972; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
7973; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7974; GCN2-NEXT:    buffer_wbinvl1_vol
7975; GCN2-NEXT:    v_mov_b32_e32 v0, s2
7976; GCN2-NEXT:    v_mov_b32_e32 v1, s3
7977; GCN2-NEXT:    flat_store_short v[0:1], v2
7978; GCN2-NEXT:    s_endpgm
7979;
7980; GCN3-LABEL: atomic_load_f16_offset:
7981; GCN3:       ; %bb.0:
7982; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7983; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7984; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7985; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7986; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
7987; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7988; GCN3-NEXT:    buffer_wbinvl1_vol
7989; GCN3-NEXT:    v_mov_b32_e32 v0, s2
7990; GCN3-NEXT:    v_mov_b32_e32 v1, s3
7991; GCN3-NEXT:    flat_store_short v[0:1], v2
7992; GCN3-NEXT:    s_endpgm
  ; Index 8 in half elements; the checks above show this as a 16-byte displacement.
7993  %gep = getelementptr half, ptr %in, i64 8
7994  %val = load atomic half, ptr %gep  seq_cst, align 2
7995  store half %val, ptr %out
7996  ret void
7997}
7998
; Test: seq_cst atomic load of a half directly from %in (no offset), with the
; loaded value stored to %out.
; Expected code on all three targets: a flat_load_ushort with glc, a wait and
; buffer_wbinvl1_vol, then a flat_store_short of the loaded value.
7999define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
8000; GCN1-LABEL: atomic_load_f16:
8001; GCN1:       ; %bb.0:
8002; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
8003; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
8004; GCN1-NEXT:    v_mov_b32_e32 v0, s0
8005; GCN1-NEXT:    v_mov_b32_e32 v1, s1
8006; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
8007; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8008; GCN1-NEXT:    buffer_wbinvl1_vol
8009; GCN1-NEXT:    v_mov_b32_e32 v0, s2
8010; GCN1-NEXT:    v_mov_b32_e32 v1, s3
8011; GCN1-NEXT:    flat_store_short v[0:1], v2
8012; GCN1-NEXT:    s_endpgm
8013;
8014; GCN2-LABEL: atomic_load_f16:
8015; GCN2:       ; %bb.0:
8016; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8017; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
8018; GCN2-NEXT:    v_mov_b32_e32 v0, s0
8019; GCN2-NEXT:    v_mov_b32_e32 v1, s1
8020; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
8021; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8022; GCN2-NEXT:    buffer_wbinvl1_vol
8023; GCN2-NEXT:    v_mov_b32_e32 v0, s2
8024; GCN2-NEXT:    v_mov_b32_e32 v1, s3
8025; GCN2-NEXT:    flat_store_short v[0:1], v2
8026; GCN2-NEXT:    s_endpgm
8027;
8028; GCN3-LABEL: atomic_load_f16:
8029; GCN3:       ; %bb.0:
8030; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8031; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
8032; GCN3-NEXT:    v_mov_b32_e32 v0, s0
8033; GCN3-NEXT:    v_mov_b32_e32 v1, s1
8034; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
8035; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8036; GCN3-NEXT:    buffer_wbinvl1_vol
8037; GCN3-NEXT:    v_mov_b32_e32 v0, s2
8038; GCN3-NEXT:    v_mov_b32_e32 v1, s3
8039; GCN3-NEXT:    flat_store_short v[0:1], v2
8040; GCN3-NEXT:    s_endpgm
  ; Load straight from %in, then forward the value to %out with a plain store.
8041  %val = load atomic half, ptr %in seq_cst, align 2
8042  store half %val, ptr %out
8043  ret void
8044}
8045
; Test: seq_cst atomic load of a bfloat located 8 bfloat elements past %in,
; with the loaded value stored to %out.
; Expected code: the load is a flat_load_ushort with glc and the store is a
; flat_store_short. The 16-byte displacement is applied with scalar adds in the
; GCN1 and GCN2 checks, and folded into the instruction as offset:16 in the
; GCN3 checks.
8046define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
8047; GCN1-LABEL: atomic_load_bf16_offset:
8048; GCN1:       ; %bb.0:
8049; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
8050; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
8051; GCN1-NEXT:    s_add_u32 s0, s0, 16
8052; GCN1-NEXT:    s_addc_u32 s1, s1, 0
8053; GCN1-NEXT:    v_mov_b32_e32 v0, s0
8054; GCN1-NEXT:    v_mov_b32_e32 v1, s1
8055; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
8056; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8057; GCN1-NEXT:    buffer_wbinvl1_vol
8058; GCN1-NEXT:    v_mov_b32_e32 v0, s2
8059; GCN1-NEXT:    v_mov_b32_e32 v1, s3
8060; GCN1-NEXT:    flat_store_short v[0:1], v2
8061; GCN1-NEXT:    s_endpgm
8062;
8063; GCN2-LABEL: atomic_load_bf16_offset:
8064; GCN2:       ; %bb.0:
8065; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8066; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
8067; GCN2-NEXT:    s_add_u32 s0, s0, 16
8068; GCN2-NEXT:    s_addc_u32 s1, s1, 0
8069; GCN2-NEXT:    v_mov_b32_e32 v0, s0
8070; GCN2-NEXT:    v_mov_b32_e32 v1, s1
8071; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
8072; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8073; GCN2-NEXT:    buffer_wbinvl1_vol
8074; GCN2-NEXT:    v_mov_b32_e32 v0, s2
8075; GCN2-NEXT:    v_mov_b32_e32 v1, s3
8076; GCN2-NEXT:    flat_store_short v[0:1], v2
8077; GCN2-NEXT:    s_endpgm
8078;
8079; GCN3-LABEL: atomic_load_bf16_offset:
8080; GCN3:       ; %bb.0:
8081; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8082; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
8083; GCN3-NEXT:    v_mov_b32_e32 v0, s0
8084; GCN3-NEXT:    v_mov_b32_e32 v1, s1
8085; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
8086; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8087; GCN3-NEXT:    buffer_wbinvl1_vol
8088; GCN3-NEXT:    v_mov_b32_e32 v0, s2
8089; GCN3-NEXT:    v_mov_b32_e32 v1, s3
8090; GCN3-NEXT:    flat_store_short v[0:1], v2
8091; GCN3-NEXT:    s_endpgm
  ; Index 8 in bfloat elements; the checks above show this as a 16-byte displacement.
8092  %gep = getelementptr bfloat, ptr %in, i64 8
8093  %val = load atomic bfloat, ptr %gep  seq_cst, align 2
8094  store bfloat %val, ptr %out
8095  ret void
8096}
8097
; Test: seq_cst atomic load of a bfloat directly from %in (no offset), with the
; loaded value stored to %out.
; Expected code on all three targets: a flat_load_ushort with glc, a wait and
; buffer_wbinvl1_vol, then a flat_store_short of the loaded value.
8098define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
8099; GCN1-LABEL: atomic_load_bf16:
8100; GCN1:       ; %bb.0:
8101; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
8102; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
8103; GCN1-NEXT:    v_mov_b32_e32 v0, s0
8104; GCN1-NEXT:    v_mov_b32_e32 v1, s1
8105; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
8106; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8107; GCN1-NEXT:    buffer_wbinvl1_vol
8108; GCN1-NEXT:    v_mov_b32_e32 v0, s2
8109; GCN1-NEXT:    v_mov_b32_e32 v1, s3
8110; GCN1-NEXT:    flat_store_short v[0:1], v2
8111; GCN1-NEXT:    s_endpgm
8112;
8113; GCN2-LABEL: atomic_load_bf16:
8114; GCN2:       ; %bb.0:
8115; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8116; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
8117; GCN2-NEXT:    v_mov_b32_e32 v0, s0
8118; GCN2-NEXT:    v_mov_b32_e32 v1, s1
8119; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
8120; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8121; GCN2-NEXT:    buffer_wbinvl1_vol
8122; GCN2-NEXT:    v_mov_b32_e32 v0, s2
8123; GCN2-NEXT:    v_mov_b32_e32 v1, s3
8124; GCN2-NEXT:    flat_store_short v[0:1], v2
8125; GCN2-NEXT:    s_endpgm
8126;
8127; GCN3-LABEL: atomic_load_bf16:
8128; GCN3:       ; %bb.0:
8129; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8130; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
8131; GCN3-NEXT:    v_mov_b32_e32 v0, s0
8132; GCN3-NEXT:    v_mov_b32_e32 v1, s1
8133; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
8134; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8135; GCN3-NEXT:    buffer_wbinvl1_vol
8136; GCN3-NEXT:    v_mov_b32_e32 v0, s2
8137; GCN3-NEXT:    v_mov_b32_e32 v1, s3
8138; GCN3-NEXT:    flat_store_short v[0:1], v2
8139; GCN3-NEXT:    s_endpgm
  ; Load straight from %in, then forward the value to %out with a plain store.
8140  %val = load atomic bfloat, ptr %in seq_cst, align 2
8141  store bfloat %val, ptr %out
8142  ret void
8143}
8144