xref: /llvm-project/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (revision eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN1 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN2 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s
5
6; ---------------------------------------------------------------------
7; atomicrmw xchg
8; ---------------------------------------------------------------------
9
; seq_cst xchg of an i32 directly on %ptr, with no syncscope given. The result
; %tmp0 is unused, so the checks for all three runs expect the no-return
; flat_atomic_swap form (no glc), followed by s_waitcnt and buffer_wbinvl1_vol.
10define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) {
11; GCN1-LABEL: flat_atomic_xchg_i32_noret:
12; GCN1:       ; %bb.0:
13; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
15; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16; GCN1-NEXT:    buffer_wbinvl1_vol
17; GCN1-NEXT:    s_setpc_b64 s[30:31]
18;
19; GCN2-LABEL: flat_atomic_xchg_i32_noret:
20; GCN2:       ; %bb.0:
21; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
23; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
24; GCN2-NEXT:    buffer_wbinvl1_vol
25; GCN2-NEXT:    s_setpc_b64 s[30:31]
26;
27; GCN3-LABEL: flat_atomic_xchg_i32_noret:
28; GCN3:       ; %bb.0:
29; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
31; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
32; GCN3-NEXT:    buffer_wbinvl1_vol
33; GCN3-NEXT:    s_setpc_b64 s[30:31]
34  %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst
35  ret void
36}
37
; seq_cst xchg of an i32 at %out advanced by 4 i32 elements (16 bytes); the
; result is unused. The GCN1 and GCN2 checks expect the 16 to be added to the
; address in v0/v1 with a v_add/v_addc pair, while the GCN3 checks expect it
; folded into the atomic as offset:16.
38define void @flat_atomic_xchg_i32_noret_offset(ptr %out, i32 %in) {
39; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset:
40; GCN1:       ; %bb.0:
41; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
43; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
44; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
45; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
46; GCN1-NEXT:    buffer_wbinvl1_vol
47; GCN1-NEXT:    s_setpc_b64 s[30:31]
48;
49; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset:
50; GCN2:       ; %bb.0:
51; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
53; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
54; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
55; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
56; GCN2-NEXT:    buffer_wbinvl1_vol
57; GCN2-NEXT:    s_setpc_b64 s[30:31]
58;
59; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset:
60; GCN3:       ; %bb.0:
61; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
63; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
64; GCN3-NEXT:    buffer_wbinvl1_vol
65; GCN3-NEXT:    s_setpc_b64 s[30:31]
66  %gep = getelementptr i32, ptr %out, i32 4
67  %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst
68  ret void
69}
70
; Returning variant: seq_cst xchg of an i32 on %ptr whose old value %result is
; returned. The checks for all three runs expect the glc form of
; flat_atomic_swap with the result written to v0.
71define i32 @flat_atomic_xchg_i32_ret(ptr %ptr, i32 %in) {
72; GCN1-LABEL: flat_atomic_xchg_i32_ret:
73; GCN1:       ; %bb.0:
74; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
76; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
77; GCN1-NEXT:    buffer_wbinvl1_vol
78; GCN1-NEXT:    s_setpc_b64 s[30:31]
79;
80; GCN2-LABEL: flat_atomic_xchg_i32_ret:
81; GCN2:       ; %bb.0:
82; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
84; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
85; GCN2-NEXT:    buffer_wbinvl1_vol
86; GCN2-NEXT:    s_setpc_b64 s[30:31]
87;
88; GCN3-LABEL: flat_atomic_xchg_i32_ret:
89; GCN3:       ; %bb.0:
90; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
92; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
93; GCN3-NEXT:    buffer_wbinvl1_vol
94; GCN3-NEXT:    s_setpc_b64 s[30:31]
95  %result = atomicrmw xchg ptr %ptr, i32 %in seq_cst
96  ret i32 %result
97}
98
; Returning seq_cst xchg of an i32 at %out + 4 i32 elements (16 bytes). The
; GCN1 and GCN2 checks expect a v_add/v_addc pair on v0/v1 before the glc
; atomic; the GCN3 checks expect offset:16 on the glc atomic instead.
99define i32 @flat_atomic_xchg_i32_ret_offset(ptr %out, i32 %in) {
100; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset:
101; GCN1:       ; %bb.0:
102; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
104; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
105; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
106; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
107; GCN1-NEXT:    buffer_wbinvl1_vol
108; GCN1-NEXT:    s_setpc_b64 s[30:31]
109;
110; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset:
111; GCN2:       ; %bb.0:
112; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
114; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
115; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
116; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
117; GCN2-NEXT:    buffer_wbinvl1_vol
118; GCN2-NEXT:    s_setpc_b64 s[30:31]
119;
120; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset:
121; GCN3:       ; %bb.0:
122; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
124; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
125; GCN3-NEXT:    buffer_wbinvl1_vol
126; GCN3-NEXT:    s_setpc_b64 s[30:31]
127  %gep = getelementptr i32, ptr %out, i32 4
128  %result = atomicrmw xchg ptr %gep, i32 %in seq_cst
129  ret i32 %result
130}
131
; amdgpu_gfx variant with both arguments marked inreg; result unused. The
; checks expect s4, s5 and s6 to be copied into v0, v1 and v2 with v_mov_b32
; before the no-return flat_atomic_swap.
132define amdgpu_gfx void @flat_atomic_xchg_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
133; GCN1-LABEL: flat_atomic_xchg_i32_noret_scalar:
134; GCN1:       ; %bb.0:
135; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
136; GCN1-NEXT:    v_mov_b32_e32 v0, s4
137; GCN1-NEXT:    v_mov_b32_e32 v1, s5
138; GCN1-NEXT:    v_mov_b32_e32 v2, s6
139; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
140; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
141; GCN1-NEXT:    buffer_wbinvl1_vol
142; GCN1-NEXT:    s_setpc_b64 s[30:31]
143;
144; GCN2-LABEL: flat_atomic_xchg_i32_noret_scalar:
145; GCN2:       ; %bb.0:
146; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GCN2-NEXT:    v_mov_b32_e32 v0, s4
148; GCN2-NEXT:    v_mov_b32_e32 v1, s5
149; GCN2-NEXT:    v_mov_b32_e32 v2, s6
150; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
151; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
152; GCN2-NEXT:    buffer_wbinvl1_vol
153; GCN2-NEXT:    s_setpc_b64 s[30:31]
154;
155; GCN3-LABEL: flat_atomic_xchg_i32_noret_scalar:
156; GCN3:       ; %bb.0:
157; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158; GCN3-NEXT:    v_mov_b32_e32 v0, s4
159; GCN3-NEXT:    v_mov_b32_e32 v1, s5
160; GCN3-NEXT:    v_mov_b32_e32 v2, s6
161; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
162; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
163; GCN3-NEXT:    buffer_wbinvl1_vol
164; GCN3-NEXT:    s_setpc_b64 s[30:31]
165  %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst
166  ret void
167}
168
; amdgpu_gfx/inreg variant of the 16-byte-offset xchg; result unused. The GCN1
; and GCN2 checks expect the offset applied with s_add_u32/s_addc_u32 into
; s34/s35 before the copy to VGPRs; the GCN3 checks copy s4/s5 unchanged and
; expect offset:16 on the atomic.
169define amdgpu_gfx void @flat_atomic_xchg_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
170; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset_scalar:
171; GCN1:       ; %bb.0:
172; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173; GCN1-NEXT:    s_add_u32 s34, s4, 16
174; GCN1-NEXT:    s_addc_u32 s35, s5, 0
175; GCN1-NEXT:    v_mov_b32_e32 v0, s34
176; GCN1-NEXT:    v_mov_b32_e32 v1, s35
177; GCN1-NEXT:    v_mov_b32_e32 v2, s6
178; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
179; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
180; GCN1-NEXT:    buffer_wbinvl1_vol
181; GCN1-NEXT:    s_setpc_b64 s[30:31]
182;
183; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset_scalar:
184; GCN2:       ; %bb.0:
185; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186; GCN2-NEXT:    s_add_u32 s34, s4, 16
187; GCN2-NEXT:    s_addc_u32 s35, s5, 0
188; GCN2-NEXT:    v_mov_b32_e32 v0, s34
189; GCN2-NEXT:    v_mov_b32_e32 v1, s35
190; GCN2-NEXT:    v_mov_b32_e32 v2, s6
191; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
192; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
193; GCN2-NEXT:    buffer_wbinvl1_vol
194; GCN2-NEXT:    s_setpc_b64 s[30:31]
195;
196; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset_scalar:
197; GCN3:       ; %bb.0:
198; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199; GCN3-NEXT:    v_mov_b32_e32 v0, s4
200; GCN3-NEXT:    v_mov_b32_e32 v1, s5
201; GCN3-NEXT:    v_mov_b32_e32 v2, s6
202; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
203; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
204; GCN3-NEXT:    buffer_wbinvl1_vol
205; GCN3-NEXT:    s_setpc_b64 s[30:31]
206  %gep = getelementptr i32, ptr %out, i32 4
207  %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst
208  ret void
209}
210
; amdgpu_gfx/inreg variant that returns the old value. The checks expect
; s4, s5 and s6 copied into v0, v1 and v2, then the glc flat_atomic_swap
; writing its result to v0.
211define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
212; GCN1-LABEL: flat_atomic_xchg_i32_ret_scalar:
213; GCN1:       ; %bb.0:
214; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
215; GCN1-NEXT:    v_mov_b32_e32 v0, s4
216; GCN1-NEXT:    v_mov_b32_e32 v1, s5
217; GCN1-NEXT:    v_mov_b32_e32 v2, s6
218; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
219; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
220; GCN1-NEXT:    buffer_wbinvl1_vol
221; GCN1-NEXT:    s_setpc_b64 s[30:31]
222;
223; GCN2-LABEL: flat_atomic_xchg_i32_ret_scalar:
224; GCN2:       ; %bb.0:
225; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226; GCN2-NEXT:    v_mov_b32_e32 v0, s4
227; GCN2-NEXT:    v_mov_b32_e32 v1, s5
228; GCN2-NEXT:    v_mov_b32_e32 v2, s6
229; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
230; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
231; GCN2-NEXT:    buffer_wbinvl1_vol
232; GCN2-NEXT:    s_setpc_b64 s[30:31]
233;
234; GCN3-LABEL: flat_atomic_xchg_i32_ret_scalar:
235; GCN3:       ; %bb.0:
236; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237; GCN3-NEXT:    v_mov_b32_e32 v0, s4
238; GCN3-NEXT:    v_mov_b32_e32 v1, s5
239; GCN3-NEXT:    v_mov_b32_e32 v2, s6
240; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
241; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
242; GCN3-NEXT:    buffer_wbinvl1_vol
243; GCN3-NEXT:    s_setpc_b64 s[30:31]
244  %result = atomicrmw xchg ptr %ptr, i32 %in seq_cst
245  ret i32 %result
246}
247
; amdgpu_gfx/inreg returning xchg at %out + 16 bytes. The GCN1 and GCN2 checks
; expect s_add_u32/s_addc_u32 into s34/s35 before the VGPR copies and a glc
; atomic; the GCN3 checks copy s4/s5 unchanged and expect offset:16 glc.
248define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
249; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset_scalar:
250; GCN1:       ; %bb.0:
251; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252; GCN1-NEXT:    s_add_u32 s34, s4, 16
253; GCN1-NEXT:    s_addc_u32 s35, s5, 0
254; GCN1-NEXT:    v_mov_b32_e32 v0, s34
255; GCN1-NEXT:    v_mov_b32_e32 v1, s35
256; GCN1-NEXT:    v_mov_b32_e32 v2, s6
257; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
258; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
259; GCN1-NEXT:    buffer_wbinvl1_vol
260; GCN1-NEXT:    s_setpc_b64 s[30:31]
261;
262; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset_scalar:
263; GCN2:       ; %bb.0:
264; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; GCN2-NEXT:    s_add_u32 s34, s4, 16
266; GCN2-NEXT:    s_addc_u32 s35, s5, 0
267; GCN2-NEXT:    v_mov_b32_e32 v0, s34
268; GCN2-NEXT:    v_mov_b32_e32 v1, s35
269; GCN2-NEXT:    v_mov_b32_e32 v2, s6
270; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
271; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
272; GCN2-NEXT:    buffer_wbinvl1_vol
273; GCN2-NEXT:    s_setpc_b64 s[30:31]
274;
275; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset_scalar:
276; GCN3:       ; %bb.0:
277; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
278; GCN3-NEXT:    v_mov_b32_e32 v0, s4
279; GCN3-NEXT:    v_mov_b32_e32 v1, s5
280; GCN3-NEXT:    v_mov_b32_e32 v2, s6
281; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
282; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
283; GCN3-NEXT:    buffer_wbinvl1_vol
284; GCN3-NEXT:    s_setpc_b64 s[30:31]
285  %gep = getelementptr i32, ptr %out, i32 4
286  %result = atomicrmw xchg ptr %gep, i32 %in seq_cst
287  ret i32 %result
288}
289
; No-return xchg at %out + 4 i32 elements (i64 GEP index, 16 bytes) with
; !amdgpu.no.remote.memory !0 attached to the atomicrmw (!0 is defined outside
; this function). The checks still expect a plain flat_atomic_swap: preceded
; by a v_add/v_addc pair for the GCN1 and GCN2 runs, with offset:16 for GCN3.
290define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
291; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
292; GCN1:       ; %bb.0:
293; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
295; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
296; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
297; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
298; GCN1-NEXT:    buffer_wbinvl1_vol
299; GCN1-NEXT:    s_setpc_b64 s[30:31]
300;
301; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
302; GCN2:       ; %bb.0:
303; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
305; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
306; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
307; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
308; GCN2-NEXT:    buffer_wbinvl1_vol
309; GCN2-NEXT:    s_setpc_b64 s[30:31]
310;
311; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
312; GCN3:       ; %bb.0:
313; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
315; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
316; GCN3-NEXT:    buffer_wbinvl1_vol
317; GCN3-NEXT:    s_setpc_b64 s[30:31]
318  %gep = getelementptr i32, ptr %out, i64 4
319  %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
320  ret void
321}
322
; Returning xchg at %out + 4 i32 elements (i64 GEP index, 16 bytes) with
; !amdgpu.no.remote.memory !0 attached (!0 is defined outside this function).
; The checks expect the glc flat_atomic_swap: after a v_add/v_addc pair for
; the GCN1 and GCN2 runs, with offset:16 for GCN3.
323define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
324; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
325; GCN1:       ; %bb.0:
326; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
327; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
328; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
329; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
330; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
331; GCN1-NEXT:    buffer_wbinvl1_vol
332; GCN1-NEXT:    s_setpc_b64 s[30:31]
333;
334; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
335; GCN2:       ; %bb.0:
336; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
337; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
338; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
339; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
340; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
341; GCN2-NEXT:    buffer_wbinvl1_vol
342; GCN2-NEXT:    s_setpc_b64 s[30:31]
343;
344; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
345; GCN3:       ; %bb.0:
346; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
348; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
349; GCN3-NEXT:    buffer_wbinvl1_vol
350; GCN3-NEXT:    s_setpc_b64 s[30:31]
351  %gep = getelementptr i32, ptr %out, i64 4
352  %result = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
353  ret i32 %result
354}
355
356; ---------------------------------------------------------------------
357; atomicrmw xchg f32
358; ---------------------------------------------------------------------
359
; seq_cst xchg with a float operand directly on %ptr; the result %tmp0 is
; unused. The checks for all three runs expect the no-return flat_atomic_swap
; (no glc) on v[0:1] with the value in v2.
360define void @flat_atomic_xchg_f32_noret(ptr %ptr, float %in) {
361; GCN1-LABEL: flat_atomic_xchg_f32_noret:
362; GCN1:       ; %bb.0:
363; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
365; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
366; GCN1-NEXT:    buffer_wbinvl1_vol
367; GCN1-NEXT:    s_setpc_b64 s[30:31]
368;
369; GCN2-LABEL: flat_atomic_xchg_f32_noret:
370; GCN2:       ; %bb.0:
371; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
372; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
373; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
374; GCN2-NEXT:    buffer_wbinvl1_vol
375; GCN2-NEXT:    s_setpc_b64 s[30:31]
376;
377; GCN3-LABEL: flat_atomic_xchg_f32_noret:
378; GCN3:       ; %bb.0:
379; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
380; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
381; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
382; GCN3-NEXT:    buffer_wbinvl1_vol
383; GCN3-NEXT:    s_setpc_b64 s[30:31]
384  %tmp0 = atomicrmw xchg ptr %ptr, float %in seq_cst
385  ret void
386}
387
; seq_cst float xchg at %out advanced by 4 float elements (16 bytes); result
; unused. The GCN1 and GCN2 checks expect a v_add/v_addc pair on v0/v1 before
; the atomic; the GCN3 checks expect offset:16 on the atomic instead.
388define void @flat_atomic_xchg_f32_noret_offset(ptr %out, float %in) {
389; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset:
390; GCN1:       ; %bb.0:
391; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
392; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
393; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
394; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
395; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
396; GCN1-NEXT:    buffer_wbinvl1_vol
397; GCN1-NEXT:    s_setpc_b64 s[30:31]
398;
399; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset:
400; GCN2:       ; %bb.0:
401; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
403; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
404; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
405; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
406; GCN2-NEXT:    buffer_wbinvl1_vol
407; GCN2-NEXT:    s_setpc_b64 s[30:31]
408;
409; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset:
410; GCN3:       ; %bb.0:
411; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
412; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
413; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
414; GCN3-NEXT:    buffer_wbinvl1_vol
415; GCN3-NEXT:    s_setpc_b64 s[30:31]
416  %gep = getelementptr float, ptr %out, i32 4
417  %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst
418  ret void
419}
420
; Returning float xchg on %ptr: the old value %result is returned. The checks
; for all three runs expect the glc form of flat_atomic_swap with the result
; written to v0.
421define float @flat_atomic_xchg_f32_ret(ptr %ptr, float %in) {
422; GCN1-LABEL: flat_atomic_xchg_f32_ret:
423; GCN1:       ; %bb.0:
424; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
426; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
427; GCN1-NEXT:    buffer_wbinvl1_vol
428; GCN1-NEXT:    s_setpc_b64 s[30:31]
429;
430; GCN2-LABEL: flat_atomic_xchg_f32_ret:
431; GCN2:       ; %bb.0:
432; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
434; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
435; GCN2-NEXT:    buffer_wbinvl1_vol
436; GCN2-NEXT:    s_setpc_b64 s[30:31]
437;
438; GCN3-LABEL: flat_atomic_xchg_f32_ret:
439; GCN3:       ; %bb.0:
440; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
442; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
443; GCN3-NEXT:    buffer_wbinvl1_vol
444; GCN3-NEXT:    s_setpc_b64 s[30:31]
445  %result = atomicrmw xchg ptr %ptr, float %in seq_cst
446  ret float %result
447}
448
; Returning float xchg at %out + 4 float elements (16 bytes). The GCN1 and
; GCN2 checks expect a v_add/v_addc pair on v0/v1 before the glc atomic; the
; GCN3 checks expect offset:16 on the glc atomic instead.
449define float @flat_atomic_xchg_f32_ret_offset(ptr %out, float %in) {
450; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset:
451; GCN1:       ; %bb.0:
452; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
453; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
454; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
455; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
456; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
457; GCN1-NEXT:    buffer_wbinvl1_vol
458; GCN1-NEXT:    s_setpc_b64 s[30:31]
459;
460; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset:
461; GCN2:       ; %bb.0:
462; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
464; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
465; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
466; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
467; GCN2-NEXT:    buffer_wbinvl1_vol
468; GCN2-NEXT:    s_setpc_b64 s[30:31]
469;
470; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset:
471; GCN3:       ; %bb.0:
472; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
474; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
475; GCN3-NEXT:    buffer_wbinvl1_vol
476; GCN3-NEXT:    s_setpc_b64 s[30:31]
477  %gep = getelementptr float, ptr %out, i32 4
478  %result = atomicrmw xchg ptr %gep, float %in seq_cst
479  ret float %result
480}
481
; amdgpu_gfx float xchg with both arguments marked inreg; result unused. The
; checks expect s4, s5 and s6 to be copied into v0, v1 and v2 with v_mov_b32
; before the no-return flat_atomic_swap.
482define amdgpu_gfx void @flat_atomic_xchg_f32_noret_scalar(ptr inreg %ptr, float inreg %in) {
483; GCN1-LABEL: flat_atomic_xchg_f32_noret_scalar:
484; GCN1:       ; %bb.0:
485; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486; GCN1-NEXT:    v_mov_b32_e32 v0, s4
487; GCN1-NEXT:    v_mov_b32_e32 v1, s5
488; GCN1-NEXT:    v_mov_b32_e32 v2, s6
489; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
490; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
491; GCN1-NEXT:    buffer_wbinvl1_vol
492; GCN1-NEXT:    s_setpc_b64 s[30:31]
493;
494; GCN2-LABEL: flat_atomic_xchg_f32_noret_scalar:
495; GCN2:       ; %bb.0:
496; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
497; GCN2-NEXT:    v_mov_b32_e32 v0, s4
498; GCN2-NEXT:    v_mov_b32_e32 v1, s5
499; GCN2-NEXT:    v_mov_b32_e32 v2, s6
500; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
501; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
502; GCN2-NEXT:    buffer_wbinvl1_vol
503; GCN2-NEXT:    s_setpc_b64 s[30:31]
504;
505; GCN3-LABEL: flat_atomic_xchg_f32_noret_scalar:
506; GCN3:       ; %bb.0:
507; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
508; GCN3-NEXT:    v_mov_b32_e32 v0, s4
509; GCN3-NEXT:    v_mov_b32_e32 v1, s5
510; GCN3-NEXT:    v_mov_b32_e32 v2, s6
511; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
512; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
513; GCN3-NEXT:    buffer_wbinvl1_vol
514; GCN3-NEXT:    s_setpc_b64 s[30:31]
515  %tmp0 = atomicrmw xchg ptr %ptr, float %in seq_cst
516  ret void
517}
518
; amdgpu_gfx/inreg float xchg at %out + 16 bytes; result unused. The GCN1 and
; GCN2 checks expect s_add_u32/s_addc_u32 into s34/s35 before the copy to
; VGPRs; the GCN3 checks copy s4/s5 unchanged and expect offset:16 on the
; atomic.
519define amdgpu_gfx void @flat_atomic_xchg_f32_noret_offset_scalar(ptr inreg %out, float inreg %in) {
520; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset_scalar:
521; GCN1:       ; %bb.0:
522; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
523; GCN1-NEXT:    s_add_u32 s34, s4, 16
524; GCN1-NEXT:    s_addc_u32 s35, s5, 0
525; GCN1-NEXT:    v_mov_b32_e32 v0, s34
526; GCN1-NEXT:    v_mov_b32_e32 v1, s35
527; GCN1-NEXT:    v_mov_b32_e32 v2, s6
528; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
529; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
530; GCN1-NEXT:    buffer_wbinvl1_vol
531; GCN1-NEXT:    s_setpc_b64 s[30:31]
532;
533; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset_scalar:
534; GCN2:       ; %bb.0:
535; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
536; GCN2-NEXT:    s_add_u32 s34, s4, 16
537; GCN2-NEXT:    s_addc_u32 s35, s5, 0
538; GCN2-NEXT:    v_mov_b32_e32 v0, s34
539; GCN2-NEXT:    v_mov_b32_e32 v1, s35
540; GCN2-NEXT:    v_mov_b32_e32 v2, s6
541; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
542; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
543; GCN2-NEXT:    buffer_wbinvl1_vol
544; GCN2-NEXT:    s_setpc_b64 s[30:31]
545;
546; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset_scalar:
547; GCN3:       ; %bb.0:
548; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
549; GCN3-NEXT:    v_mov_b32_e32 v0, s4
550; GCN3-NEXT:    v_mov_b32_e32 v1, s5
551; GCN3-NEXT:    v_mov_b32_e32 v2, s6
552; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
553; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
554; GCN3-NEXT:    buffer_wbinvl1_vol
555; GCN3-NEXT:    s_setpc_b64 s[30:31]
556  %gep = getelementptr float, ptr %out, i32 4
557  %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst
558  ret void
559}
560
; amdgpu_gfx/inreg float xchg that returns the old value. The checks expect
; s4, s5 and s6 copied into v0, v1 and v2, then the glc flat_atomic_swap
; writing its result to v0.
561define amdgpu_gfx float @flat_atomic_xchg_f32_ret_scalar(ptr inreg %ptr, float inreg %in) {
562; GCN1-LABEL: flat_atomic_xchg_f32_ret_scalar:
563; GCN1:       ; %bb.0:
564; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
565; GCN1-NEXT:    v_mov_b32_e32 v0, s4
566; GCN1-NEXT:    v_mov_b32_e32 v1, s5
567; GCN1-NEXT:    v_mov_b32_e32 v2, s6
568; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
569; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
570; GCN1-NEXT:    buffer_wbinvl1_vol
571; GCN1-NEXT:    s_setpc_b64 s[30:31]
572;
573; GCN2-LABEL: flat_atomic_xchg_f32_ret_scalar:
574; GCN2:       ; %bb.0:
575; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
576; GCN2-NEXT:    v_mov_b32_e32 v0, s4
577; GCN2-NEXT:    v_mov_b32_e32 v1, s5
578; GCN2-NEXT:    v_mov_b32_e32 v2, s6
579; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
580; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
581; GCN2-NEXT:    buffer_wbinvl1_vol
582; GCN2-NEXT:    s_setpc_b64 s[30:31]
583;
584; GCN3-LABEL: flat_atomic_xchg_f32_ret_scalar:
585; GCN3:       ; %bb.0:
586; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
587; GCN3-NEXT:    v_mov_b32_e32 v0, s4
588; GCN3-NEXT:    v_mov_b32_e32 v1, s5
589; GCN3-NEXT:    v_mov_b32_e32 v2, s6
590; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
591; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
592; GCN3-NEXT:    buffer_wbinvl1_vol
593; GCN3-NEXT:    s_setpc_b64 s[30:31]
594  %result = atomicrmw xchg ptr %ptr, float %in seq_cst
595  ret float %result
596}
597
; amdgpu_gfx/inreg returning float xchg at %out + 16 bytes. The GCN1 and GCN2
; checks expect s_add_u32/s_addc_u32 into s34/s35 before the VGPR copies and
; a glc atomic; the GCN3 checks copy s4/s5 unchanged and expect offset:16 glc.
598define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out, float inreg %in) {
599; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset_scalar:
600; GCN1:       ; %bb.0:
601; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
602; GCN1-NEXT:    s_add_u32 s34, s4, 16
603; GCN1-NEXT:    s_addc_u32 s35, s5, 0
604; GCN1-NEXT:    v_mov_b32_e32 v0, s34
605; GCN1-NEXT:    v_mov_b32_e32 v1, s35
606; GCN1-NEXT:    v_mov_b32_e32 v2, s6
607; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
608; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
609; GCN1-NEXT:    buffer_wbinvl1_vol
610; GCN1-NEXT:    s_setpc_b64 s[30:31]
611;
612; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset_scalar:
613; GCN2:       ; %bb.0:
614; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
615; GCN2-NEXT:    s_add_u32 s34, s4, 16
616; GCN2-NEXT:    s_addc_u32 s35, s5, 0
617; GCN2-NEXT:    v_mov_b32_e32 v0, s34
618; GCN2-NEXT:    v_mov_b32_e32 v1, s35
619; GCN2-NEXT:    v_mov_b32_e32 v2, s6
620; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
621; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
622; GCN2-NEXT:    buffer_wbinvl1_vol
623; GCN2-NEXT:    s_setpc_b64 s[30:31]
624;
625; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset_scalar:
626; GCN3:       ; %bb.0:
627; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628; GCN3-NEXT:    v_mov_b32_e32 v0, s4
629; GCN3-NEXT:    v_mov_b32_e32 v1, s5
630; GCN3-NEXT:    v_mov_b32_e32 v2, s6
631; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
632; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
633; GCN3-NEXT:    buffer_wbinvl1_vol
634; GCN3-NEXT:    s_setpc_b64 s[30:31]
635  %gep = getelementptr float, ptr %out, i32 4
636  %result = atomicrmw xchg ptr %gep, float %in seq_cst
637  ret float %result
638}
639
; No-return float xchg at %out + 4 float elements (i64 GEP index, 16 bytes)
; with !amdgpu.no.remote.memory !0 attached (!0 is defined outside this
; function). The checks still expect a plain flat_atomic_swap: preceded by a
; v_add/v_addc pair for the GCN1 and GCN2 runs, with offset:16 for GCN3.
640define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory(ptr %out, float %in) {
641; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
642; GCN1:       ; %bb.0:
643; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
644; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
645; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
646; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
647; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
648; GCN1-NEXT:    buffer_wbinvl1_vol
649; GCN1-NEXT:    s_setpc_b64 s[30:31]
650;
651; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
652; GCN2:       ; %bb.0:
653; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
654; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
655; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
656; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
657; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
658; GCN2-NEXT:    buffer_wbinvl1_vol
659; GCN2-NEXT:    s_setpc_b64 s[30:31]
660;
661; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
662; GCN3:       ; %bb.0:
663; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
664; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
665; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
666; GCN3-NEXT:    buffer_wbinvl1_vol
667; GCN3-NEXT:    s_setpc_b64 s[30:31]
668  %gep = getelementptr float, ptr %out, i64 4
669  %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
670  ret void
671}
672
; Returning float xchg at %out + 4 float elements (i64 GEP index, 16 bytes)
; with !amdgpu.no.remote.memory !0 attached (!0 is defined outside this
; function). The checks expect the glc flat_atomic_swap: after a v_add/v_addc
; pair for the GCN1 and GCN2 runs, with offset:16 for GCN3.
673define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory(ptr %out, float %in) {
674; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
675; GCN1:       ; %bb.0:
676; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
677; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
678; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
679; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
680; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
681; GCN1-NEXT:    buffer_wbinvl1_vol
682; GCN1-NEXT:    s_setpc_b64 s[30:31]
683;
684; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
685; GCN2:       ; %bb.0:
686; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
687; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
688; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
689; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
690; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
691; GCN2-NEXT:    buffer_wbinvl1_vol
692; GCN2-NEXT:    s_setpc_b64 s[30:31]
693;
694; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
695; GCN3:       ; %bb.0:
696; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
698; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
699; GCN3-NEXT:    buffer_wbinvl1_vol
700; GCN3-NEXT:    s_setpc_b64 s[30:31]
701  %gep = getelementptr float, ptr %out, i64 4
702  %result = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
703  ret float %result
704}
705
706; ---------------------------------------------------------------------
707; atomicrmw add
708; ---------------------------------------------------------------------
709
; seq_cst atomic add of an i32 directly on %ptr, with no syncscope given. The
; result %tmp0 is unused, so the checks for all three runs expect the
; no-return flat_atomic_add form (no glc), followed by s_waitcnt and
; buffer_wbinvl1_vol.
710define void @flat_atomic_add_i32_noret(ptr %ptr, i32 %in) {
711; GCN1-LABEL: flat_atomic_add_i32_noret:
712; GCN1:       ; %bb.0:
713; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
714; GCN1-NEXT:    flat_atomic_add v[0:1], v2
715; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
716; GCN1-NEXT:    buffer_wbinvl1_vol
717; GCN1-NEXT:    s_setpc_b64 s[30:31]
718;
719; GCN2-LABEL: flat_atomic_add_i32_noret:
720; GCN2:       ; %bb.0:
721; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
722; GCN2-NEXT:    flat_atomic_add v[0:1], v2
723; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
724; GCN2-NEXT:    buffer_wbinvl1_vol
725; GCN2-NEXT:    s_setpc_b64 s[30:31]
726;
727; GCN3-LABEL: flat_atomic_add_i32_noret:
728; GCN3:       ; %bb.0:
729; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730; GCN3-NEXT:    flat_atomic_add v[0:1], v2
731; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
732; GCN3-NEXT:    buffer_wbinvl1_vol
733; GCN3-NEXT:    s_setpc_b64 s[30:31]
734  %tmp0 = atomicrmw add ptr %ptr, i32 %in seq_cst
735  ret void
736}
737
; seq_cst atomic add of an i32 at %out advanced by 4 i32 elements (16 bytes);
; result unused. The GCN1 and GCN2 checks expect a v_add/v_addc pair on v0/v1
; before flat_atomic_add; the GCN3 checks expect offset:16 on the atomic.
738define void @flat_atomic_add_i32_noret_offset(ptr %out, i32 %in) {
739; GCN1-LABEL: flat_atomic_add_i32_noret_offset:
740; GCN1:       ; %bb.0:
741; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
743; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
744; GCN1-NEXT:    flat_atomic_add v[0:1], v2
745; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
746; GCN1-NEXT:    buffer_wbinvl1_vol
747; GCN1-NEXT:    s_setpc_b64 s[30:31]
748;
749; GCN2-LABEL: flat_atomic_add_i32_noret_offset:
750; GCN2:       ; %bb.0:
751; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
753; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
754; GCN2-NEXT:    flat_atomic_add v[0:1], v2
755; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
756; GCN2-NEXT:    buffer_wbinvl1_vol
757; GCN2-NEXT:    s_setpc_b64 s[30:31]
758;
759; GCN3-LABEL: flat_atomic_add_i32_noret_offset:
760; GCN3:       ; %bb.0:
761; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
762; GCN3-NEXT:    flat_atomic_add v[0:1], v2 offset:16
763; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
764; GCN3-NEXT:    buffer_wbinvl1_vol
765; GCN3-NEXT:    s_setpc_b64 s[30:31]
766  %gep = getelementptr i32, ptr %out, i32 4
767  %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst
768  ret void
769}
770
; Returning variant: seq_cst atomic add of an i32 on %ptr whose old value
; %result is returned. The checks for all three runs expect the glc form of
; flat_atomic_add with the result written to v0.
771define i32 @flat_atomic_add_i32_ret(ptr %ptr, i32 %in) {
772; GCN1-LABEL: flat_atomic_add_i32_ret:
773; GCN1:       ; %bb.0:
774; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
775; GCN1-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
776; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
777; GCN1-NEXT:    buffer_wbinvl1_vol
778; GCN1-NEXT:    s_setpc_b64 s[30:31]
779;
780; GCN2-LABEL: flat_atomic_add_i32_ret:
781; GCN2:       ; %bb.0:
782; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
783; GCN2-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
784; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
785; GCN2-NEXT:    buffer_wbinvl1_vol
786; GCN2-NEXT:    s_setpc_b64 s[30:31]
787;
788; GCN3-LABEL: flat_atomic_add_i32_ret:
789; GCN3:       ; %bb.0:
790; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
791; GCN3-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
792; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
793; GCN3-NEXT:    buffer_wbinvl1_vol
794; GCN3-NEXT:    s_setpc_b64 s[30:31]
795  %result = atomicrmw add ptr %ptr, i32 %in seq_cst
796  ret i32 %result
797}
798
; seq_cst atomicrmw add whose result is returned, on a VGPR pointer advanced by
; 4 x i32 (16 bytes). The bonaire and tonga checks expect a v_add/v_addc pair
; on v[0:1] before the atomic; the gfx900 checks expect offset:16 on the
; instruction. Every target is checked for destination v0 and the glc modifier.
799define i32 @flat_atomic_add_i32_ret_offset(ptr %out, i32 %in) {
800; GCN1-LABEL: flat_atomic_add_i32_ret_offset:
801; GCN1:       ; %bb.0:
802; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
803; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
804; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
805; GCN1-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
806; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
807; GCN1-NEXT:    buffer_wbinvl1_vol
808; GCN1-NEXT:    s_setpc_b64 s[30:31]
809;
810; GCN2-LABEL: flat_atomic_add_i32_ret_offset:
811; GCN2:       ; %bb.0:
812; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
813; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
814; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
815; GCN2-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
816; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
817; GCN2-NEXT:    buffer_wbinvl1_vol
818; GCN2-NEXT:    s_setpc_b64 s[30:31]
819;
820; GCN3-LABEL: flat_atomic_add_i32_ret_offset:
821; GCN3:       ; %bb.0:
822; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
823; GCN3-NEXT:    flat_atomic_add v0, v[0:1], v2 offset:16 glc
824; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
825; GCN3-NEXT:    buffer_wbinvl1_vol
826; GCN3-NEXT:    s_setpc_b64 s[30:31]
827  %gep = getelementptr i32, ptr %out, i32 4
828  %result = atomicrmw add ptr %gep, i32 %in seq_cst
829  ret i32 %result
830}
831
; amdgpu_gfx variant with both arguments marked inreg; the atomicrmw add result
; is unused. The checks expect the pointer (s4, s5) and the operand (s6) to be
; copied into v0, v1 and v2 with v_mov_b32 before the flat_atomic_add, which is
; checked with VGPR operands only.
832define amdgpu_gfx void @flat_atomic_add_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
833; GCN1-LABEL: flat_atomic_add_i32_noret_scalar:
834; GCN1:       ; %bb.0:
835; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
836; GCN1-NEXT:    v_mov_b32_e32 v0, s4
837; GCN1-NEXT:    v_mov_b32_e32 v1, s5
838; GCN1-NEXT:    v_mov_b32_e32 v2, s6
839; GCN1-NEXT:    flat_atomic_add v[0:1], v2
840; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
841; GCN1-NEXT:    buffer_wbinvl1_vol
842; GCN1-NEXT:    s_setpc_b64 s[30:31]
843;
844; GCN2-LABEL: flat_atomic_add_i32_noret_scalar:
845; GCN2:       ; %bb.0:
846; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
847; GCN2-NEXT:    v_mov_b32_e32 v0, s4
848; GCN2-NEXT:    v_mov_b32_e32 v1, s5
849; GCN2-NEXT:    v_mov_b32_e32 v2, s6
850; GCN2-NEXT:    flat_atomic_add v[0:1], v2
851; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
852; GCN2-NEXT:    buffer_wbinvl1_vol
853; GCN2-NEXT:    s_setpc_b64 s[30:31]
854;
855; GCN3-LABEL: flat_atomic_add_i32_noret_scalar:
856; GCN3:       ; %bb.0:
857; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
858; GCN3-NEXT:    v_mov_b32_e32 v0, s4
859; GCN3-NEXT:    v_mov_b32_e32 v1, s5
860; GCN3-NEXT:    v_mov_b32_e32 v2, s6
861; GCN3-NEXT:    flat_atomic_add v[0:1], v2
862; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
863; GCN3-NEXT:    buffer_wbinvl1_vol
864; GCN3-NEXT:    s_setpc_b64 s[30:31]
865  %tmp0 = atomicrmw add ptr %ptr, i32 %in seq_cst
866  ret void
867}
868
; amdgpu_gfx variant with inreg arguments and a pointer advanced by 4 x i32
; (16 bytes); the atomicrmw add result is unused. The bonaire and tonga checks
; expect the offset to be applied with s_add_u32/s_addc_u32 into s34/s35 before
; the copy to VGPRs; the gfx900 checks copy s4/s5 unchanged and use offset:16.
869define amdgpu_gfx void @flat_atomic_add_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
870; GCN1-LABEL: flat_atomic_add_i32_noret_offset_scalar:
871; GCN1:       ; %bb.0:
872; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
873; GCN1-NEXT:    s_add_u32 s34, s4, 16
874; GCN1-NEXT:    s_addc_u32 s35, s5, 0
875; GCN1-NEXT:    v_mov_b32_e32 v0, s34
876; GCN1-NEXT:    v_mov_b32_e32 v1, s35
877; GCN1-NEXT:    v_mov_b32_e32 v2, s6
878; GCN1-NEXT:    flat_atomic_add v[0:1], v2
879; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
880; GCN1-NEXT:    buffer_wbinvl1_vol
881; GCN1-NEXT:    s_setpc_b64 s[30:31]
882;
883; GCN2-LABEL: flat_atomic_add_i32_noret_offset_scalar:
884; GCN2:       ; %bb.0:
885; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
886; GCN2-NEXT:    s_add_u32 s34, s4, 16
887; GCN2-NEXT:    s_addc_u32 s35, s5, 0
888; GCN2-NEXT:    v_mov_b32_e32 v0, s34
889; GCN2-NEXT:    v_mov_b32_e32 v1, s35
890; GCN2-NEXT:    v_mov_b32_e32 v2, s6
891; GCN2-NEXT:    flat_atomic_add v[0:1], v2
892; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
893; GCN2-NEXT:    buffer_wbinvl1_vol
894; GCN2-NEXT:    s_setpc_b64 s[30:31]
895;
896; GCN3-LABEL: flat_atomic_add_i32_noret_offset_scalar:
897; GCN3:       ; %bb.0:
898; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
899; GCN3-NEXT:    v_mov_b32_e32 v0, s4
900; GCN3-NEXT:    v_mov_b32_e32 v1, s5
901; GCN3-NEXT:    v_mov_b32_e32 v2, s6
902; GCN3-NEXT:    flat_atomic_add v[0:1], v2 offset:16
903; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
904; GCN3-NEXT:    buffer_wbinvl1_vol
905; GCN3-NEXT:    s_setpc_b64 s[30:31]
906  %gep = getelementptr i32, ptr %out, i32 4
907  %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst
908  ret void
909}
910
; amdgpu_gfx variant with both arguments marked inreg; the atomicrmw add result
; is returned. The checks expect s4, s5 and s6 to be copied into v0, v1 and v2
; with v_mov_b32, followed by the returning form of flat_atomic_add
; (destination v0, glc modifier).
911define amdgpu_gfx i32 @flat_atomic_add_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
912; GCN1-LABEL: flat_atomic_add_i32_ret_scalar:
913; GCN1:       ; %bb.0:
914; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
915; GCN1-NEXT:    v_mov_b32_e32 v0, s4
916; GCN1-NEXT:    v_mov_b32_e32 v1, s5
917; GCN1-NEXT:    v_mov_b32_e32 v2, s6
918; GCN1-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
919; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
920; GCN1-NEXT:    buffer_wbinvl1_vol
921; GCN1-NEXT:    s_setpc_b64 s[30:31]
922;
923; GCN2-LABEL: flat_atomic_add_i32_ret_scalar:
924; GCN2:       ; %bb.0:
925; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
926; GCN2-NEXT:    v_mov_b32_e32 v0, s4
927; GCN2-NEXT:    v_mov_b32_e32 v1, s5
928; GCN2-NEXT:    v_mov_b32_e32 v2, s6
929; GCN2-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
930; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
931; GCN2-NEXT:    buffer_wbinvl1_vol
932; GCN2-NEXT:    s_setpc_b64 s[30:31]
933;
934; GCN3-LABEL: flat_atomic_add_i32_ret_scalar:
935; GCN3:       ; %bb.0:
936; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
937; GCN3-NEXT:    v_mov_b32_e32 v0, s4
938; GCN3-NEXT:    v_mov_b32_e32 v1, s5
939; GCN3-NEXT:    v_mov_b32_e32 v2, s6
940; GCN3-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
941; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
942; GCN3-NEXT:    buffer_wbinvl1_vol
943; GCN3-NEXT:    s_setpc_b64 s[30:31]
944  %result = atomicrmw add ptr %ptr, i32 %in seq_cst
945  ret i32 %result
946}
947
; amdgpu_gfx variant with inreg arguments and a pointer advanced by 4 x i32
; (16 bytes); the atomicrmw add result is returned. The bonaire and tonga
; checks expect s_add_u32/s_addc_u32 into s34/s35 before the copy to VGPRs; the
; gfx900 checks copy s4/s5 unchanged and use offset:16. Every target is checked
; for destination v0 and the glc modifier.
948define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
949; GCN1-LABEL: flat_atomic_add_i32_ret_offset_scalar:
950; GCN1:       ; %bb.0:
951; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
952; GCN1-NEXT:    s_add_u32 s34, s4, 16
953; GCN1-NEXT:    s_addc_u32 s35, s5, 0
954; GCN1-NEXT:    v_mov_b32_e32 v0, s34
955; GCN1-NEXT:    v_mov_b32_e32 v1, s35
956; GCN1-NEXT:    v_mov_b32_e32 v2, s6
957; GCN1-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
958; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
959; GCN1-NEXT:    buffer_wbinvl1_vol
960; GCN1-NEXT:    s_setpc_b64 s[30:31]
961;
962; GCN2-LABEL: flat_atomic_add_i32_ret_offset_scalar:
963; GCN2:       ; %bb.0:
964; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
965; GCN2-NEXT:    s_add_u32 s34, s4, 16
966; GCN2-NEXT:    s_addc_u32 s35, s5, 0
967; GCN2-NEXT:    v_mov_b32_e32 v0, s34
968; GCN2-NEXT:    v_mov_b32_e32 v1, s35
969; GCN2-NEXT:    v_mov_b32_e32 v2, s6
970; GCN2-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
971; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
972; GCN2-NEXT:    buffer_wbinvl1_vol
973; GCN2-NEXT:    s_setpc_b64 s[30:31]
974;
975; GCN3-LABEL: flat_atomic_add_i32_ret_offset_scalar:
976; GCN3:       ; %bb.0:
977; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
978; GCN3-NEXT:    v_mov_b32_e32 v0, s4
979; GCN3-NEXT:    v_mov_b32_e32 v1, s5
980; GCN3-NEXT:    v_mov_b32_e32 v2, s6
981; GCN3-NEXT:    flat_atomic_add v0, v[0:1], v2 offset:16 glc
982; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
983; GCN3-NEXT:    buffer_wbinvl1_vol
984; GCN3-NEXT:    s_setpc_b64 s[30:31]
985  %gep = getelementptr i32, ptr %out, i32 4
986  %result = atomicrmw add ptr %gep, i32 %in seq_cst
987  ret i32 %result
988}
989
; seq_cst atomicrmw add with an unused result, tagged with
; !amdgpu.no.remote.memory (node !0 is defined outside this function). The
; pointer is advanced by 4 x i32 (16 bytes) using an i64 GEP index. The checks
; expect a single flat_atomic_add on every target: after a v_add/v_addc pair on
; bonaire and tonga, and with offset:16 on gfx900.
990define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
991; GCN1-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
992; GCN1:       ; %bb.0:
993; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
994; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
995; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
996; GCN1-NEXT:    flat_atomic_add v[0:1], v2
997; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
998; GCN1-NEXT:    buffer_wbinvl1_vol
999; GCN1-NEXT:    s_setpc_b64 s[30:31]
1000;
1001; GCN2-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
1002; GCN2:       ; %bb.0:
1003; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1004; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1005; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1006; GCN2-NEXT:    flat_atomic_add v[0:1], v2
1007; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1008; GCN2-NEXT:    buffer_wbinvl1_vol
1009; GCN2-NEXT:    s_setpc_b64 s[30:31]
1010;
1011; GCN3-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
1012; GCN3:       ; %bb.0:
1013; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1014; GCN3-NEXT:    flat_atomic_add v[0:1], v2 offset:16
1015; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1016; GCN3-NEXT:    buffer_wbinvl1_vol
1017; GCN3-NEXT:    s_setpc_b64 s[30:31]
1018  %gep = getelementptr i32, ptr %out, i64 4
1019  %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1020  ret void
1021}
1022
; seq_cst atomicrmw add whose result is returned, tagged with
; !amdgpu.no.remote.memory (node !0 is defined outside this function). The
; pointer is advanced by 4 x i32 (16 bytes) using an i64 GEP index. The checks
; expect a single returning flat_atomic_add (destination v0, glc) on every
; target: after a v_add/v_addc pair on bonaire and tonga, and with offset:16
; on gfx900.
1023define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
1024; GCN1-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1025; GCN1:       ; %bb.0:
1026; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1027; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
1028; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1029; GCN1-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
1030; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1031; GCN1-NEXT:    buffer_wbinvl1_vol
1032; GCN1-NEXT:    s_setpc_b64 s[30:31]
1033;
1034; GCN2-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1035; GCN2:       ; %bb.0:
1036; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1037; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1038; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1039; GCN2-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
1040; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1041; GCN2-NEXT:    buffer_wbinvl1_vol
1042; GCN2-NEXT:    s_setpc_b64 s[30:31]
1043;
1044; GCN3-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1045; GCN3:       ; %bb.0:
1046; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1047; GCN3-NEXT:    flat_atomic_add v0, v[0:1], v2 offset:16 glc
1048; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1049; GCN3-NEXT:    buffer_wbinvl1_vol
1050; GCN3-NEXT:    s_setpc_b64 s[30:31]
1051  %gep = getelementptr i32, ptr %out, i64 4
1052  %result = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1053  ret i32 %result
1054}
1055
1056; ---------------------------------------------------------------------
1057; atomicrmw sub
1058; ---------------------------------------------------------------------
1059
; seq_cst atomicrmw sub with an unused result, on an unmodified VGPR pointer.
; All three targets are checked for the same sequence: the non-returning
; flat_atomic_sub on v[0:1] with operand v2, then s_waitcnt and
; buffer_wbinvl1_vol before the return.
1060define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) {
1061; GCN1-LABEL: flat_atomic_sub_i32_noret:
1062; GCN1:       ; %bb.0:
1063; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1064; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
1065; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1066; GCN1-NEXT:    buffer_wbinvl1_vol
1067; GCN1-NEXT:    s_setpc_b64 s[30:31]
1068;
1069; GCN2-LABEL: flat_atomic_sub_i32_noret:
1070; GCN2:       ; %bb.0:
1071; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1072; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
1073; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1074; GCN2-NEXT:    buffer_wbinvl1_vol
1075; GCN2-NEXT:    s_setpc_b64 s[30:31]
1076;
1077; GCN3-LABEL: flat_atomic_sub_i32_noret:
1078; GCN3:       ; %bb.0:
1079; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1080; GCN3-NEXT:    flat_atomic_sub v[0:1], v2
1081; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1082; GCN3-NEXT:    buffer_wbinvl1_vol
1083; GCN3-NEXT:    s_setpc_b64 s[30:31]
1084  %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst
1085  ret void
1086}
1087
; seq_cst atomicrmw sub with an unused result, on a VGPR pointer advanced by
; 4 x i32 (16 bytes). The bonaire and tonga checks expect the 16 to be added to
; v[0:1] with a v_add/v_addc pair before the atomic; the gfx900 checks expect
; it folded into the instruction as offset:16.
1088define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) {
1089; GCN1-LABEL: flat_atomic_sub_i32_noret_offset:
1090; GCN1:       ; %bb.0:
1091; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1092; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
1093; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1094; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
1095; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1096; GCN1-NEXT:    buffer_wbinvl1_vol
1097; GCN1-NEXT:    s_setpc_b64 s[30:31]
1098;
1099; GCN2-LABEL: flat_atomic_sub_i32_noret_offset:
1100; GCN2:       ; %bb.0:
1101; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1103; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1104; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
1105; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1106; GCN2-NEXT:    buffer_wbinvl1_vol
1107; GCN2-NEXT:    s_setpc_b64 s[30:31]
1108;
1109; GCN3-LABEL: flat_atomic_sub_i32_noret_offset:
1110; GCN3:       ; %bb.0:
1111; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1112; GCN3-NEXT:    flat_atomic_sub v[0:1], v2 offset:16
1113; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1114; GCN3-NEXT:    buffer_wbinvl1_vol
1115; GCN3-NEXT:    s_setpc_b64 s[30:31]
1116  %gep = getelementptr i32, ptr %out, i32 4
1117  %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
1118  ret void
1119}
1120
; seq_cst atomicrmw sub whose result is returned, on an unmodified VGPR pointer.
; All three targets are checked for the returning form of flat_atomic_sub:
; destination v0 plus the glc modifier.
1121define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) {
1122; GCN1-LABEL: flat_atomic_sub_i32_ret:
1123; GCN1:       ; %bb.0:
1124; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1125; GCN1-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1126; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1127; GCN1-NEXT:    buffer_wbinvl1_vol
1128; GCN1-NEXT:    s_setpc_b64 s[30:31]
1129;
1130; GCN2-LABEL: flat_atomic_sub_i32_ret:
1131; GCN2:       ; %bb.0:
1132; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1133; GCN2-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1134; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1135; GCN2-NEXT:    buffer_wbinvl1_vol
1136; GCN2-NEXT:    s_setpc_b64 s[30:31]
1137;
1138; GCN3-LABEL: flat_atomic_sub_i32_ret:
1139; GCN3:       ; %bb.0:
1140; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1141; GCN3-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1142; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1143; GCN3-NEXT:    buffer_wbinvl1_vol
1144; GCN3-NEXT:    s_setpc_b64 s[30:31]
1145  %result = atomicrmw sub ptr %ptr, i32 %in seq_cst
1146  ret i32 %result
1147}
1148
; seq_cst atomicrmw sub whose result is returned, on a VGPR pointer advanced by
; 4 x i32 (16 bytes). The bonaire and tonga checks expect a v_add/v_addc pair
; on v[0:1] before the atomic; the gfx900 checks expect offset:16 on the
; instruction. Every target is checked for destination v0 and the glc modifier.
1149define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) {
1150; GCN1-LABEL: flat_atomic_sub_i32_ret_offset:
1151; GCN1:       ; %bb.0:
1152; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1153; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
1154; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1155; GCN1-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1156; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1157; GCN1-NEXT:    buffer_wbinvl1_vol
1158; GCN1-NEXT:    s_setpc_b64 s[30:31]
1159;
1160; GCN2-LABEL: flat_atomic_sub_i32_ret_offset:
1161; GCN2:       ; %bb.0:
1162; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1163; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1164; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1165; GCN2-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1166; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1167; GCN2-NEXT:    buffer_wbinvl1_vol
1168; GCN2-NEXT:    s_setpc_b64 s[30:31]
1169;
1170; GCN3-LABEL: flat_atomic_sub_i32_ret_offset:
1171; GCN3:       ; %bb.0:
1172; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1173; GCN3-NEXT:    flat_atomic_sub v0, v[0:1], v2 offset:16 glc
1174; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1175; GCN3-NEXT:    buffer_wbinvl1_vol
1176; GCN3-NEXT:    s_setpc_b64 s[30:31]
1177  %gep = getelementptr i32, ptr %out, i32 4
1178  %result = atomicrmw sub ptr %gep, i32 %in seq_cst
1179  ret i32 %result
1180}
1181
; amdgpu_gfx variant with both arguments marked inreg; the atomicrmw sub result
; is unused. The checks expect the pointer (s4, s5) and the operand (s6) to be
; copied into v0, v1 and v2 with v_mov_b32 before the flat_atomic_sub, which is
; checked with VGPR operands only.
1182define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
1183; GCN1-LABEL: flat_atomic_sub_i32_noret_scalar:
1184; GCN1:       ; %bb.0:
1185; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1186; GCN1-NEXT:    v_mov_b32_e32 v0, s4
1187; GCN1-NEXT:    v_mov_b32_e32 v1, s5
1188; GCN1-NEXT:    v_mov_b32_e32 v2, s6
1189; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
1190; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1191; GCN1-NEXT:    buffer_wbinvl1_vol
1192; GCN1-NEXT:    s_setpc_b64 s[30:31]
1193;
1194; GCN2-LABEL: flat_atomic_sub_i32_noret_scalar:
1195; GCN2:       ; %bb.0:
1196; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1197; GCN2-NEXT:    v_mov_b32_e32 v0, s4
1198; GCN2-NEXT:    v_mov_b32_e32 v1, s5
1199; GCN2-NEXT:    v_mov_b32_e32 v2, s6
1200; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
1201; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1202; GCN2-NEXT:    buffer_wbinvl1_vol
1203; GCN2-NEXT:    s_setpc_b64 s[30:31]
1204;
1205; GCN3-LABEL: flat_atomic_sub_i32_noret_scalar:
1206; GCN3:       ; %bb.0:
1207; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1208; GCN3-NEXT:    v_mov_b32_e32 v0, s4
1209; GCN3-NEXT:    v_mov_b32_e32 v1, s5
1210; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1211; GCN3-NEXT:    flat_atomic_sub v[0:1], v2
1212; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1213; GCN3-NEXT:    buffer_wbinvl1_vol
1214; GCN3-NEXT:    s_setpc_b64 s[30:31]
1215  %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst
1216  ret void
1217}
1218
; amdgpu_gfx variant with inreg arguments and a pointer advanced by 4 x i32
; (16 bytes); the atomicrmw sub result is unused. The bonaire and tonga checks
; expect the offset to be applied with s_add_u32/s_addc_u32 into s34/s35 before
; the copy to VGPRs; the gfx900 checks copy s4/s5 unchanged and use offset:16.
1219define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
1220; GCN1-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
1221; GCN1:       ; %bb.0:
1222; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1223; GCN1-NEXT:    s_add_u32 s34, s4, 16
1224; GCN1-NEXT:    s_addc_u32 s35, s5, 0
1225; GCN1-NEXT:    v_mov_b32_e32 v0, s34
1226; GCN1-NEXT:    v_mov_b32_e32 v1, s35
1227; GCN1-NEXT:    v_mov_b32_e32 v2, s6
1228; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
1229; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1230; GCN1-NEXT:    buffer_wbinvl1_vol
1231; GCN1-NEXT:    s_setpc_b64 s[30:31]
1232;
1233; GCN2-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
1234; GCN2:       ; %bb.0:
1235; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1236; GCN2-NEXT:    s_add_u32 s34, s4, 16
1237; GCN2-NEXT:    s_addc_u32 s35, s5, 0
1238; GCN2-NEXT:    v_mov_b32_e32 v0, s34
1239; GCN2-NEXT:    v_mov_b32_e32 v1, s35
1240; GCN2-NEXT:    v_mov_b32_e32 v2, s6
1241; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
1242; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1243; GCN2-NEXT:    buffer_wbinvl1_vol
1244; GCN2-NEXT:    s_setpc_b64 s[30:31]
1245;
1246; GCN3-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
1247; GCN3:       ; %bb.0:
1248; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1249; GCN3-NEXT:    v_mov_b32_e32 v0, s4
1250; GCN3-NEXT:    v_mov_b32_e32 v1, s5
1251; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1252; GCN3-NEXT:    flat_atomic_sub v[0:1], v2 offset:16
1253; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1254; GCN3-NEXT:    buffer_wbinvl1_vol
1255; GCN3-NEXT:    s_setpc_b64 s[30:31]
1256  %gep = getelementptr i32, ptr %out, i32 4
1257  %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
1258  ret void
1259}
1260
; amdgpu_gfx variant with both arguments marked inreg; the atomicrmw sub result
; is returned. The checks expect s4, s5 and s6 to be copied into v0, v1 and v2
; with v_mov_b32, followed by the returning form of flat_atomic_sub
; (destination v0, glc modifier).
1261define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
1262; GCN1-LABEL: flat_atomic_sub_i32_ret_scalar:
1263; GCN1:       ; %bb.0:
1264; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1265; GCN1-NEXT:    v_mov_b32_e32 v0, s4
1266; GCN1-NEXT:    v_mov_b32_e32 v1, s5
1267; GCN1-NEXT:    v_mov_b32_e32 v2, s6
1268; GCN1-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1269; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1270; GCN1-NEXT:    buffer_wbinvl1_vol
1271; GCN1-NEXT:    s_setpc_b64 s[30:31]
1272;
1273; GCN2-LABEL: flat_atomic_sub_i32_ret_scalar:
1274; GCN2:       ; %bb.0:
1275; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1276; GCN2-NEXT:    v_mov_b32_e32 v0, s4
1277; GCN2-NEXT:    v_mov_b32_e32 v1, s5
1278; GCN2-NEXT:    v_mov_b32_e32 v2, s6
1279; GCN2-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1280; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1281; GCN2-NEXT:    buffer_wbinvl1_vol
1282; GCN2-NEXT:    s_setpc_b64 s[30:31]
1283;
1284; GCN3-LABEL: flat_atomic_sub_i32_ret_scalar:
1285; GCN3:       ; %bb.0:
1286; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1287; GCN3-NEXT:    v_mov_b32_e32 v0, s4
1288; GCN3-NEXT:    v_mov_b32_e32 v1, s5
1289; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1290; GCN3-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1291; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1292; GCN3-NEXT:    buffer_wbinvl1_vol
1293; GCN3-NEXT:    s_setpc_b64 s[30:31]
1294  %result = atomicrmw sub ptr %ptr, i32 %in seq_cst
1295  ret i32 %result
1296}
1297
; amdgpu_gfx variant with inreg arguments and a pointer advanced by 4 x i32
; (16 bytes); the atomicrmw sub result is returned. The bonaire and tonga
; checks expect s_add_u32/s_addc_u32 into s34/s35 before the copy to VGPRs; the
; gfx900 checks copy s4/s5 unchanged and use offset:16. Every target is checked
; for destination v0 and the glc modifier.
1298define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
1299; GCN1-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
1300; GCN1:       ; %bb.0:
1301; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1302; GCN1-NEXT:    s_add_u32 s34, s4, 16
1303; GCN1-NEXT:    s_addc_u32 s35, s5, 0
1304; GCN1-NEXT:    v_mov_b32_e32 v0, s34
1305; GCN1-NEXT:    v_mov_b32_e32 v1, s35
1306; GCN1-NEXT:    v_mov_b32_e32 v2, s6
1307; GCN1-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1308; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1309; GCN1-NEXT:    buffer_wbinvl1_vol
1310; GCN1-NEXT:    s_setpc_b64 s[30:31]
1311;
1312; GCN2-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
1313; GCN2:       ; %bb.0:
1314; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1315; GCN2-NEXT:    s_add_u32 s34, s4, 16
1316; GCN2-NEXT:    s_addc_u32 s35, s5, 0
1317; GCN2-NEXT:    v_mov_b32_e32 v0, s34
1318; GCN2-NEXT:    v_mov_b32_e32 v1, s35
1319; GCN2-NEXT:    v_mov_b32_e32 v2, s6
1320; GCN2-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1321; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1322; GCN2-NEXT:    buffer_wbinvl1_vol
1323; GCN2-NEXT:    s_setpc_b64 s[30:31]
1324;
1325; GCN3-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
1326; GCN3:       ; %bb.0:
1327; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1328; GCN3-NEXT:    v_mov_b32_e32 v0, s4
1329; GCN3-NEXT:    v_mov_b32_e32 v1, s5
1330; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1331; GCN3-NEXT:    flat_atomic_sub v0, v[0:1], v2 offset:16 glc
1332; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1333; GCN3-NEXT:    buffer_wbinvl1_vol
1334; GCN3-NEXT:    s_setpc_b64 s[30:31]
1335  %gep = getelementptr i32, ptr %out, i32 4
1336  %result = atomicrmw sub ptr %gep, i32 %in seq_cst
1337  ret i32 %result
1338}
1339
; seq_cst atomicrmw sub with an unused result, tagged with
; !amdgpu.no.remote.memory (node !0 is defined outside this function). The
; pointer is advanced by 4 x i32 (16 bytes) using an i64 GEP index. The checks
; expect a single flat_atomic_sub on every target: after a v_add/v_addc pair on
; bonaire and tonga, and with offset:16 on gfx900.
1340define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
1341; GCN1-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1342; GCN1:       ; %bb.0:
1343; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1344; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
1345; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1346; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
1347; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1348; GCN1-NEXT:    buffer_wbinvl1_vol
1349; GCN1-NEXT:    s_setpc_b64 s[30:31]
1350;
1351; GCN2-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1352; GCN2:       ; %bb.0:
1353; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1354; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1355; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1356; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
1357; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1358; GCN2-NEXT:    buffer_wbinvl1_vol
1359; GCN2-NEXT:    s_setpc_b64 s[30:31]
1360;
1361; GCN3-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1362; GCN3:       ; %bb.0:
1363; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1364; GCN3-NEXT:    flat_atomic_sub v[0:1], v2 offset:16
1365; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1366; GCN3-NEXT:    buffer_wbinvl1_vol
1367; GCN3-NEXT:    s_setpc_b64 s[30:31]
1368  %gep = getelementptr i32, ptr %out, i64 4
1369  %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1370  ret void
1371}
1372
; seq_cst atomicrmw sub whose result is returned, tagged with
; !amdgpu.no.remote.memory (node !0 is defined outside this function). The
; pointer is advanced by 4 x i32 (16 bytes) using an i64 GEP index. The checks
; expect a single returning flat_atomic_sub (destination v0, glc) on every
; target: after a v_add/v_addc pair on bonaire and tonga, and with offset:16
; on gfx900.
1373define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
1374; GCN1-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1375; GCN1:       ; %bb.0:
1376; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1377; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
1378; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1379; GCN1-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1380; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1381; GCN1-NEXT:    buffer_wbinvl1_vol
1382; GCN1-NEXT:    s_setpc_b64 s[30:31]
1383;
1384; GCN2-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1385; GCN2:       ; %bb.0:
1386; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1387; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1388; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1389; GCN2-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1390; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1391; GCN2-NEXT:    buffer_wbinvl1_vol
1392; GCN2-NEXT:    s_setpc_b64 s[30:31]
1393;
1394; GCN3-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1395; GCN3:       ; %bb.0:
1396; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1397; GCN3-NEXT:    flat_atomic_sub v0, v[0:1], v2 offset:16 glc
1398; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1399; GCN3-NEXT:    buffer_wbinvl1_vol
1400; GCN3-NEXT:    s_setpc_b64 s[30:31]
1401  %gep = getelementptr i32, ptr %out, i64 4
1402  %result = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1403  ret i32 %result
1404}
1405
1406; ---------------------------------------------------------------------
1407; atomicrmw and
1408; ---------------------------------------------------------------------
1409
; seq_cst atomicrmw and with an unused result, on an unmodified VGPR pointer.
; All three targets are checked for the same sequence: the non-returning
; flat_atomic_and on v[0:1] with operand v2, then s_waitcnt and
; buffer_wbinvl1_vol before the return.
1410define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) {
1411; GCN1-LABEL: flat_atomic_and_i32_noret:
1412; GCN1:       ; %bb.0:
1413; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1414; GCN1-NEXT:    flat_atomic_and v[0:1], v2
1415; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1416; GCN1-NEXT:    buffer_wbinvl1_vol
1417; GCN1-NEXT:    s_setpc_b64 s[30:31]
1418;
1419; GCN2-LABEL: flat_atomic_and_i32_noret:
1420; GCN2:       ; %bb.0:
1421; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1422; GCN2-NEXT:    flat_atomic_and v[0:1], v2
1423; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1424; GCN2-NEXT:    buffer_wbinvl1_vol
1425; GCN2-NEXT:    s_setpc_b64 s[30:31]
1426;
1427; GCN3-LABEL: flat_atomic_and_i32_noret:
1428; GCN3:       ; %bb.0:
1429; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1430; GCN3-NEXT:    flat_atomic_and v[0:1], v2
1431; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1432; GCN3-NEXT:    buffer_wbinvl1_vol
1433; GCN3-NEXT:    s_setpc_b64 s[30:31]
1434  %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst
1435  ret void
1436}
1437
; seq_cst atomicrmw and with an unused result, on a VGPR pointer advanced by
; 4 x i32 (16 bytes). The bonaire and tonga checks expect the 16 to be added to
; v[0:1] with a v_add/v_addc pair before the atomic; the gfx900 checks expect
; it folded into the instruction as offset:16.
1438define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) {
1439; GCN1-LABEL: flat_atomic_and_i32_noret_offset:
1440; GCN1:       ; %bb.0:
1441; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1442; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
1443; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1444; GCN1-NEXT:    flat_atomic_and v[0:1], v2
1445; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1446; GCN1-NEXT:    buffer_wbinvl1_vol
1447; GCN1-NEXT:    s_setpc_b64 s[30:31]
1448;
1449; GCN2-LABEL: flat_atomic_and_i32_noret_offset:
1450; GCN2:       ; %bb.0:
1451; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1452; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1453; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1454; GCN2-NEXT:    flat_atomic_and v[0:1], v2
1455; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1456; GCN2-NEXT:    buffer_wbinvl1_vol
1457; GCN2-NEXT:    s_setpc_b64 s[30:31]
1458;
1459; GCN3-LABEL: flat_atomic_and_i32_noret_offset:
1460; GCN3:       ; %bb.0:
1461; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1462; GCN3-NEXT:    flat_atomic_and v[0:1], v2 offset:16
1463; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1464; GCN3-NEXT:    buffer_wbinvl1_vol
1465; GCN3-NEXT:    s_setpc_b64 s[30:31]
1466  %gep = getelementptr i32, ptr %out, i32 4
1467  %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
1468  ret void
1469}
1470
; seq_cst atomicrmw and whose result is returned, on an unmodified VGPR pointer.
; All three targets are checked for the returning form of flat_atomic_and:
; destination v0 plus the glc modifier.
1471define i32 @flat_atomic_and_i32_ret(ptr %ptr, i32 %in) {
1472; GCN1-LABEL: flat_atomic_and_i32_ret:
1473; GCN1:       ; %bb.0:
1474; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1475; GCN1-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1476; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1477; GCN1-NEXT:    buffer_wbinvl1_vol
1478; GCN1-NEXT:    s_setpc_b64 s[30:31]
1479;
1480; GCN2-LABEL: flat_atomic_and_i32_ret:
1481; GCN2:       ; %bb.0:
1482; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1483; GCN2-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1484; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1485; GCN2-NEXT:    buffer_wbinvl1_vol
1486; GCN2-NEXT:    s_setpc_b64 s[30:31]
1487;
1488; GCN3-LABEL: flat_atomic_and_i32_ret:
1489; GCN3:       ; %bb.0:
1490; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1491; GCN3-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1492; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1493; GCN3-NEXT:    buffer_wbinvl1_vol
1494; GCN3-NEXT:    s_setpc_b64 s[30:31]
1495  %result = atomicrmw and ptr %ptr, i32 %in seq_cst
1496  ret i32 %result
1497}
1498
; seq_cst atomicrmw and whose result is returned, on a VGPR pointer advanced by
; 4 x i32 (16 bytes). The bonaire and tonga checks expect a v_add/v_addc pair
; on v[0:1] before the atomic; the gfx900 checks expect offset:16 on the
; instruction. Every target is checked for destination v0 and the glc modifier.
1499define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) {
1500; GCN1-LABEL: flat_atomic_and_i32_ret_offset:
1501; GCN1:       ; %bb.0:
1502; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1503; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
1504; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1505; GCN1-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1506; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1507; GCN1-NEXT:    buffer_wbinvl1_vol
1508; GCN1-NEXT:    s_setpc_b64 s[30:31]
1509;
1510; GCN2-LABEL: flat_atomic_and_i32_ret_offset:
1511; GCN2:       ; %bb.0:
1512; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1513; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1514; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1515; GCN2-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1516; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1517; GCN2-NEXT:    buffer_wbinvl1_vol
1518; GCN2-NEXT:    s_setpc_b64 s[30:31]
1519;
1520; GCN3-LABEL: flat_atomic_and_i32_ret_offset:
1521; GCN3:       ; %bb.0:
1522; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1523; GCN3-NEXT:    flat_atomic_and v0, v[0:1], v2 offset:16 glc
1524; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1525; GCN3-NEXT:    buffer_wbinvl1_vol
1526; GCN3-NEXT:    s_setpc_b64 s[30:31]
1527  %gep = getelementptr i32, ptr %out, i32 4
1528  %result = atomicrmw and ptr %gep, i32 %in seq_cst
1529  ret i32 %result
1530}
1531
; amdgpu_gfx variant with both arguments marked inreg; the atomicrmw and result
; is unused. The checks expect the pointer (s4, s5) and the operand (s6) to be
; copied into v0, v1 and v2 with v_mov_b32 before the flat_atomic_and, which is
; checked with VGPR operands only.
1532define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
1533; GCN1-LABEL: flat_atomic_and_i32_noret_scalar:
1534; GCN1:       ; %bb.0:
1535; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1536; GCN1-NEXT:    v_mov_b32_e32 v0, s4
1537; GCN1-NEXT:    v_mov_b32_e32 v1, s5
1538; GCN1-NEXT:    v_mov_b32_e32 v2, s6
1539; GCN1-NEXT:    flat_atomic_and v[0:1], v2
1540; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1541; GCN1-NEXT:    buffer_wbinvl1_vol
1542; GCN1-NEXT:    s_setpc_b64 s[30:31]
1543;
1544; GCN2-LABEL: flat_atomic_and_i32_noret_scalar:
1545; GCN2:       ; %bb.0:
1546; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1547; GCN2-NEXT:    v_mov_b32_e32 v0, s4
1548; GCN2-NEXT:    v_mov_b32_e32 v1, s5
1549; GCN2-NEXT:    v_mov_b32_e32 v2, s6
1550; GCN2-NEXT:    flat_atomic_and v[0:1], v2
1551; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1552; GCN2-NEXT:    buffer_wbinvl1_vol
1553; GCN2-NEXT:    s_setpc_b64 s[30:31]
1554;
1555; GCN3-LABEL: flat_atomic_and_i32_noret_scalar:
1556; GCN3:       ; %bb.0:
1557; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1558; GCN3-NEXT:    v_mov_b32_e32 v0, s4
1559; GCN3-NEXT:    v_mov_b32_e32 v1, s5
1560; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1561; GCN3-NEXT:    flat_atomic_and v[0:1], v2
1562; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1563; GCN3-NEXT:    buffer_wbinvl1_vol
1564; GCN3-NEXT:    s_setpc_b64 s[30:31]
1565  %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst
1566  ret void
1567}
1568
; Non-returning seq_cst `atomicrmw and` with inreg (SGPR) arguments, addressed
; 4 i32 elements past %out. On bonaire and tonga the expected output performs
; the +16 address add on the scalar unit (s_add_u32 / s_addc_u32 into s34/s35)
; and then copies the sum into VGPRs. On gfx900 the unmodified s4/s5 pointer is
; copied and the offset is encoded as offset:16 on the atomic.
1569define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
1570; GCN1-LABEL: flat_atomic_and_i32_noret_offset_scalar:
1571; GCN1:       ; %bb.0:
1572; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1573; GCN1-NEXT:    s_add_u32 s34, s4, 16
1574; GCN1-NEXT:    s_addc_u32 s35, s5, 0
1575; GCN1-NEXT:    v_mov_b32_e32 v0, s34
1576; GCN1-NEXT:    v_mov_b32_e32 v1, s35
1577; GCN1-NEXT:    v_mov_b32_e32 v2, s6
1578; GCN1-NEXT:    flat_atomic_and v[0:1], v2
1579; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1580; GCN1-NEXT:    buffer_wbinvl1_vol
1581; GCN1-NEXT:    s_setpc_b64 s[30:31]
1582;
1583; GCN2-LABEL: flat_atomic_and_i32_noret_offset_scalar:
1584; GCN2:       ; %bb.0:
1585; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1586; GCN2-NEXT:    s_add_u32 s34, s4, 16
1587; GCN2-NEXT:    s_addc_u32 s35, s5, 0
1588; GCN2-NEXT:    v_mov_b32_e32 v0, s34
1589; GCN2-NEXT:    v_mov_b32_e32 v1, s35
1590; GCN2-NEXT:    v_mov_b32_e32 v2, s6
1591; GCN2-NEXT:    flat_atomic_and v[0:1], v2
1592; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1593; GCN2-NEXT:    buffer_wbinvl1_vol
1594; GCN2-NEXT:    s_setpc_b64 s[30:31]
1595;
1596; GCN3-LABEL: flat_atomic_and_i32_noret_offset_scalar:
1597; GCN3:       ; %bb.0:
1598; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1599; GCN3-NEXT:    v_mov_b32_e32 v0, s4
1600; GCN3-NEXT:    v_mov_b32_e32 v1, s5
1601; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1602; GCN3-NEXT:    flat_atomic_and v[0:1], v2 offset:16
1603; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1604; GCN3-NEXT:    buffer_wbinvl1_vol
1605; GCN3-NEXT:    s_setpc_b64 s[30:31]
1606  %gep = getelementptr i32, ptr %out, i32 4
1607  %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
1608  ret void
1609}
1610
; Returning seq_cst `atomicrmw and` with inreg (SGPR) arguments. The expected
; output copies the pointer (s4/s5) and operand (s6) into VGPRs, then issues the
; glc form of flat_atomic_and whose destination v0 also serves as the function
; return register. All three targets expect the same sequence.
1611define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
1612; GCN1-LABEL: flat_atomic_and_i32_ret_scalar:
1613; GCN1:       ; %bb.0:
1614; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1615; GCN1-NEXT:    v_mov_b32_e32 v0, s4
1616; GCN1-NEXT:    v_mov_b32_e32 v1, s5
1617; GCN1-NEXT:    v_mov_b32_e32 v2, s6
1618; GCN1-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1619; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1620; GCN1-NEXT:    buffer_wbinvl1_vol
1621; GCN1-NEXT:    s_setpc_b64 s[30:31]
1622;
1623; GCN2-LABEL: flat_atomic_and_i32_ret_scalar:
1624; GCN2:       ; %bb.0:
1625; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1626; GCN2-NEXT:    v_mov_b32_e32 v0, s4
1627; GCN2-NEXT:    v_mov_b32_e32 v1, s5
1628; GCN2-NEXT:    v_mov_b32_e32 v2, s6
1629; GCN2-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1630; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1631; GCN2-NEXT:    buffer_wbinvl1_vol
1632; GCN2-NEXT:    s_setpc_b64 s[30:31]
1633;
1634; GCN3-LABEL: flat_atomic_and_i32_ret_scalar:
1635; GCN3:       ; %bb.0:
1636; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1637; GCN3-NEXT:    v_mov_b32_e32 v0, s4
1638; GCN3-NEXT:    v_mov_b32_e32 v1, s5
1639; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1640; GCN3-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1641; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1642; GCN3-NEXT:    buffer_wbinvl1_vol
1643; GCN3-NEXT:    s_setpc_b64 s[30:31]
1644  %result = atomicrmw and ptr %ptr, i32 %in seq_cst
1645  ret i32 %result
1646}
1647
; Returning seq_cst `atomicrmw and` with inreg (SGPR) arguments, addressed
; 4 i32 elements past %out. Bonaire and tonga are expected to compute the +16
; address with scalar adds into s34/s35 before copying to VGPRs; gfx900 is
; expected to copy s4/s5 unchanged and use offset:16. In every case the glc form
; returns the result in v0.
1648define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
1649; GCN1-LABEL: flat_atomic_and_i32_ret_offset_scalar:
1650; GCN1:       ; %bb.0:
1651; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1652; GCN1-NEXT:    s_add_u32 s34, s4, 16
1653; GCN1-NEXT:    s_addc_u32 s35, s5, 0
1654; GCN1-NEXT:    v_mov_b32_e32 v0, s34
1655; GCN1-NEXT:    v_mov_b32_e32 v1, s35
1656; GCN1-NEXT:    v_mov_b32_e32 v2, s6
1657; GCN1-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1658; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1659; GCN1-NEXT:    buffer_wbinvl1_vol
1660; GCN1-NEXT:    s_setpc_b64 s[30:31]
1661;
1662; GCN2-LABEL: flat_atomic_and_i32_ret_offset_scalar:
1663; GCN2:       ; %bb.0:
1664; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1665; GCN2-NEXT:    s_add_u32 s34, s4, 16
1666; GCN2-NEXT:    s_addc_u32 s35, s5, 0
1667; GCN2-NEXT:    v_mov_b32_e32 v0, s34
1668; GCN2-NEXT:    v_mov_b32_e32 v1, s35
1669; GCN2-NEXT:    v_mov_b32_e32 v2, s6
1670; GCN2-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1671; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1672; GCN2-NEXT:    buffer_wbinvl1_vol
1673; GCN2-NEXT:    s_setpc_b64 s[30:31]
1674;
1675; GCN3-LABEL: flat_atomic_and_i32_ret_offset_scalar:
1676; GCN3:       ; %bb.0:
1677; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1678; GCN3-NEXT:    v_mov_b32_e32 v0, s4
1679; GCN3-NEXT:    v_mov_b32_e32 v1, s5
1680; GCN3-NEXT:    v_mov_b32_e32 v2, s6
1681; GCN3-NEXT:    flat_atomic_and v0, v[0:1], v2 offset:16 glc
1682; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1683; GCN3-NEXT:    buffer_wbinvl1_vol
1684; GCN3-NEXT:    s_setpc_b64 s[30:31]
1685  %gep = getelementptr i32, ptr %out, i32 4
1686  %result = atomicrmw and ptr %gep, i32 %in seq_cst
1687  ret i32 %result
1688}
1689
; Non-returning seq_cst `atomicrmw and` at %out + 4 i32 elements, with the
; atomicrmw tagged !amdgpu.no.remote.memory. The expected output is still a
; single flat_atomic_and (VALU address add on bonaire and tonga, offset:16 on
; gfx900), i.e. no compare-and-swap loop is expected when the metadata is
; present.
; NOTE(review): metadata node !0 is defined outside this part of the file;
; confirm it is the expected empty node.
1690define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
1691; GCN1-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
1692; GCN1:       ; %bb.0:
1693; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1694; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
1695; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1696; GCN1-NEXT:    flat_atomic_and v[0:1], v2
1697; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1698; GCN1-NEXT:    buffer_wbinvl1_vol
1699; GCN1-NEXT:    s_setpc_b64 s[30:31]
1700;
1701; GCN2-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
1702; GCN2:       ; %bb.0:
1703; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1704; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1705; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1706; GCN2-NEXT:    flat_atomic_and v[0:1], v2
1707; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1708; GCN2-NEXT:    buffer_wbinvl1_vol
1709; GCN2-NEXT:    s_setpc_b64 s[30:31]
1710;
1711; GCN3-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
1712; GCN3:       ; %bb.0:
1713; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1714; GCN3-NEXT:    flat_atomic_and v[0:1], v2 offset:16
1715; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1716; GCN3-NEXT:    buffer_wbinvl1_vol
1717; GCN3-NEXT:    s_setpc_b64 s[30:31]
1718  %gep = getelementptr i32, ptr %out, i64 4
1719  %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1720  ret void
1721}
1722
; Returning seq_cst `atomicrmw and` at %out + 4 i32 elements, with the
; atomicrmw tagged !amdgpu.no.remote.memory. The expected instructions are the
; same as for flat_atomic_and_i32_ret_offset (which has no metadata): one
; flat_atomic_and with glc returning in v0, with the offset added on the VALU
; for bonaire and tonga and encoded as offset:16 for gfx900.
; NOTE(review): metadata node !0 is defined outside this part of the file;
; confirm it is the expected empty node.
1723define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
1724; GCN1-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
1725; GCN1:       ; %bb.0:
1726; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1727; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
1728; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1729; GCN1-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1730; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1731; GCN1-NEXT:    buffer_wbinvl1_vol
1732; GCN1-NEXT:    s_setpc_b64 s[30:31]
1733;
1734; GCN2-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
1735; GCN2:       ; %bb.0:
1736; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1737; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1738; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1739; GCN2-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1740; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1741; GCN2-NEXT:    buffer_wbinvl1_vol
1742; GCN2-NEXT:    s_setpc_b64 s[30:31]
1743;
1744; GCN3-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
1745; GCN3:       ; %bb.0:
1746; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1747; GCN3-NEXT:    flat_atomic_and v0, v[0:1], v2 offset:16 glc
1748; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1749; GCN3-NEXT:    buffer_wbinvl1_vol
1750; GCN3-NEXT:    s_setpc_b64 s[30:31]
1751  %gep = getelementptr i32, ptr %out, i64 4
1752  %result = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1753  ret i32 %result
1754}
1755
1756; ---------------------------------------------------------------------
1757; atomicrmw nand
1758; ---------------------------------------------------------------------
1759
; Non-returning seq_cst `atomicrmw nand` on a flat pointer. No single nand
; atomic appears in the expected output; instead every target expects a
; compare-and-swap loop (block %atomicrmw.start):
;   - flat_load_dword reads the initial value into v4 before the loop;
;   - v_and + v_not compute ~(loaded & %in) into v3;
;   - flat_atomic_cmpswap with glc submits the v[3:4] pair (new, expected) and
;     returns the value it found in v3;
;   - v_cmp_eq / s_or_b64 / s_andn2_b64 exec fold the comparison of the returned
;     value against the expected one into the s[4:5] mask and remove those lanes
;     from exec; s_cbranch_execnz repeats while any lane remains;
;   - the returned value becomes the next expected value (v_mov v4, v3).
; exec is restored from s[4:5] in %atomicrmw.end.
1760define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) {
1761; GCN1-LABEL: flat_atomic_nand_i32_noret:
1762; GCN1:       ; %bb.0:
1763; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1764; GCN1-NEXT:    flat_load_dword v4, v[0:1]
1765; GCN1-NEXT:    s_mov_b64 s[4:5], 0
1766; GCN1-NEXT:  .LBB50_1: ; %atomicrmw.start
1767; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
1768; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1769; GCN1-NEXT:    v_and_b32_e32 v3, v4, v2
1770; GCN1-NEXT:    v_not_b32_e32 v3, v3
1771; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1772; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1773; GCN1-NEXT:    buffer_wbinvl1_vol
1774; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1775; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1776; GCN1-NEXT:    v_mov_b32_e32 v4, v3
1777; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1778; GCN1-NEXT:    s_cbranch_execnz .LBB50_1
1779; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
1780; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
1781; GCN1-NEXT:    s_setpc_b64 s[30:31]
1782;
1783; GCN2-LABEL: flat_atomic_nand_i32_noret:
1784; GCN2:       ; %bb.0:
1785; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1786; GCN2-NEXT:    flat_load_dword v4, v[0:1]
1787; GCN2-NEXT:    s_mov_b64 s[4:5], 0
1788; GCN2-NEXT:  .LBB50_1: ; %atomicrmw.start
1789; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
1790; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1791; GCN2-NEXT:    v_and_b32_e32 v3, v4, v2
1792; GCN2-NEXT:    v_not_b32_e32 v3, v3
1793; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1794; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1795; GCN2-NEXT:    buffer_wbinvl1_vol
1796; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1797; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1798; GCN2-NEXT:    v_mov_b32_e32 v4, v3
1799; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1800; GCN2-NEXT:    s_cbranch_execnz .LBB50_1
1801; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
1802; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
1803; GCN2-NEXT:    s_setpc_b64 s[30:31]
1804;
1805; GCN3-LABEL: flat_atomic_nand_i32_noret:
1806; GCN3:       ; %bb.0:
1807; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1808; GCN3-NEXT:    flat_load_dword v4, v[0:1]
1809; GCN3-NEXT:    s_mov_b64 s[4:5], 0
1810; GCN3-NEXT:  .LBB50_1: ; %atomicrmw.start
1811; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
1812; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1813; GCN3-NEXT:    v_and_b32_e32 v3, v4, v2
1814; GCN3-NEXT:    v_not_b32_e32 v3, v3
1815; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1816; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1817; GCN3-NEXT:    buffer_wbinvl1_vol
1818; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1819; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1820; GCN3-NEXT:    v_mov_b32_e32 v4, v3
1821; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1822; GCN3-NEXT:    s_cbranch_execnz .LBB50_1
1823; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
1824; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
1825; GCN3-NEXT:    s_setpc_b64 s[30:31]
1826  %tmp0 = atomicrmw nand ptr %ptr, i32 %in seq_cst
1827  ret void
1828}
1829
; Non-returning seq_cst `atomicrmw nand` at %out + 4 i32 elements. The expected
; output is a compare-and-swap loop (initial flat_load_dword, v_and + v_not to
; form the new value, flat_atomic_cmpswap with glc, exec-mask loop control in
; s[4:5]). Bonaire and tonga add the 16-byte offset to v[0:1] once, ahead of the
; initial load; gfx900 keeps the base pointer and puts offset:16 on both the
; initial load and the cmpswap.
1830define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) {
1831; GCN1-LABEL: flat_atomic_nand_i32_noret_offset:
1832; GCN1:       ; %bb.0:
1833; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1834; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
1835; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1836; GCN1-NEXT:    flat_load_dword v4, v[0:1]
1837; GCN1-NEXT:    s_mov_b64 s[4:5], 0
1838; GCN1-NEXT:  .LBB51_1: ; %atomicrmw.start
1839; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
1840; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1841; GCN1-NEXT:    v_and_b32_e32 v3, v4, v2
1842; GCN1-NEXT:    v_not_b32_e32 v3, v3
1843; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1844; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1845; GCN1-NEXT:    buffer_wbinvl1_vol
1846; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1847; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1848; GCN1-NEXT:    v_mov_b32_e32 v4, v3
1849; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1850; GCN1-NEXT:    s_cbranch_execnz .LBB51_1
1851; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
1852; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
1853; GCN1-NEXT:    s_setpc_b64 s[30:31]
1854;
1855; GCN2-LABEL: flat_atomic_nand_i32_noret_offset:
1856; GCN2:       ; %bb.0:
1857; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1858; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1859; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1860; GCN2-NEXT:    flat_load_dword v4, v[0:1]
1861; GCN2-NEXT:    s_mov_b64 s[4:5], 0
1862; GCN2-NEXT:  .LBB51_1: ; %atomicrmw.start
1863; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
1864; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1865; GCN2-NEXT:    v_and_b32_e32 v3, v4, v2
1866; GCN2-NEXT:    v_not_b32_e32 v3, v3
1867; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1868; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1869; GCN2-NEXT:    buffer_wbinvl1_vol
1870; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1871; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1872; GCN2-NEXT:    v_mov_b32_e32 v4, v3
1873; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1874; GCN2-NEXT:    s_cbranch_execnz .LBB51_1
1875; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
1876; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
1877; GCN2-NEXT:    s_setpc_b64 s[30:31]
1878;
1879; GCN3-LABEL: flat_atomic_nand_i32_noret_offset:
1880; GCN3:       ; %bb.0:
1881; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1882; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
1883; GCN3-NEXT:    s_mov_b64 s[4:5], 0
1884; GCN3-NEXT:  .LBB51_1: ; %atomicrmw.start
1885; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
1886; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1887; GCN3-NEXT:    v_and_b32_e32 v3, v4, v2
1888; GCN3-NEXT:    v_not_b32_e32 v3, v3
1889; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
1890; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1891; GCN3-NEXT:    buffer_wbinvl1_vol
1892; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1893; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1894; GCN3-NEXT:    v_mov_b32_e32 v4, v3
1895; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1896; GCN3-NEXT:    s_cbranch_execnz .LBB51_1
1897; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
1898; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
1899; GCN3-NEXT:    s_setpc_b64 s[30:31]
1900  %gep = getelementptr i32, ptr %out, i32 4
1901  %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst
1902  ret void
1903}
1904
; Returning seq_cst `atomicrmw nand` on a flat pointer. The expected output is
; a compare-and-swap loop. Compared with the non-returning variant, the copy of
; the last observed value into the "expected" register (v_mov v4, v3) sits at
; the top of the loop, so v3 still holds the value returned by the final
; flat_atomic_cmpswap when the loop exits; it is then moved into v0 as the
; function result.
1905define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) {
1906; GCN1-LABEL: flat_atomic_nand_i32_ret:
1907; GCN1:       ; %bb.0:
1908; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1909; GCN1-NEXT:    flat_load_dword v3, v[0:1]
1910; GCN1-NEXT:    s_mov_b64 s[4:5], 0
1911; GCN1-NEXT:  .LBB52_1: ; %atomicrmw.start
1912; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
1913; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1914; GCN1-NEXT:    v_mov_b32_e32 v4, v3
1915; GCN1-NEXT:    v_and_b32_e32 v3, v4, v2
1916; GCN1-NEXT:    v_not_b32_e32 v3, v3
1917; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1918; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1919; GCN1-NEXT:    buffer_wbinvl1_vol
1920; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1921; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1922; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1923; GCN1-NEXT:    s_cbranch_execnz .LBB52_1
1924; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
1925; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
1926; GCN1-NEXT:    v_mov_b32_e32 v0, v3
1927; GCN1-NEXT:    s_setpc_b64 s[30:31]
1928;
1929; GCN2-LABEL: flat_atomic_nand_i32_ret:
1930; GCN2:       ; %bb.0:
1931; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1932; GCN2-NEXT:    flat_load_dword v3, v[0:1]
1933; GCN2-NEXT:    s_mov_b64 s[4:5], 0
1934; GCN2-NEXT:  .LBB52_1: ; %atomicrmw.start
1935; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
1936; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1937; GCN2-NEXT:    v_mov_b32_e32 v4, v3
1938; GCN2-NEXT:    v_and_b32_e32 v3, v4, v2
1939; GCN2-NEXT:    v_not_b32_e32 v3, v3
1940; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1941; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1942; GCN2-NEXT:    buffer_wbinvl1_vol
1943; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1944; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1945; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1946; GCN2-NEXT:    s_cbranch_execnz .LBB52_1
1947; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
1948; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
1949; GCN2-NEXT:    v_mov_b32_e32 v0, v3
1950; GCN2-NEXT:    s_setpc_b64 s[30:31]
1951;
1952; GCN3-LABEL: flat_atomic_nand_i32_ret:
1953; GCN3:       ; %bb.0:
1954; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1955; GCN3-NEXT:    flat_load_dword v3, v[0:1]
1956; GCN3-NEXT:    s_mov_b64 s[4:5], 0
1957; GCN3-NEXT:  .LBB52_1: ; %atomicrmw.start
1958; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
1959; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1960; GCN3-NEXT:    v_mov_b32_e32 v4, v3
1961; GCN3-NEXT:    v_and_b32_e32 v3, v4, v2
1962; GCN3-NEXT:    v_not_b32_e32 v3, v3
1963; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1964; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1965; GCN3-NEXT:    buffer_wbinvl1_vol
1966; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1967; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1968; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1969; GCN3-NEXT:    s_cbranch_execnz .LBB52_1
1970; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
1971; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
1972; GCN3-NEXT:    v_mov_b32_e32 v0, v3
1973; GCN3-NEXT:    s_setpc_b64 s[30:31]
1974  %result = atomicrmw nand ptr %ptr, i32 %in seq_cst
1975  ret i32 %result
1976}
1977
; Returning seq_cst `atomicrmw nand` at %out + 4 i32 elements, expected to
; lower to a compare-and-swap loop. Register use differs by target in the
; expected output:
;   - bonaire and tonga compute the offset address into v[3:4], which frees
;     v[0:1] to hold the (new, expected) pair; the cmpswap result lands directly
;     in v0, so no copy is needed before returning;
;   - gfx900 keeps the base pointer in v[0:1] with offset:16 on the load and the
;     cmpswap, works in v[3:4], and copies v3 into v0 after the loop.
1978define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) {
1979; GCN1-LABEL: flat_atomic_nand_i32_ret_offset:
1980; GCN1:       ; %bb.0:
1981; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1982; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
1983; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1984; GCN1-NEXT:    flat_load_dword v0, v[3:4]
1985; GCN1-NEXT:    s_mov_b64 s[4:5], 0
1986; GCN1-NEXT:  .LBB53_1: ; %atomicrmw.start
1987; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
1988; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1989; GCN1-NEXT:    v_mov_b32_e32 v1, v0
1990; GCN1-NEXT:    v_and_b32_e32 v0, v1, v2
1991; GCN1-NEXT:    v_not_b32_e32 v0, v0
1992; GCN1-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1993; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1994; GCN1-NEXT:    buffer_wbinvl1_vol
1995; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
1996; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1997; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1998; GCN1-NEXT:    s_cbranch_execnz .LBB53_1
1999; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
2000; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
2001; GCN1-NEXT:    s_setpc_b64 s[30:31]
2002;
2003; GCN2-LABEL: flat_atomic_nand_i32_ret_offset:
2004; GCN2:       ; %bb.0:
2005; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2006; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
2007; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
2008; GCN2-NEXT:    flat_load_dword v0, v[3:4]
2009; GCN2-NEXT:    s_mov_b64 s[4:5], 0
2010; GCN2-NEXT:  .LBB53_1: ; %atomicrmw.start
2011; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
2012; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2013; GCN2-NEXT:    v_mov_b32_e32 v1, v0
2014; GCN2-NEXT:    v_and_b32_e32 v0, v1, v2
2015; GCN2-NEXT:    v_not_b32_e32 v0, v0
2016; GCN2-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2017; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2018; GCN2-NEXT:    buffer_wbinvl1_vol
2019; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2020; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2021; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2022; GCN2-NEXT:    s_cbranch_execnz .LBB53_1
2023; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
2024; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
2025; GCN2-NEXT:    s_setpc_b64 s[30:31]
2026;
2027; GCN3-LABEL: flat_atomic_nand_i32_ret_offset:
2028; GCN3:       ; %bb.0:
2029; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2030; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
2031; GCN3-NEXT:    s_mov_b64 s[4:5], 0
2032; GCN3-NEXT:  .LBB53_1: ; %atomicrmw.start
2033; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
2034; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2035; GCN3-NEXT:    v_mov_b32_e32 v4, v3
2036; GCN3-NEXT:    v_and_b32_e32 v3, v4, v2
2037; GCN3-NEXT:    v_not_b32_e32 v3, v3
2038; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
2039; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2040; GCN3-NEXT:    buffer_wbinvl1_vol
2041; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2042; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2043; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2044; GCN3-NEXT:    s_cbranch_execnz .LBB53_1
2045; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
2046; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
2047; GCN3-NEXT:    v_mov_b32_e32 v0, v3
2048; GCN3-NEXT:    s_setpc_b64 s[30:31]
2049  %gep = getelementptr i32, ptr %out, i32 4
2050  %result = atomicrmw nand ptr %gep, i32 %in seq_cst
2051  ret i32 %result
2052}
2053
; Non-returning seq_cst `atomicrmw nand` with inreg (SGPR) arguments under the
; amdgpu_gfx calling convention. The expected output copies the pointer from
; s4/s5 into v[0:1], loads the initial value into v3, and runs a
; compare-and-swap loop. The operand stays in s6 and is used directly as the
; scalar source of v_and. The loop's exec bookkeeping uses s[34:35] here rather
; than the s[4:5] used by the non-inreg variants, since s4-s6 carry the
; arguments. All three targets expect the same sequence.
2054define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
2055; GCN1-LABEL: flat_atomic_nand_i32_noret_scalar:
2056; GCN1:       ; %bb.0:
2057; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2058; GCN1-NEXT:    v_mov_b32_e32 v0, s4
2059; GCN1-NEXT:    v_mov_b32_e32 v1, s5
2060; GCN1-NEXT:    flat_load_dword v3, v[0:1]
2061; GCN1-NEXT:    s_mov_b64 s[34:35], 0
2062; GCN1-NEXT:  .LBB54_1: ; %atomicrmw.start
2063; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
2064; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2065; GCN1-NEXT:    v_and_b32_e32 v2, s6, v3
2066; GCN1-NEXT:    v_not_b32_e32 v2, v2
2067; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2068; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2069; GCN1-NEXT:    buffer_wbinvl1_vol
2070; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2071; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2072; GCN1-NEXT:    v_mov_b32_e32 v3, v2
2073; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2074; GCN1-NEXT:    s_cbranch_execnz .LBB54_1
2075; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
2076; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
2077; GCN1-NEXT:    s_setpc_b64 s[30:31]
2078;
2079; GCN2-LABEL: flat_atomic_nand_i32_noret_scalar:
2080; GCN2:       ; %bb.0:
2081; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2082; GCN2-NEXT:    v_mov_b32_e32 v0, s4
2083; GCN2-NEXT:    v_mov_b32_e32 v1, s5
2084; GCN2-NEXT:    flat_load_dword v3, v[0:1]
2085; GCN2-NEXT:    s_mov_b64 s[34:35], 0
2086; GCN2-NEXT:  .LBB54_1: ; %atomicrmw.start
2087; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
2088; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2089; GCN2-NEXT:    v_and_b32_e32 v2, s6, v3
2090; GCN2-NEXT:    v_not_b32_e32 v2, v2
2091; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2092; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2093; GCN2-NEXT:    buffer_wbinvl1_vol
2094; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2095; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2096; GCN2-NEXT:    v_mov_b32_e32 v3, v2
2097; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2098; GCN2-NEXT:    s_cbranch_execnz .LBB54_1
2099; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
2100; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
2101; GCN2-NEXT:    s_setpc_b64 s[30:31]
2102;
2103; GCN3-LABEL: flat_atomic_nand_i32_noret_scalar:
2104; GCN3:       ; %bb.0:
2105; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2106; GCN3-NEXT:    v_mov_b32_e32 v0, s4
2107; GCN3-NEXT:    v_mov_b32_e32 v1, s5
2108; GCN3-NEXT:    flat_load_dword v3, v[0:1]
2109; GCN3-NEXT:    s_mov_b64 s[34:35], 0
2110; GCN3-NEXT:  .LBB54_1: ; %atomicrmw.start
2111; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
2112; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2113; GCN3-NEXT:    v_and_b32_e32 v2, s6, v3
2114; GCN3-NEXT:    v_not_b32_e32 v2, v2
2115; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2116; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2117; GCN3-NEXT:    buffer_wbinvl1_vol
2118; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2119; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2120; GCN3-NEXT:    v_mov_b32_e32 v3, v2
2121; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2122; GCN3-NEXT:    s_cbranch_execnz .LBB54_1
2123; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
2124; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
2125; GCN3-NEXT:    s_setpc_b64 s[30:31]
2126  %tmp0 = atomicrmw nand ptr %ptr, i32 %in seq_cst
2127  ret void
2128}
2129
; Non-returning seq_cst `atomicrmw nand` with inreg (SGPR) arguments at
; %out + 4 i32 elements, expected to lower to a compare-and-swap loop.
;   - Bonaire and tonga: the +16 address is formed with s_add_u32/s_addc_u32
;     into s34/s35 and copied to v[0:1]; s[34:35] is then overwritten with 0
;     and reused for the loop's exec bookkeeping.
;   - gfx900: s4/s5 are copied unchanged and offset:16 is applied to both the
;     initial load and the cmpswap.
2130define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
2131; GCN1-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
2132; GCN1:       ; %bb.0:
2133; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2134; GCN1-NEXT:    s_add_u32 s34, s4, 16
2135; GCN1-NEXT:    s_addc_u32 s35, s5, 0
2136; GCN1-NEXT:    v_mov_b32_e32 v0, s34
2137; GCN1-NEXT:    v_mov_b32_e32 v1, s35
2138; GCN1-NEXT:    flat_load_dword v3, v[0:1]
2139; GCN1-NEXT:    s_mov_b64 s[34:35], 0
2140; GCN1-NEXT:  .LBB55_1: ; %atomicrmw.start
2141; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
2142; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2143; GCN1-NEXT:    v_and_b32_e32 v2, s6, v3
2144; GCN1-NEXT:    v_not_b32_e32 v2, v2
2145; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2146; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2147; GCN1-NEXT:    buffer_wbinvl1_vol
2148; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2149; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2150; GCN1-NEXT:    v_mov_b32_e32 v3, v2
2151; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2152; GCN1-NEXT:    s_cbranch_execnz .LBB55_1
2153; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
2154; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
2155; GCN1-NEXT:    s_setpc_b64 s[30:31]
2156;
2157; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
2158; GCN2:       ; %bb.0:
2159; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2160; GCN2-NEXT:    s_add_u32 s34, s4, 16
2161; GCN2-NEXT:    s_addc_u32 s35, s5, 0
2162; GCN2-NEXT:    v_mov_b32_e32 v0, s34
2163; GCN2-NEXT:    v_mov_b32_e32 v1, s35
2164; GCN2-NEXT:    flat_load_dword v3, v[0:1]
2165; GCN2-NEXT:    s_mov_b64 s[34:35], 0
2166; GCN2-NEXT:  .LBB55_1: ; %atomicrmw.start
2167; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
2168; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2169; GCN2-NEXT:    v_and_b32_e32 v2, s6, v3
2170; GCN2-NEXT:    v_not_b32_e32 v2, v2
2171; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2172; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2173; GCN2-NEXT:    buffer_wbinvl1_vol
2174; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2175; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2176; GCN2-NEXT:    v_mov_b32_e32 v3, v2
2177; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2178; GCN2-NEXT:    s_cbranch_execnz .LBB55_1
2179; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
2180; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
2181; GCN2-NEXT:    s_setpc_b64 s[30:31]
2182;
2183; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
2184; GCN3:       ; %bb.0:
2185; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2186; GCN3-NEXT:    v_mov_b32_e32 v0, s4
2187; GCN3-NEXT:    v_mov_b32_e32 v1, s5
2188; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
2189; GCN3-NEXT:    s_mov_b64 s[34:35], 0
2190; GCN3-NEXT:  .LBB55_1: ; %atomicrmw.start
2191; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
2192; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2193; GCN3-NEXT:    v_and_b32_e32 v2, s6, v3
2194; GCN3-NEXT:    v_not_b32_e32 v2, v2
2195; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2196; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2197; GCN3-NEXT:    buffer_wbinvl1_vol
2198; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2199; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2200; GCN3-NEXT:    v_mov_b32_e32 v3, v2
2201; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2202; GCN3-NEXT:    s_cbranch_execnz .LBB55_1
2203; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
2204; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
2205; GCN3-NEXT:    s_setpc_b64 s[30:31]
2206  %gep = getelementptr i32, ptr %out, i32 4
2207  %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst
2208  ret void
2209}
2210
; Returning seq_cst `atomicrmw nand` with inreg (SGPR) arguments, expected to
; lower to a compare-and-swap loop. In the expected output:
;   - the pointer is first copied to v[0:1] for the initial flat_load_dword,
;     whose destination v0 overwrites the low half of that pair;
;   - the pointer is therefore copied again from s4/s5 into v[1:2] for use by
;     the cmpswap inside the loop;
;   - each iteration saves the last observed value in v4, builds the new value
;     ~(s6 & v4) in v3, and receives the cmpswap result in v0;
;   - v0 already holds the result at loop exit, so no final copy is emitted.
; All three targets expect the same sequence.
2211define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
2212; GCN1-LABEL: flat_atomic_nand_i32_ret_scalar:
2213; GCN1:       ; %bb.0:
2214; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2215; GCN1-NEXT:    v_mov_b32_e32 v0, s4
2216; GCN1-NEXT:    v_mov_b32_e32 v1, s5
2217; GCN1-NEXT:    flat_load_dword v0, v[0:1]
2218; GCN1-NEXT:    v_mov_b32_e32 v1, s4
2219; GCN1-NEXT:    s_mov_b64 s[34:35], 0
2220; GCN1-NEXT:    v_mov_b32_e32 v2, s5
2221; GCN1-NEXT:  .LBB56_1: ; %atomicrmw.start
2222; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
2223; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2224; GCN1-NEXT:    v_mov_b32_e32 v4, v0
2225; GCN1-NEXT:    v_and_b32_e32 v0, s6, v4
2226; GCN1-NEXT:    v_not_b32_e32 v3, v0
2227; GCN1-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2228; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2229; GCN1-NEXT:    buffer_wbinvl1_vol
2230; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
2231; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2232; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2233; GCN1-NEXT:    s_cbranch_execnz .LBB56_1
2234; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
2235; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
2236; GCN1-NEXT:    s_setpc_b64 s[30:31]
2237;
2238; GCN2-LABEL: flat_atomic_nand_i32_ret_scalar:
2239; GCN2:       ; %bb.0:
2240; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2241; GCN2-NEXT:    v_mov_b32_e32 v0, s4
2242; GCN2-NEXT:    v_mov_b32_e32 v1, s5
2243; GCN2-NEXT:    flat_load_dword v0, v[0:1]
2244; GCN2-NEXT:    v_mov_b32_e32 v1, s4
2245; GCN2-NEXT:    s_mov_b64 s[34:35], 0
2246; GCN2-NEXT:    v_mov_b32_e32 v2, s5
2247; GCN2-NEXT:  .LBB56_1: ; %atomicrmw.start
2248; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
2249; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2250; GCN2-NEXT:    v_mov_b32_e32 v4, v0
2251; GCN2-NEXT:    v_and_b32_e32 v0, s6, v4
2252; GCN2-NEXT:    v_not_b32_e32 v3, v0
2253; GCN2-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2254; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2255; GCN2-NEXT:    buffer_wbinvl1_vol
2256; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
2257; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2258; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2259; GCN2-NEXT:    s_cbranch_execnz .LBB56_1
2260; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
2261; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
2262; GCN2-NEXT:    s_setpc_b64 s[30:31]
2263;
2264; GCN3-LABEL: flat_atomic_nand_i32_ret_scalar:
2265; GCN3:       ; %bb.0:
2266; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2267; GCN3-NEXT:    v_mov_b32_e32 v0, s4
2268; GCN3-NEXT:    v_mov_b32_e32 v1, s5
2269; GCN3-NEXT:    flat_load_dword v0, v[0:1]
2270; GCN3-NEXT:    v_mov_b32_e32 v1, s4
2271; GCN3-NEXT:    s_mov_b64 s[34:35], 0
2272; GCN3-NEXT:    v_mov_b32_e32 v2, s5
2273; GCN3-NEXT:  .LBB56_1: ; %atomicrmw.start
2274; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
2275; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2276; GCN3-NEXT:    v_mov_b32_e32 v4, v0
2277; GCN3-NEXT:    v_and_b32_e32 v0, s6, v4
2278; GCN3-NEXT:    v_not_b32_e32 v3, v0
2279; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2280; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2281; GCN3-NEXT:    buffer_wbinvl1_vol
2282; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
2283; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2284; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2285; GCN3-NEXT:    s_cbranch_execnz .LBB56_1
2286; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
2287; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
2288; GCN3-NEXT:    s_setpc_b64 s[30:31]
2289  %result = atomicrmw nand ptr %ptr, i32 %in seq_cst
2290  ret i32 %result
2291}
2292
; Returning seq_cst nand on %out + 4 x i32 (16 bytes), with the pointer and the
; operand passed inreg under amdgpu_gfx; the checks read them from s[4:5] and s6.
; Every target is expected to expand the nand into a flat_load_dword followed by
; a flat_atomic_cmpswap retry loop, leaving the old value in v0. Only gfx900
; folds the 16-byte offset into the memory instructions; bonaire and tonga
; materialize the address with s_add_u32/s_addc_u32 first.
2293define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
2294; GCN1-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
2295; GCN1:       ; %bb.0:
2296; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2297; GCN1-NEXT:    s_add_u32 s34, s4, 16
2298; GCN1-NEXT:    s_addc_u32 s35, s5, 0
2299; GCN1-NEXT:    v_mov_b32_e32 v1, s34
2300; GCN1-NEXT:    v_mov_b32_e32 v2, s35
2301; GCN1-NEXT:    flat_load_dword v0, v[1:2]
2302; GCN1-NEXT:    s_mov_b64 s[34:35], 0
2303; GCN1-NEXT:  .LBB57_1: ; %atomicrmw.start
2304; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
2305; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2306; GCN1-NEXT:    v_mov_b32_e32 v4, v0
2307; GCN1-NEXT:    v_and_b32_e32 v0, s6, v4
2308; GCN1-NEXT:    v_not_b32_e32 v3, v0
2309; GCN1-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2310; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2311; GCN1-NEXT:    buffer_wbinvl1_vol
2312; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
2313; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2314; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2315; GCN1-NEXT:    s_cbranch_execnz .LBB57_1
2316; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
2317; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
2318; GCN1-NEXT:    s_setpc_b64 s[30:31]
2319;
2320; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
2321; GCN2:       ; %bb.0:
2322; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2323; GCN2-NEXT:    s_add_u32 s34, s4, 16
2324; GCN2-NEXT:    s_addc_u32 s35, s5, 0
2325; GCN2-NEXT:    v_mov_b32_e32 v1, s34
2326; GCN2-NEXT:    v_mov_b32_e32 v2, s35
2327; GCN2-NEXT:    flat_load_dword v0, v[1:2]
2328; GCN2-NEXT:    s_mov_b64 s[34:35], 0
2329; GCN2-NEXT:  .LBB57_1: ; %atomicrmw.start
2330; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
2331; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2332; GCN2-NEXT:    v_mov_b32_e32 v4, v0
2333; GCN2-NEXT:    v_and_b32_e32 v0, s6, v4
2334; GCN2-NEXT:    v_not_b32_e32 v3, v0
2335; GCN2-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2336; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2337; GCN2-NEXT:    buffer_wbinvl1_vol
2338; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
2339; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2340; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2341; GCN2-NEXT:    s_cbranch_execnz .LBB57_1
2342; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
2343; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
2344; GCN2-NEXT:    s_setpc_b64 s[30:31]
2345;
2346; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
2347; GCN3:       ; %bb.0:
2348; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2349; GCN3-NEXT:    v_mov_b32_e32 v0, s4
2350; GCN3-NEXT:    v_mov_b32_e32 v1, s5
2351; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
2352; GCN3-NEXT:    v_mov_b32_e32 v1, s4
2353; GCN3-NEXT:    s_mov_b64 s[34:35], 0
2354; GCN3-NEXT:    v_mov_b32_e32 v2, s5
2355; GCN3-NEXT:  .LBB57_1: ; %atomicrmw.start
2356; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
2357; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2358; GCN3-NEXT:    v_mov_b32_e32 v4, v0
2359; GCN3-NEXT:    v_and_b32_e32 v0, s6, v4
2360; GCN3-NEXT:    v_not_b32_e32 v3, v0
2361; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
2362; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2363; GCN3-NEXT:    buffer_wbinvl1_vol
2364; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
2365; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2366; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2367; GCN3-NEXT:    s_cbranch_execnz .LBB57_1
2368; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
2369; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
2370; GCN3-NEXT:    s_setpc_b64 s[30:31]
2371  %gep = getelementptr i32, ptr %out, i32 4
2372  %result = atomicrmw nand ptr %gep, i32 %in seq_cst
2373  ret i32 %result
2374}
2375
; Non-returning seq_cst nand on %out + 16 bytes, tagged with
; !amdgpu.no.remote.memory. Even with the metadata, the checks expect a
; flat_load_dword plus flat_atomic_cmpswap retry loop on every target; gfx900
; folds the offset into the memory instructions, bonaire and tonga add it with
; a VALU add/addc pair.
; NOTE(review): metadata node !0 is defined outside this part of the file.
2376define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
2377; GCN1-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2378; GCN1:       ; %bb.0:
2379; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2380; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
2381; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2382; GCN1-NEXT:    flat_load_dword v4, v[0:1]
2383; GCN1-NEXT:    s_mov_b64 s[4:5], 0
2384; GCN1-NEXT:  .LBB58_1: ; %atomicrmw.start
2385; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
2386; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2387; GCN1-NEXT:    v_and_b32_e32 v3, v4, v2
2388; GCN1-NEXT:    v_not_b32_e32 v3, v3
2389; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2390; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2391; GCN1-NEXT:    buffer_wbinvl1_vol
2392; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2393; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2394; GCN1-NEXT:    v_mov_b32_e32 v4, v3
2395; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2396; GCN1-NEXT:    s_cbranch_execnz .LBB58_1
2397; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
2398; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
2399; GCN1-NEXT:    s_setpc_b64 s[30:31]
2400;
2401; GCN2-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2402; GCN2:       ; %bb.0:
2403; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2404; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2405; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2406; GCN2-NEXT:    flat_load_dword v4, v[0:1]
2407; GCN2-NEXT:    s_mov_b64 s[4:5], 0
2408; GCN2-NEXT:  .LBB58_1: ; %atomicrmw.start
2409; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
2410; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2411; GCN2-NEXT:    v_and_b32_e32 v3, v4, v2
2412; GCN2-NEXT:    v_not_b32_e32 v3, v3
2413; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2414; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2415; GCN2-NEXT:    buffer_wbinvl1_vol
2416; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2417; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2418; GCN2-NEXT:    v_mov_b32_e32 v4, v3
2419; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2420; GCN2-NEXT:    s_cbranch_execnz .LBB58_1
2421; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
2422; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
2423; GCN2-NEXT:    s_setpc_b64 s[30:31]
2424;
2425; GCN3-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2426; GCN3:       ; %bb.0:
2427; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2428; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
2429; GCN3-NEXT:    s_mov_b64 s[4:5], 0
2430; GCN3-NEXT:  .LBB58_1: ; %atomicrmw.start
2431; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
2432; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2433; GCN3-NEXT:    v_and_b32_e32 v3, v4, v2
2434; GCN3-NEXT:    v_not_b32_e32 v3, v3
2435; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
2436; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2437; GCN3-NEXT:    buffer_wbinvl1_vol
2438; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2439; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2440; GCN3-NEXT:    v_mov_b32_e32 v4, v3
2441; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2442; GCN3-NEXT:    s_cbranch_execnz .LBB58_1
2443; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
2444; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
2445; GCN3-NEXT:    s_setpc_b64 s[30:31]
2446  %gep = getelementptr i32, ptr %out, i64 4
2447  %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
2448  ret void
2449}
2450
; Returning seq_cst nand on %out + 16 bytes, tagged with
; !amdgpu.no.remote.memory. Even with the metadata, the checks expect a
; flat_load_dword plus flat_atomic_cmpswap retry loop on every target. On
; bonaire and tonga the loop result already lives in v0; on gfx900 it is
; produced in v3 and copied to v0 after the loop.
; NOTE(review): metadata node !0 is defined outside this part of the file.
2451define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
2452; GCN1-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2453; GCN1:       ; %bb.0:
2454; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2455; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
2456; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
2457; GCN1-NEXT:    flat_load_dword v0, v[3:4]
2458; GCN1-NEXT:    s_mov_b64 s[4:5], 0
2459; GCN1-NEXT:  .LBB59_1: ; %atomicrmw.start
2460; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
2461; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2462; GCN1-NEXT:    v_mov_b32_e32 v1, v0
2463; GCN1-NEXT:    v_and_b32_e32 v0, v1, v2
2464; GCN1-NEXT:    v_not_b32_e32 v0, v0
2465; GCN1-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2466; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2467; GCN1-NEXT:    buffer_wbinvl1_vol
2468; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2469; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2470; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2471; GCN1-NEXT:    s_cbranch_execnz .LBB59_1
2472; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
2473; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
2474; GCN1-NEXT:    s_setpc_b64 s[30:31]
2475;
2476; GCN2-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2477; GCN2:       ; %bb.0:
2478; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2479; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
2480; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
2481; GCN2-NEXT:    flat_load_dword v0, v[3:4]
2482; GCN2-NEXT:    s_mov_b64 s[4:5], 0
2483; GCN2-NEXT:  .LBB59_1: ; %atomicrmw.start
2484; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
2485; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2486; GCN2-NEXT:    v_mov_b32_e32 v1, v0
2487; GCN2-NEXT:    v_and_b32_e32 v0, v1, v2
2488; GCN2-NEXT:    v_not_b32_e32 v0, v0
2489; GCN2-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2490; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2491; GCN2-NEXT:    buffer_wbinvl1_vol
2492; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2493; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2494; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2495; GCN2-NEXT:    s_cbranch_execnz .LBB59_1
2496; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
2497; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
2498; GCN2-NEXT:    s_setpc_b64 s[30:31]
2499;
2500; GCN3-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2501; GCN3:       ; %bb.0:
2502; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2503; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
2504; GCN3-NEXT:    s_mov_b64 s[4:5], 0
2505; GCN3-NEXT:  .LBB59_1: ; %atomicrmw.start
2506; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
2507; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2508; GCN3-NEXT:    v_mov_b32_e32 v4, v3
2509; GCN3-NEXT:    v_and_b32_e32 v3, v4, v2
2510; GCN3-NEXT:    v_not_b32_e32 v3, v3
2511; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
2512; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2513; GCN3-NEXT:    buffer_wbinvl1_vol
2514; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2515; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2516; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2517; GCN3-NEXT:    s_cbranch_execnz .LBB59_1
2518; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
2519; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
2520; GCN3-NEXT:    v_mov_b32_e32 v0, v3
2521; GCN3-NEXT:    s_setpc_b64 s[30:31]
2522  %gep = getelementptr i32, ptr %out, i64 4
2523  %result = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
2524  ret i32 %result
2525}
2526
2527; ---------------------------------------------------------------------
2528; atomicrmw or
2529; ---------------------------------------------------------------------
2530
; Non-returning seq_cst or directly on %ptr. Every target is expected to select
; a single flat_atomic_or without glc, followed by s_waitcnt and
; buffer_wbinvl1_vol before the return.
2531define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) {
2532; GCN1-LABEL: flat_atomic_or_i32_noret:
2533; GCN1:       ; %bb.0:
2534; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2535; GCN1-NEXT:    flat_atomic_or v[0:1], v2
2536; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2537; GCN1-NEXT:    buffer_wbinvl1_vol
2538; GCN1-NEXT:    s_setpc_b64 s[30:31]
2539;
2540; GCN2-LABEL: flat_atomic_or_i32_noret:
2541; GCN2:       ; %bb.0:
2542; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2543; GCN2-NEXT:    flat_atomic_or v[0:1], v2
2544; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2545; GCN2-NEXT:    buffer_wbinvl1_vol
2546; GCN2-NEXT:    s_setpc_b64 s[30:31]
2547;
2548; GCN3-LABEL: flat_atomic_or_i32_noret:
2549; GCN3:       ; %bb.0:
2550; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2551; GCN3-NEXT:    flat_atomic_or v[0:1], v2
2552; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2553; GCN3-NEXT:    buffer_wbinvl1_vol
2554; GCN3-NEXT:    s_setpc_b64 s[30:31]
2555  %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst
2556  ret void
2557}
2558
; Non-returning seq_cst or on %out + 4 x i32 (16 bytes). Bonaire and tonga are
; expected to add the offset with a VALU add/addc pair before flat_atomic_or;
; gfx900 encodes it as offset:16 on the atomic itself.
2559define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) {
2560; GCN1-LABEL: flat_atomic_or_i32_noret_offset:
2561; GCN1:       ; %bb.0:
2562; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2563; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
2564; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2565; GCN1-NEXT:    flat_atomic_or v[0:1], v2
2566; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2567; GCN1-NEXT:    buffer_wbinvl1_vol
2568; GCN1-NEXT:    s_setpc_b64 s[30:31]
2569;
2570; GCN2-LABEL: flat_atomic_or_i32_noret_offset:
2571; GCN2:       ; %bb.0:
2572; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2573; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2574; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2575; GCN2-NEXT:    flat_atomic_or v[0:1], v2
2576; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2577; GCN2-NEXT:    buffer_wbinvl1_vol
2578; GCN2-NEXT:    s_setpc_b64 s[30:31]
2579;
2580; GCN3-LABEL: flat_atomic_or_i32_noret_offset:
2581; GCN3:       ; %bb.0:
2582; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2583; GCN3-NEXT:    flat_atomic_or v[0:1], v2 offset:16
2584; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2585; GCN3-NEXT:    buffer_wbinvl1_vol
2586; GCN3-NEXT:    s_setpc_b64 s[30:31]
2587  %gep = getelementptr i32, ptr %out, i32 4
2588  %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
2589  ret void
2590}
2591
; Returning seq_cst or directly on %ptr. Every target is expected to select the
; glc form of flat_atomic_or, which writes the pre-operation value to v0.
2592define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) {
2593; GCN1-LABEL: flat_atomic_or_i32_ret:
2594; GCN1:       ; %bb.0:
2595; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2596; GCN1-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2597; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2598; GCN1-NEXT:    buffer_wbinvl1_vol
2599; GCN1-NEXT:    s_setpc_b64 s[30:31]
2600;
2601; GCN2-LABEL: flat_atomic_or_i32_ret:
2602; GCN2:       ; %bb.0:
2603; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2604; GCN2-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2605; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2606; GCN2-NEXT:    buffer_wbinvl1_vol
2607; GCN2-NEXT:    s_setpc_b64 s[30:31]
2608;
2609; GCN3-LABEL: flat_atomic_or_i32_ret:
2610; GCN3:       ; %bb.0:
2611; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2612; GCN3-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2613; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2614; GCN3-NEXT:    buffer_wbinvl1_vol
2615; GCN3-NEXT:    s_setpc_b64 s[30:31]
2616  %result = atomicrmw or ptr %ptr, i32 %in seq_cst
2617  ret i32 %result
2618}
2619
; Returning seq_cst or on %out + 4 x i32 (16 bytes). Bonaire and tonga are
; expected to add the offset with a VALU add/addc pair before the glc
; flat_atomic_or; gfx900 encodes it as offset:16 on the atomic itself.
2620define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) {
2621; GCN1-LABEL: flat_atomic_or_i32_ret_offset:
2622; GCN1:       ; %bb.0:
2623; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2624; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
2625; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2626; GCN1-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2627; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2628; GCN1-NEXT:    buffer_wbinvl1_vol
2629; GCN1-NEXT:    s_setpc_b64 s[30:31]
2630;
2631; GCN2-LABEL: flat_atomic_or_i32_ret_offset:
2632; GCN2:       ; %bb.0:
2633; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2634; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2635; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2636; GCN2-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2637; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2638; GCN2-NEXT:    buffer_wbinvl1_vol
2639; GCN2-NEXT:    s_setpc_b64 s[30:31]
2640;
2641; GCN3-LABEL: flat_atomic_or_i32_ret_offset:
2642; GCN3:       ; %bb.0:
2643; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2644; GCN3-NEXT:    flat_atomic_or v0, v[0:1], v2 offset:16 glc
2645; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2646; GCN3-NEXT:    buffer_wbinvl1_vol
2647; GCN3-NEXT:    s_setpc_b64 s[30:31]
2648  %gep = getelementptr i32, ptr %out, i32 4
2649  %result = atomicrmw or ptr %gep, i32 %in seq_cst
2650  ret i32 %result
2651}
2652
; Non-returning seq_cst or with the pointer and operand passed inreg under
; amdgpu_gfx. The checks expect s4, s5 and s6 to be copied into v0, v1 and v2
; before a single flat_atomic_or on every target.
2653define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
2654; GCN1-LABEL: flat_atomic_or_i32_noret_scalar:
2655; GCN1:       ; %bb.0:
2656; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2657; GCN1-NEXT:    v_mov_b32_e32 v0, s4
2658; GCN1-NEXT:    v_mov_b32_e32 v1, s5
2659; GCN1-NEXT:    v_mov_b32_e32 v2, s6
2660; GCN1-NEXT:    flat_atomic_or v[0:1], v2
2661; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2662; GCN1-NEXT:    buffer_wbinvl1_vol
2663; GCN1-NEXT:    s_setpc_b64 s[30:31]
2664;
2665; GCN2-LABEL: flat_atomic_or_i32_noret_scalar:
2666; GCN2:       ; %bb.0:
2667; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2668; GCN2-NEXT:    v_mov_b32_e32 v0, s4
2669; GCN2-NEXT:    v_mov_b32_e32 v1, s5
2670; GCN2-NEXT:    v_mov_b32_e32 v2, s6
2671; GCN2-NEXT:    flat_atomic_or v[0:1], v2
2672; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2673; GCN2-NEXT:    buffer_wbinvl1_vol
2674; GCN2-NEXT:    s_setpc_b64 s[30:31]
2675;
2676; GCN3-LABEL: flat_atomic_or_i32_noret_scalar:
2677; GCN3:       ; %bb.0:
2678; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2679; GCN3-NEXT:    v_mov_b32_e32 v0, s4
2680; GCN3-NEXT:    v_mov_b32_e32 v1, s5
2681; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2682; GCN3-NEXT:    flat_atomic_or v[0:1], v2
2683; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2684; GCN3-NEXT:    buffer_wbinvl1_vol
2685; GCN3-NEXT:    s_setpc_b64 s[30:31]
2686  %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst
2687  ret void
2688}
2689
; Non-returning seq_cst or on %out + 16 bytes with inreg (SGPR) arguments under
; amdgpu_gfx. Bonaire and tonga are expected to form the address in s[34:35]
; with s_add_u32/s_addc_u32 and then copy it to VGPRs; gfx900 copies s[4:5]
; unchanged and encodes offset:16 on the flat_atomic_or.
2690define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
2691; GCN1-LABEL: flat_atomic_or_i32_noret_offset_scalar:
2692; GCN1:       ; %bb.0:
2693; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2694; GCN1-NEXT:    s_add_u32 s34, s4, 16
2695; GCN1-NEXT:    s_addc_u32 s35, s5, 0
2696; GCN1-NEXT:    v_mov_b32_e32 v0, s34
2697; GCN1-NEXT:    v_mov_b32_e32 v1, s35
2698; GCN1-NEXT:    v_mov_b32_e32 v2, s6
2699; GCN1-NEXT:    flat_atomic_or v[0:1], v2
2700; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2701; GCN1-NEXT:    buffer_wbinvl1_vol
2702; GCN1-NEXT:    s_setpc_b64 s[30:31]
2703;
2704; GCN2-LABEL: flat_atomic_or_i32_noret_offset_scalar:
2705; GCN2:       ; %bb.0:
2706; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2707; GCN2-NEXT:    s_add_u32 s34, s4, 16
2708; GCN2-NEXT:    s_addc_u32 s35, s5, 0
2709; GCN2-NEXT:    v_mov_b32_e32 v0, s34
2710; GCN2-NEXT:    v_mov_b32_e32 v1, s35
2711; GCN2-NEXT:    v_mov_b32_e32 v2, s6
2712; GCN2-NEXT:    flat_atomic_or v[0:1], v2
2713; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2714; GCN2-NEXT:    buffer_wbinvl1_vol
2715; GCN2-NEXT:    s_setpc_b64 s[30:31]
2716;
2717; GCN3-LABEL: flat_atomic_or_i32_noret_offset_scalar:
2718; GCN3:       ; %bb.0:
2719; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2720; GCN3-NEXT:    v_mov_b32_e32 v0, s4
2721; GCN3-NEXT:    v_mov_b32_e32 v1, s5
2722; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2723; GCN3-NEXT:    flat_atomic_or v[0:1], v2 offset:16
2724; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2725; GCN3-NEXT:    buffer_wbinvl1_vol
2726; GCN3-NEXT:    s_setpc_b64 s[30:31]
2727  %gep = getelementptr i32, ptr %out, i32 4
2728  %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
2729  ret void
2730}
2731
; Returning seq_cst or with the pointer and operand passed inreg under
; amdgpu_gfx. The checks expect s4, s5 and s6 to be copied into v0, v1 and v2
; before a glc flat_atomic_or that writes the old value to v0.
2732define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
2733; GCN1-LABEL: flat_atomic_or_i32_ret_scalar:
2734; GCN1:       ; %bb.0:
2735; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2736; GCN1-NEXT:    v_mov_b32_e32 v0, s4
2737; GCN1-NEXT:    v_mov_b32_e32 v1, s5
2738; GCN1-NEXT:    v_mov_b32_e32 v2, s6
2739; GCN1-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2740; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2741; GCN1-NEXT:    buffer_wbinvl1_vol
2742; GCN1-NEXT:    s_setpc_b64 s[30:31]
2743;
2744; GCN2-LABEL: flat_atomic_or_i32_ret_scalar:
2745; GCN2:       ; %bb.0:
2746; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2747; GCN2-NEXT:    v_mov_b32_e32 v0, s4
2748; GCN2-NEXT:    v_mov_b32_e32 v1, s5
2749; GCN2-NEXT:    v_mov_b32_e32 v2, s6
2750; GCN2-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2751; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2752; GCN2-NEXT:    buffer_wbinvl1_vol
2753; GCN2-NEXT:    s_setpc_b64 s[30:31]
2754;
2755; GCN3-LABEL: flat_atomic_or_i32_ret_scalar:
2756; GCN3:       ; %bb.0:
2757; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2758; GCN3-NEXT:    v_mov_b32_e32 v0, s4
2759; GCN3-NEXT:    v_mov_b32_e32 v1, s5
2760; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2761; GCN3-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2762; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2763; GCN3-NEXT:    buffer_wbinvl1_vol
2764; GCN3-NEXT:    s_setpc_b64 s[30:31]
2765  %result = atomicrmw or ptr %ptr, i32 %in seq_cst
2766  ret i32 %result
2767}
2768
; Returning seq_cst or on %out + 16 bytes with inreg (SGPR) arguments under
; amdgpu_gfx. Bonaire and tonga are expected to form the address in s[34:35]
; with s_add_u32/s_addc_u32 and then copy it to VGPRs; gfx900 copies s[4:5]
; unchanged and encodes offset:16 on the glc flat_atomic_or.
2769define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
2770; GCN1-LABEL: flat_atomic_or_i32_ret_offset_scalar:
2771; GCN1:       ; %bb.0:
2772; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2773; GCN1-NEXT:    s_add_u32 s34, s4, 16
2774; GCN1-NEXT:    s_addc_u32 s35, s5, 0
2775; GCN1-NEXT:    v_mov_b32_e32 v0, s34
2776; GCN1-NEXT:    v_mov_b32_e32 v1, s35
2777; GCN1-NEXT:    v_mov_b32_e32 v2, s6
2778; GCN1-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2779; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2780; GCN1-NEXT:    buffer_wbinvl1_vol
2781; GCN1-NEXT:    s_setpc_b64 s[30:31]
2782;
2783; GCN2-LABEL: flat_atomic_or_i32_ret_offset_scalar:
2784; GCN2:       ; %bb.0:
2785; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2786; GCN2-NEXT:    s_add_u32 s34, s4, 16
2787; GCN2-NEXT:    s_addc_u32 s35, s5, 0
2788; GCN2-NEXT:    v_mov_b32_e32 v0, s34
2789; GCN2-NEXT:    v_mov_b32_e32 v1, s35
2790; GCN2-NEXT:    v_mov_b32_e32 v2, s6
2791; GCN2-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2792; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2793; GCN2-NEXT:    buffer_wbinvl1_vol
2794; GCN2-NEXT:    s_setpc_b64 s[30:31]
2795;
2796; GCN3-LABEL: flat_atomic_or_i32_ret_offset_scalar:
2797; GCN3:       ; %bb.0:
2798; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2799; GCN3-NEXT:    v_mov_b32_e32 v0, s4
2800; GCN3-NEXT:    v_mov_b32_e32 v1, s5
2801; GCN3-NEXT:    v_mov_b32_e32 v2, s6
2802; GCN3-NEXT:    flat_atomic_or v0, v[0:1], v2 offset:16 glc
2803; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2804; GCN3-NEXT:    buffer_wbinvl1_vol
2805; GCN3-NEXT:    s_setpc_b64 s[30:31]
2806  %gep = getelementptr i32, ptr %out, i32 4
2807  %result = atomicrmw or ptr %gep, i32 %in seq_cst
2808  ret i32 %result
2809}
2810
; Non-returning seq_cst or on %out + 16 bytes, tagged with
; !amdgpu.no.remote.memory. The expected instruction sequence on each target
; matches the one checked for flat_atomic_or_i32_noret_offset (a single
; flat_atomic_or, with the offset folded only on gfx900).
; NOTE(review): metadata node !0 is defined outside this part of the file.
2811define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
2812; GCN1-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
2813; GCN1:       ; %bb.0:
2814; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2815; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
2816; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2817; GCN1-NEXT:    flat_atomic_or v[0:1], v2
2818; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2819; GCN1-NEXT:    buffer_wbinvl1_vol
2820; GCN1-NEXT:    s_setpc_b64 s[30:31]
2821;
2822; GCN2-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
2823; GCN2:       ; %bb.0:
2824; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2825; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2826; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2827; GCN2-NEXT:    flat_atomic_or v[0:1], v2
2828; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2829; GCN2-NEXT:    buffer_wbinvl1_vol
2830; GCN2-NEXT:    s_setpc_b64 s[30:31]
2831;
2832; GCN3-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
2833; GCN3:       ; %bb.0:
2834; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2835; GCN3-NEXT:    flat_atomic_or v[0:1], v2 offset:16
2836; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2837; GCN3-NEXT:    buffer_wbinvl1_vol
2838; GCN3-NEXT:    s_setpc_b64 s[30:31]
2839  %gep = getelementptr i32, ptr %out, i64 4
2840  %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
2841  ret void
2842}
2843
; Returning seq_cst or on %out + 16 bytes, tagged with
; !amdgpu.no.remote.memory. The expected instruction sequence on each target
; matches the one checked for flat_atomic_or_i32_ret_offset (a single glc
; flat_atomic_or, with the offset folded only on gfx900).
; NOTE(review): metadata node !0 is defined outside this part of the file.
2844define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
2845; GCN1-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
2846; GCN1:       ; %bb.0:
2847; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2848; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
2849; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2850; GCN1-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2851; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2852; GCN1-NEXT:    buffer_wbinvl1_vol
2853; GCN1-NEXT:    s_setpc_b64 s[30:31]
2854;
2855; GCN2-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
2856; GCN2:       ; %bb.0:
2857; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2858; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2859; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2860; GCN2-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
2861; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2862; GCN2-NEXT:    buffer_wbinvl1_vol
2863; GCN2-NEXT:    s_setpc_b64 s[30:31]
2864;
2865; GCN3-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
2866; GCN3:       ; %bb.0:
2867; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2868; GCN3-NEXT:    flat_atomic_or v0, v[0:1], v2 offset:16 glc
2869; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2870; GCN3-NEXT:    buffer_wbinvl1_vol
2871; GCN3-NEXT:    s_setpc_b64 s[30:31]
2872  %gep = getelementptr i32, ptr %out, i64 4
2873  %result = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
2874  ret i32 %result
2875}
2876
2877; ---------------------------------------------------------------------
2878; atomicrmw xor
2879; ---------------------------------------------------------------------
2880
; Non-returning seq_cst xor directly on %ptr. Every target is expected to
; select a single flat_atomic_xor without glc, followed by s_waitcnt and
; buffer_wbinvl1_vol before the return.
2881define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) {
2882; GCN1-LABEL: flat_atomic_xor_i32_noret:
2883; GCN1:       ; %bb.0:
2884; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2885; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
2886; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2887; GCN1-NEXT:    buffer_wbinvl1_vol
2888; GCN1-NEXT:    s_setpc_b64 s[30:31]
2889;
2890; GCN2-LABEL: flat_atomic_xor_i32_noret:
2891; GCN2:       ; %bb.0:
2892; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2893; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
2894; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2895; GCN2-NEXT:    buffer_wbinvl1_vol
2896; GCN2-NEXT:    s_setpc_b64 s[30:31]
2897;
2898; GCN3-LABEL: flat_atomic_xor_i32_noret:
2899; GCN3:       ; %bb.0:
2900; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2901; GCN3-NEXT:    flat_atomic_xor v[0:1], v2
2902; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2903; GCN3-NEXT:    buffer_wbinvl1_vol
2904; GCN3-NEXT:    s_setpc_b64 s[30:31]
2905  %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst
2906  ret void
2907}
2908
; Non-returning seq_cst xor on %out + 4 x i32 (16 bytes). Bonaire and tonga are
; expected to add the offset with a VALU add/addc pair before flat_atomic_xor;
; gfx900 encodes it as offset:16 on the atomic itself.
2909define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) {
2910; GCN1-LABEL: flat_atomic_xor_i32_noret_offset:
2911; GCN1:       ; %bb.0:
2912; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2913; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
2914; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2915; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
2916; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2917; GCN1-NEXT:    buffer_wbinvl1_vol
2918; GCN1-NEXT:    s_setpc_b64 s[30:31]
2919;
2920; GCN2-LABEL: flat_atomic_xor_i32_noret_offset:
2921; GCN2:       ; %bb.0:
2922; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2923; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2924; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2925; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
2926; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2927; GCN2-NEXT:    buffer_wbinvl1_vol
2928; GCN2-NEXT:    s_setpc_b64 s[30:31]
2929;
2930; GCN3-LABEL: flat_atomic_xor_i32_noret_offset:
2931; GCN3:       ; %bb.0:
2932; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2933; GCN3-NEXT:    flat_atomic_xor v[0:1], v2 offset:16
2934; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2935; GCN3-NEXT:    buffer_wbinvl1_vol
2936; GCN3-NEXT:    s_setpc_b64 s[30:31]
2937  %gep = getelementptr i32, ptr %out, i32 4
2938  %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst
2939  ret void
2940}
2941
; Returning seq_cst xor directly on %ptr. Every target is expected to select
; the glc form of flat_atomic_xor, which writes the pre-operation value to v0.
2942define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) {
2943; GCN1-LABEL: flat_atomic_xor_i32_ret:
2944; GCN1:       ; %bb.0:
2945; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2946; GCN1-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
2947; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2948; GCN1-NEXT:    buffer_wbinvl1_vol
2949; GCN1-NEXT:    s_setpc_b64 s[30:31]
2950;
2951; GCN2-LABEL: flat_atomic_xor_i32_ret:
2952; GCN2:       ; %bb.0:
2953; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2954; GCN2-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
2955; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2956; GCN2-NEXT:    buffer_wbinvl1_vol
2957; GCN2-NEXT:    s_setpc_b64 s[30:31]
2958;
2959; GCN3-LABEL: flat_atomic_xor_i32_ret:
2960; GCN3:       ; %bb.0:
2961; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2962; GCN3-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
2963; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2964; GCN3-NEXT:    buffer_wbinvl1_vol
2965; GCN3-NEXT:    s_setpc_b64 s[30:31]
2966  %result = atomicrmw xor ptr %ptr, i32 %in seq_cst
2967  ret i32 %result
2968}
2969
; Returning seq_cst xor on %out + 4 x i32 (16 bytes). Bonaire and tonga are
; expected to add the offset with a VALU add/addc pair before the glc
; flat_atomic_xor; gfx900 encodes it as offset:16 on the atomic itself.
2970define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) {
2971; GCN1-LABEL: flat_atomic_xor_i32_ret_offset:
2972; GCN1:       ; %bb.0:
2973; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2974; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
2975; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2976; GCN1-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
2977; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2978; GCN1-NEXT:    buffer_wbinvl1_vol
2979; GCN1-NEXT:    s_setpc_b64 s[30:31]
2980;
2981; GCN2-LABEL: flat_atomic_xor_i32_ret_offset:
2982; GCN2:       ; %bb.0:
2983; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2984; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2985; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2986; GCN2-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
2987; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2988; GCN2-NEXT:    buffer_wbinvl1_vol
2989; GCN2-NEXT:    s_setpc_b64 s[30:31]
2990;
2991; GCN3-LABEL: flat_atomic_xor_i32_ret_offset:
2992; GCN3:       ; %bb.0:
2993; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2994; GCN3-NEXT:    flat_atomic_xor v0, v[0:1], v2 offset:16 glc
2995; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2996; GCN3-NEXT:    buffer_wbinvl1_vol
2997; GCN3-NEXT:    s_setpc_b64 s[30:31]
2998  %gep = getelementptr i32, ptr %out, i32 4
2999  %result = atomicrmw xor ptr %gep, i32 %in seq_cst
3000  ret i32 %result
3001}
3002
; Non-returning seq_cst xor using the amdgpu_gfx calling convention with both
; arguments marked `inreg`. The checks expect the pointer (s4, s5) and the
; operand (s6) to be copied into v0-v2 and then used by a single
; `flat_atomic_xor` that has no destination register.
3003define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
3004; GCN1-LABEL: flat_atomic_xor_i32_noret_scalar:
3005; GCN1:       ; %bb.0:
3006; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3007; GCN1-NEXT:    v_mov_b32_e32 v0, s4
3008; GCN1-NEXT:    v_mov_b32_e32 v1, s5
3009; GCN1-NEXT:    v_mov_b32_e32 v2, s6
3010; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
3011; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3012; GCN1-NEXT:    buffer_wbinvl1_vol
3013; GCN1-NEXT:    s_setpc_b64 s[30:31]
3014;
3015; GCN2-LABEL: flat_atomic_xor_i32_noret_scalar:
3016; GCN2:       ; %bb.0:
3017; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3018; GCN2-NEXT:    v_mov_b32_e32 v0, s4
3019; GCN2-NEXT:    v_mov_b32_e32 v1, s5
3020; GCN2-NEXT:    v_mov_b32_e32 v2, s6
3021; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
3022; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3023; GCN2-NEXT:    buffer_wbinvl1_vol
3024; GCN2-NEXT:    s_setpc_b64 s[30:31]
3025;
3026; GCN3-LABEL: flat_atomic_xor_i32_noret_scalar:
3027; GCN3:       ; %bb.0:
3028; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3029; GCN3-NEXT:    v_mov_b32_e32 v0, s4
3030; GCN3-NEXT:    v_mov_b32_e32 v1, s5
3031; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3032; GCN3-NEXT:    flat_atomic_xor v[0:1], v2
3033; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3034; GCN3-NEXT:    buffer_wbinvl1_vol
3035; GCN3-NEXT:    s_setpc_b64 s[30:31]
3036  %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst
3037  ret void
3038}
3039
; Non-returning seq_cst xor with `inreg` (SGPR) arguments, applied 4 x i32
; (16 bytes) past %out. The bonaire and tonga runs expect the offset to be
; added with s_add_u32 / s_addc_u32 into s[34:35] before the copy to VGPRs;
; the gfx900 run copies s4/s5 unchanged and uses `offset:16` on the atomic.
3040define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
3041; GCN1-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
3042; GCN1:       ; %bb.0:
3043; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3044; GCN1-NEXT:    s_add_u32 s34, s4, 16
3045; GCN1-NEXT:    s_addc_u32 s35, s5, 0
3046; GCN1-NEXT:    v_mov_b32_e32 v0, s34
3047; GCN1-NEXT:    v_mov_b32_e32 v1, s35
3048; GCN1-NEXT:    v_mov_b32_e32 v2, s6
3049; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
3050; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3051; GCN1-NEXT:    buffer_wbinvl1_vol
3052; GCN1-NEXT:    s_setpc_b64 s[30:31]
3053;
3054; GCN2-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
3055; GCN2:       ; %bb.0:
3056; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3057; GCN2-NEXT:    s_add_u32 s34, s4, 16
3058; GCN2-NEXT:    s_addc_u32 s35, s5, 0
3059; GCN2-NEXT:    v_mov_b32_e32 v0, s34
3060; GCN2-NEXT:    v_mov_b32_e32 v1, s35
3061; GCN2-NEXT:    v_mov_b32_e32 v2, s6
3062; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
3063; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3064; GCN2-NEXT:    buffer_wbinvl1_vol
3065; GCN2-NEXT:    s_setpc_b64 s[30:31]
3066;
3067; GCN3-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
3068; GCN3:       ; %bb.0:
3069; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3070; GCN3-NEXT:    v_mov_b32_e32 v0, s4
3071; GCN3-NEXT:    v_mov_b32_e32 v1, s5
3072; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3073; GCN3-NEXT:    flat_atomic_xor v[0:1], v2 offset:16
3074; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3075; GCN3-NEXT:    buffer_wbinvl1_vol
3076; GCN3-NEXT:    s_setpc_b64 s[30:31]
3077  %gep = getelementptr i32, ptr %out, i32 4
3078  %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst
3079  ret void
3080}
3081
; Returning seq_cst xor using the amdgpu_gfx calling convention with `inreg`
; arguments. The checks expect s4, s5 and s6 to be copied into v0-v2, followed
; by a single `flat_atomic_xor ... glc` whose destination register is v0.
3082define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
3083; GCN1-LABEL: flat_atomic_xor_i32_ret_scalar:
3084; GCN1:       ; %bb.0:
3085; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3086; GCN1-NEXT:    v_mov_b32_e32 v0, s4
3087; GCN1-NEXT:    v_mov_b32_e32 v1, s5
3088; GCN1-NEXT:    v_mov_b32_e32 v2, s6
3089; GCN1-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3090; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3091; GCN1-NEXT:    buffer_wbinvl1_vol
3092; GCN1-NEXT:    s_setpc_b64 s[30:31]
3093;
3094; GCN2-LABEL: flat_atomic_xor_i32_ret_scalar:
3095; GCN2:       ; %bb.0:
3096; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3097; GCN2-NEXT:    v_mov_b32_e32 v0, s4
3098; GCN2-NEXT:    v_mov_b32_e32 v1, s5
3099; GCN2-NEXT:    v_mov_b32_e32 v2, s6
3100; GCN2-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3101; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3102; GCN2-NEXT:    buffer_wbinvl1_vol
3103; GCN2-NEXT:    s_setpc_b64 s[30:31]
3104;
3105; GCN3-LABEL: flat_atomic_xor_i32_ret_scalar:
3106; GCN3:       ; %bb.0:
3107; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3108; GCN3-NEXT:    v_mov_b32_e32 v0, s4
3109; GCN3-NEXT:    v_mov_b32_e32 v1, s5
3110; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3111; GCN3-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3112; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3113; GCN3-NEXT:    buffer_wbinvl1_vol
3114; GCN3-NEXT:    s_setpc_b64 s[30:31]
3115  %result = atomicrmw xor ptr %ptr, i32 %in seq_cst
3116  ret i32 %result
3117}
3118
; Returning seq_cst xor with `inreg` (SGPR) arguments, applied 4 x i32
; (16 bytes) past %out. The bonaire and tonga runs expect a scalar add into
; s[34:35] before the copy to VGPRs; the gfx900 run uses `offset:16` on the
; atomic. All runs expect the `glc` form with v0 as the destination.
3119define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
3120; GCN1-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
3121; GCN1:       ; %bb.0:
3122; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3123; GCN1-NEXT:    s_add_u32 s34, s4, 16
3124; GCN1-NEXT:    s_addc_u32 s35, s5, 0
3125; GCN1-NEXT:    v_mov_b32_e32 v0, s34
3126; GCN1-NEXT:    v_mov_b32_e32 v1, s35
3127; GCN1-NEXT:    v_mov_b32_e32 v2, s6
3128; GCN1-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3129; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3130; GCN1-NEXT:    buffer_wbinvl1_vol
3131; GCN1-NEXT:    s_setpc_b64 s[30:31]
3132;
3133; GCN2-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
3134; GCN2:       ; %bb.0:
3135; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3136; GCN2-NEXT:    s_add_u32 s34, s4, 16
3137; GCN2-NEXT:    s_addc_u32 s35, s5, 0
3138; GCN2-NEXT:    v_mov_b32_e32 v0, s34
3139; GCN2-NEXT:    v_mov_b32_e32 v1, s35
3140; GCN2-NEXT:    v_mov_b32_e32 v2, s6
3141; GCN2-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3142; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3143; GCN2-NEXT:    buffer_wbinvl1_vol
3144; GCN2-NEXT:    s_setpc_b64 s[30:31]
3145;
3146; GCN3-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
3147; GCN3:       ; %bb.0:
3148; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3149; GCN3-NEXT:    v_mov_b32_e32 v0, s4
3150; GCN3-NEXT:    v_mov_b32_e32 v1, s5
3151; GCN3-NEXT:    v_mov_b32_e32 v2, s6
3152; GCN3-NEXT:    flat_atomic_xor v0, v[0:1], v2 offset:16 glc
3153; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3154; GCN3-NEXT:    buffer_wbinvl1_vol
3155; GCN3-NEXT:    s_setpc_b64 s[30:31]
3156  %gep = getelementptr i32, ptr %out, i32 4
3157  %result = atomicrmw xor ptr %gep, i32 %in seq_cst
3158  ret i32 %result
3159}
3160
; Non-returning seq_cst xor at a 16-byte offset whose atomicrmw carries
; !amdgpu.no.remote.memory metadata. Every run expects a single
; `flat_atomic_xor` with no destination register; the gfx900 run folds the
; offset into the instruction, the other two add it to the address first.
; NOTE(review): the !0 node is not defined near this function - presumably it
; is declared at the end of the file; confirm.
3161define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
3162; GCN1-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory:
3163; GCN1:       ; %bb.0:
3164; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3165; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
3166; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3167; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
3168; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3169; GCN1-NEXT:    buffer_wbinvl1_vol
3170; GCN1-NEXT:    s_setpc_b64 s[30:31]
3171;
3172; GCN2-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory:
3173; GCN2:       ; %bb.0:
3174; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3175; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3176; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3177; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
3178; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3179; GCN2-NEXT:    buffer_wbinvl1_vol
3180; GCN2-NEXT:    s_setpc_b64 s[30:31]
3181;
3182; GCN3-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory:
3183; GCN3:       ; %bb.0:
3184; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3185; GCN3-NEXT:    flat_atomic_xor v[0:1], v2 offset:16
3186; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3187; GCN3-NEXT:    buffer_wbinvl1_vol
3188; GCN3-NEXT:    s_setpc_b64 s[30:31]
3189  %gep = getelementptr i32, ptr %out, i64 4
3190  %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3191  ret void
3192}
3193
; Returning seq_cst xor at a 16-byte offset whose atomicrmw carries
; !amdgpu.no.remote.memory metadata. The expected instructions are the same as
; for @flat_atomic_xor_i32_ret_offset, which has no such metadata: a single
; `flat_atomic_xor ... glc` writing v0, with the offset either pre-added
; (bonaire, tonga) or encoded as `offset:16` (gfx900).
3194define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
3195; GCN1-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3196; GCN1:       ; %bb.0:
3197; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3198; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
3199; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3200; GCN1-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3201; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3202; GCN1-NEXT:    buffer_wbinvl1_vol
3203; GCN1-NEXT:    s_setpc_b64 s[30:31]
3204;
3205; GCN2-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3206; GCN2:       ; %bb.0:
3207; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3208; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3209; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3210; GCN2-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3211; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3212; GCN2-NEXT:    buffer_wbinvl1_vol
3213; GCN2-NEXT:    s_setpc_b64 s[30:31]
3214;
3215; GCN3-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3216; GCN3:       ; %bb.0:
3217; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3218; GCN3-NEXT:    flat_atomic_xor v0, v[0:1], v2 offset:16 glc
3219; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3220; GCN3-NEXT:    buffer_wbinvl1_vol
3221; GCN3-NEXT:    s_setpc_b64 s[30:31]
3222  %gep = getelementptr i32, ptr %out, i64 4
3223  %result = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3224  ret i32 %result
3225}
3226
3227; ---------------------------------------------------------------------
3228; atomicrmw max
3229; ---------------------------------------------------------------------
3230
; Non-returning seq_cst signed max through a flat pointer held in VGPRs.
; The checks do not expect a single atomic instruction here. They expect an
; initial flat_load_dword followed by a loop (%atomicrmw.start) that computes
; v_max_i32 and issues flat_atomic_cmpswap. Lanes whose returned value equals
; the value they expected are accumulated in s[4:5] and removed from exec; the
; loop repeats until exec is empty, after which exec is restored.
3231define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) {
3232; GCN1-LABEL: flat_atomic_max_i32_noret:
3233; GCN1:       ; %bb.0:
3234; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3235; GCN1-NEXT:    flat_load_dword v4, v[0:1]
3236; GCN1-NEXT:    s_mov_b64 s[4:5], 0
3237; GCN1-NEXT:  .LBB80_1: ; %atomicrmw.start
3238; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
3239; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3240; GCN1-NEXT:    v_max_i32_e32 v3, v4, v2
3241; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3242; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3243; GCN1-NEXT:    buffer_wbinvl1_vol
3244; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3245; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3246; GCN1-NEXT:    v_mov_b32_e32 v4, v3
3247; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3248; GCN1-NEXT:    s_cbranch_execnz .LBB80_1
3249; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
3250; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
3251; GCN1-NEXT:    s_setpc_b64 s[30:31]
3252;
3253; GCN2-LABEL: flat_atomic_max_i32_noret:
3254; GCN2:       ; %bb.0:
3255; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3256; GCN2-NEXT:    flat_load_dword v4, v[0:1]
3257; GCN2-NEXT:    s_mov_b64 s[4:5], 0
3258; GCN2-NEXT:  .LBB80_1: ; %atomicrmw.start
3259; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
3260; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3261; GCN2-NEXT:    v_max_i32_e32 v3, v4, v2
3262; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3263; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3264; GCN2-NEXT:    buffer_wbinvl1_vol
3265; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3266; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3267; GCN2-NEXT:    v_mov_b32_e32 v4, v3
3268; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3269; GCN2-NEXT:    s_cbranch_execnz .LBB80_1
3270; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
3271; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
3272; GCN2-NEXT:    s_setpc_b64 s[30:31]
3273;
3274; GCN3-LABEL: flat_atomic_max_i32_noret:
3275; GCN3:       ; %bb.0:
3276; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3277; GCN3-NEXT:    flat_load_dword v4, v[0:1]
3278; GCN3-NEXT:    s_mov_b64 s[4:5], 0
3279; GCN3-NEXT:  .LBB80_1: ; %atomicrmw.start
3280; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
3281; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3282; GCN3-NEXT:    v_max_i32_e32 v3, v4, v2
3283; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3284; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3285; GCN3-NEXT:    buffer_wbinvl1_vol
3286; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3287; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3288; GCN3-NEXT:    v_mov_b32_e32 v4, v3
3289; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3290; GCN3-NEXT:    s_cbranch_execnz .LBB80_1
3291; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
3292; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
3293; GCN3-NEXT:    s_setpc_b64 s[30:31]
3294  %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst
3295  ret void
3296}
3297
; Non-returning seq_cst signed max applied 4 x i32 (16 bytes) past %out.
; The checks expect a flat_load_dword followed by a v_max_i32 /
; flat_atomic_cmpswap loop rather than a single atomic instruction. The
; bonaire and tonga runs add 16 to the address once before the loop; the
; gfx900 run puts `offset:16` on both the initial load and the cmpswap.
3298define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) {
3299; GCN1-LABEL: flat_atomic_max_i32_noret_offset:
3300; GCN1:       ; %bb.0:
3301; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3302; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
3303; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3304; GCN1-NEXT:    flat_load_dword v4, v[0:1]
3305; GCN1-NEXT:    s_mov_b64 s[4:5], 0
3306; GCN1-NEXT:  .LBB81_1: ; %atomicrmw.start
3307; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
3308; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3309; GCN1-NEXT:    v_max_i32_e32 v3, v4, v2
3310; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3311; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3312; GCN1-NEXT:    buffer_wbinvl1_vol
3313; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3314; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3315; GCN1-NEXT:    v_mov_b32_e32 v4, v3
3316; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3317; GCN1-NEXT:    s_cbranch_execnz .LBB81_1
3318; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
3319; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
3320; GCN1-NEXT:    s_setpc_b64 s[30:31]
3321;
3322; GCN2-LABEL: flat_atomic_max_i32_noret_offset:
3323; GCN2:       ; %bb.0:
3324; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3325; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3326; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3327; GCN2-NEXT:    flat_load_dword v4, v[0:1]
3328; GCN2-NEXT:    s_mov_b64 s[4:5], 0
3329; GCN2-NEXT:  .LBB81_1: ; %atomicrmw.start
3330; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
3331; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3332; GCN2-NEXT:    v_max_i32_e32 v3, v4, v2
3333; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3334; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3335; GCN2-NEXT:    buffer_wbinvl1_vol
3336; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3337; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3338; GCN2-NEXT:    v_mov_b32_e32 v4, v3
3339; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3340; GCN2-NEXT:    s_cbranch_execnz .LBB81_1
3341; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
3342; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
3343; GCN2-NEXT:    s_setpc_b64 s[30:31]
3344;
3345; GCN3-LABEL: flat_atomic_max_i32_noret_offset:
3346; GCN3:       ; %bb.0:
3347; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3348; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
3349; GCN3-NEXT:    s_mov_b64 s[4:5], 0
3350; GCN3-NEXT:  .LBB81_1: ; %atomicrmw.start
3351; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
3352; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3353; GCN3-NEXT:    v_max_i32_e32 v3, v4, v2
3354; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
3355; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3356; GCN3-NEXT:    buffer_wbinvl1_vol
3357; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3358; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3359; GCN3-NEXT:    v_mov_b32_e32 v4, v3
3360; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3361; GCN3-NEXT:    s_cbranch_execnz .LBB81_1
3362; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
3363; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
3364; GCN3-NEXT:    s_setpc_b64 s[30:31]
3365  %gep = getelementptr i32, ptr %out, i32 4
3366  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
3367  ret void
3368}
3369
; Returning seq_cst signed max through a flat pointer held in VGPRs.
; The checks expect a flat_load_dword followed by a v_max_i32 /
; flat_atomic_cmpswap loop rather than a single atomic instruction. Each
; iteration saves the previously observed value in v4 before computing the
; new one in v3. After the loop, the last cmpswap result (v3) is copied to v0.
3370define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) {
3371; GCN1-LABEL: flat_atomic_max_i32_ret:
3372; GCN1:       ; %bb.0:
3373; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3374; GCN1-NEXT:    flat_load_dword v3, v[0:1]
3375; GCN1-NEXT:    s_mov_b64 s[4:5], 0
3376; GCN1-NEXT:  .LBB82_1: ; %atomicrmw.start
3377; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
3378; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3379; GCN1-NEXT:    v_mov_b32_e32 v4, v3
3380; GCN1-NEXT:    v_max_i32_e32 v3, v4, v2
3381; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3382; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3383; GCN1-NEXT:    buffer_wbinvl1_vol
3384; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3385; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3386; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3387; GCN1-NEXT:    s_cbranch_execnz .LBB82_1
3388; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
3389; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
3390; GCN1-NEXT:    v_mov_b32_e32 v0, v3
3391; GCN1-NEXT:    s_setpc_b64 s[30:31]
3392;
3393; GCN2-LABEL: flat_atomic_max_i32_ret:
3394; GCN2:       ; %bb.0:
3395; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3396; GCN2-NEXT:    flat_load_dword v3, v[0:1]
3397; GCN2-NEXT:    s_mov_b64 s[4:5], 0
3398; GCN2-NEXT:  .LBB82_1: ; %atomicrmw.start
3399; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
3400; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3401; GCN2-NEXT:    v_mov_b32_e32 v4, v3
3402; GCN2-NEXT:    v_max_i32_e32 v3, v4, v2
3403; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3404; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3405; GCN2-NEXT:    buffer_wbinvl1_vol
3406; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3407; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3408; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3409; GCN2-NEXT:    s_cbranch_execnz .LBB82_1
3410; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
3411; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
3412; GCN2-NEXT:    v_mov_b32_e32 v0, v3
3413; GCN2-NEXT:    s_setpc_b64 s[30:31]
3414;
3415; GCN3-LABEL: flat_atomic_max_i32_ret:
3416; GCN3:       ; %bb.0:
3417; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3418; GCN3-NEXT:    flat_load_dword v3, v[0:1]
3419; GCN3-NEXT:    s_mov_b64 s[4:5], 0
3420; GCN3-NEXT:  .LBB82_1: ; %atomicrmw.start
3421; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
3422; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3423; GCN3-NEXT:    v_mov_b32_e32 v4, v3
3424; GCN3-NEXT:    v_max_i32_e32 v3, v4, v2
3425; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3426; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3427; GCN3-NEXT:    buffer_wbinvl1_vol
3428; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3429; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3430; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3431; GCN3-NEXT:    s_cbranch_execnz .LBB82_1
3432; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
3433; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
3434; GCN3-NEXT:    v_mov_b32_e32 v0, v3
3435; GCN3-NEXT:    s_setpc_b64 s[30:31]
3436  %result = atomicrmw max ptr %ptr, i32 %in seq_cst
3437  ret i32 %result
3438}
3439
; Returning seq_cst signed max applied 4 x i32 (16 bytes) past %out.
; The checks expect a flat_load_dword followed by a v_max_i32 /
; flat_atomic_cmpswap loop. In the bonaire and tonga runs the offset address
; is built in v[3:4] and the loop works directly in v0/v1, so no copy to v0
; follows the loop. The gfx900 run keeps the pointer in v[0:1], uses
; `offset:16` on the load and cmpswap, and copies v3 to v0 after the loop.
3440define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) {
3441; GCN1-LABEL: flat_atomic_max_i32_ret_offset:
3442; GCN1:       ; %bb.0:
3443; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3444; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
3445; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
3446; GCN1-NEXT:    flat_load_dword v0, v[3:4]
3447; GCN1-NEXT:    s_mov_b64 s[4:5], 0
3448; GCN1-NEXT:  .LBB83_1: ; %atomicrmw.start
3449; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
3450; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3451; GCN1-NEXT:    v_mov_b32_e32 v1, v0
3452; GCN1-NEXT:    v_max_i32_e32 v0, v1, v2
3453; GCN1-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
3454; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3455; GCN1-NEXT:    buffer_wbinvl1_vol
3456; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
3457; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3458; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3459; GCN1-NEXT:    s_cbranch_execnz .LBB83_1
3460; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
3461; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
3462; GCN1-NEXT:    s_setpc_b64 s[30:31]
3463;
3464; GCN2-LABEL: flat_atomic_max_i32_ret_offset:
3465; GCN2:       ; %bb.0:
3466; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3467; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
3468; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
3469; GCN2-NEXT:    flat_load_dword v0, v[3:4]
3470; GCN2-NEXT:    s_mov_b64 s[4:5], 0
3471; GCN2-NEXT:  .LBB83_1: ; %atomicrmw.start
3472; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
3473; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3474; GCN2-NEXT:    v_mov_b32_e32 v1, v0
3475; GCN2-NEXT:    v_max_i32_e32 v0, v1, v2
3476; GCN2-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
3477; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3478; GCN2-NEXT:    buffer_wbinvl1_vol
3479; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
3480; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3481; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3482; GCN2-NEXT:    s_cbranch_execnz .LBB83_1
3483; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
3484; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
3485; GCN2-NEXT:    s_setpc_b64 s[30:31]
3486;
3487; GCN3-LABEL: flat_atomic_max_i32_ret_offset:
3488; GCN3:       ; %bb.0:
3489; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3490; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
3491; GCN3-NEXT:    s_mov_b64 s[4:5], 0
3492; GCN3-NEXT:  .LBB83_1: ; %atomicrmw.start
3493; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
3494; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3495; GCN3-NEXT:    v_mov_b32_e32 v4, v3
3496; GCN3-NEXT:    v_max_i32_e32 v3, v4, v2
3497; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
3498; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3499; GCN3-NEXT:    buffer_wbinvl1_vol
3500; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3501; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3502; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3503; GCN3-NEXT:    s_cbranch_execnz .LBB83_1
3504; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
3505; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
3506; GCN3-NEXT:    v_mov_b32_e32 v0, v3
3507; GCN3-NEXT:    s_setpc_b64 s[30:31]
3508  %gep = getelementptr i32, ptr %out, i32 4
3509  %result = atomicrmw max ptr %gep, i32 %in seq_cst
3510  ret i32 %result
3511}
3512
; Non-returning seq_cst signed max using the amdgpu_gfx calling convention
; with `inreg` arguments. The checks expect the pointer (s4, s5) to be copied
; into v[0:1], then a flat_load_dword followed by a v_max_i32 /
; flat_atomic_cmpswap loop. The operand is read directly from s6 by v_max_i32,
; and the mask of finished lanes is accumulated in s[34:35].
3513define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
3514; GCN1-LABEL: flat_atomic_max_i32_noret_scalar:
3515; GCN1:       ; %bb.0:
3516; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3517; GCN1-NEXT:    v_mov_b32_e32 v0, s4
3518; GCN1-NEXT:    v_mov_b32_e32 v1, s5
3519; GCN1-NEXT:    flat_load_dword v3, v[0:1]
3520; GCN1-NEXT:    s_mov_b64 s[34:35], 0
3521; GCN1-NEXT:  .LBB84_1: ; %atomicrmw.start
3522; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
3523; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3524; GCN1-NEXT:    v_max_i32_e32 v2, s6, v3
3525; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3526; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3527; GCN1-NEXT:    buffer_wbinvl1_vol
3528; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3529; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3530; GCN1-NEXT:    v_mov_b32_e32 v3, v2
3531; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3532; GCN1-NEXT:    s_cbranch_execnz .LBB84_1
3533; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
3534; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
3535; GCN1-NEXT:    s_setpc_b64 s[30:31]
3536;
3537; GCN2-LABEL: flat_atomic_max_i32_noret_scalar:
3538; GCN2:       ; %bb.0:
3539; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3540; GCN2-NEXT:    v_mov_b32_e32 v0, s4
3541; GCN2-NEXT:    v_mov_b32_e32 v1, s5
3542; GCN2-NEXT:    flat_load_dword v3, v[0:1]
3543; GCN2-NEXT:    s_mov_b64 s[34:35], 0
3544; GCN2-NEXT:  .LBB84_1: ; %atomicrmw.start
3545; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
3546; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3547; GCN2-NEXT:    v_max_i32_e32 v2, s6, v3
3548; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3549; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3550; GCN2-NEXT:    buffer_wbinvl1_vol
3551; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3552; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3553; GCN2-NEXT:    v_mov_b32_e32 v3, v2
3554; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3555; GCN2-NEXT:    s_cbranch_execnz .LBB84_1
3556; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
3557; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
3558; GCN2-NEXT:    s_setpc_b64 s[30:31]
3559;
3560; GCN3-LABEL: flat_atomic_max_i32_noret_scalar:
3561; GCN3:       ; %bb.0:
3562; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3563; GCN3-NEXT:    v_mov_b32_e32 v0, s4
3564; GCN3-NEXT:    v_mov_b32_e32 v1, s5
3565; GCN3-NEXT:    flat_load_dword v3, v[0:1]
3566; GCN3-NEXT:    s_mov_b64 s[34:35], 0
3567; GCN3-NEXT:  .LBB84_1: ; %atomicrmw.start
3568; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
3569; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3570; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
3571; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3572; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3573; GCN3-NEXT:    buffer_wbinvl1_vol
3574; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3575; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3576; GCN3-NEXT:    v_mov_b32_e32 v3, v2
3577; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3578; GCN3-NEXT:    s_cbranch_execnz .LBB84_1
3579; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
3580; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
3581; GCN3-NEXT:    s_setpc_b64 s[30:31]
3582  %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst
3583  ret void
3584}
3585
; Non-returning seq_cst signed max with `inreg` (SGPR) arguments, applied
; 4 x i32 (16 bytes) past %out. The checks expect a flat_load_dword followed
; by a v_max_i32 / flat_atomic_cmpswap loop. The bonaire and tonga runs form
; the address in s[34:35] with a scalar add, copy it to v[0:1], and then reuse
; s[34:35] as the loop's lane mask. The gfx900 run copies s4/s5 unchanged and
; puts `offset:16` on both the load and the cmpswap.
3586define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
3587; GCN1-LABEL: flat_atomic_max_i32_noret_offset_scalar:
3588; GCN1:       ; %bb.0:
3589; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3590; GCN1-NEXT:    s_add_u32 s34, s4, 16
3591; GCN1-NEXT:    s_addc_u32 s35, s5, 0
3592; GCN1-NEXT:    v_mov_b32_e32 v0, s34
3593; GCN1-NEXT:    v_mov_b32_e32 v1, s35
3594; GCN1-NEXT:    flat_load_dword v3, v[0:1]
3595; GCN1-NEXT:    s_mov_b64 s[34:35], 0
3596; GCN1-NEXT:  .LBB85_1: ; %atomicrmw.start
3597; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
3598; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3599; GCN1-NEXT:    v_max_i32_e32 v2, s6, v3
3600; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3601; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3602; GCN1-NEXT:    buffer_wbinvl1_vol
3603; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3604; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3605; GCN1-NEXT:    v_mov_b32_e32 v3, v2
3606; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3607; GCN1-NEXT:    s_cbranch_execnz .LBB85_1
3608; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
3609; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
3610; GCN1-NEXT:    s_setpc_b64 s[30:31]
3611;
3612; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar:
3613; GCN2:       ; %bb.0:
3614; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3615; GCN2-NEXT:    s_add_u32 s34, s4, 16
3616; GCN2-NEXT:    s_addc_u32 s35, s5, 0
3617; GCN2-NEXT:    v_mov_b32_e32 v0, s34
3618; GCN2-NEXT:    v_mov_b32_e32 v1, s35
3619; GCN2-NEXT:    flat_load_dword v3, v[0:1]
3620; GCN2-NEXT:    s_mov_b64 s[34:35], 0
3621; GCN2-NEXT:  .LBB85_1: ; %atomicrmw.start
3622; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
3623; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3624; GCN2-NEXT:    v_max_i32_e32 v2, s6, v3
3625; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3626; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3627; GCN2-NEXT:    buffer_wbinvl1_vol
3628; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3629; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3630; GCN2-NEXT:    v_mov_b32_e32 v3, v2
3631; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3632; GCN2-NEXT:    s_cbranch_execnz .LBB85_1
3633; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
3634; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
3635; GCN2-NEXT:    s_setpc_b64 s[30:31]
3636;
3637; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar:
3638; GCN3:       ; %bb.0:
3639; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3640; GCN3-NEXT:    v_mov_b32_e32 v0, s4
3641; GCN3-NEXT:    v_mov_b32_e32 v1, s5
3642; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
3643; GCN3-NEXT:    s_mov_b64 s[34:35], 0
3644; GCN3-NEXT:  .LBB85_1: ; %atomicrmw.start
3645; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
3646; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3647; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
3648; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3649; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3650; GCN3-NEXT:    buffer_wbinvl1_vol
3651; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3652; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3653; GCN3-NEXT:    v_mov_b32_e32 v3, v2
3654; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3655; GCN3-NEXT:    s_cbranch_execnz .LBB85_1
3656; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
3657; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
3658; GCN3-NEXT:    s_setpc_b64 s[30:31]
3659  %gep = getelementptr i32, ptr %out, i32 4
3660  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
3661  ret void
3662}
3663
; Returning seq_cst signed max using the amdgpu_gfx calling convention with
; `inreg` arguments. The checks expect the initial flat_load_dword to
; overwrite v0 with the loaded value, so the pointer is copied from s4/s5 a
; second time into v[1:2] for use by the loop. The v_max_i32 /
; flat_atomic_cmpswap loop leaves its result in v0, with no copy after it.
3664define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
3665; GCN1-LABEL: flat_atomic_max_i32_ret_scalar:
3666; GCN1:       ; %bb.0:
3667; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3668; GCN1-NEXT:    v_mov_b32_e32 v0, s4
3669; GCN1-NEXT:    v_mov_b32_e32 v1, s5
3670; GCN1-NEXT:    flat_load_dword v0, v[0:1]
3671; GCN1-NEXT:    v_mov_b32_e32 v1, s4
3672; GCN1-NEXT:    s_mov_b64 s[34:35], 0
3673; GCN1-NEXT:    v_mov_b32_e32 v2, s5
3674; GCN1-NEXT:  .LBB86_1: ; %atomicrmw.start
3675; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
3676; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3677; GCN1-NEXT:    v_mov_b32_e32 v4, v0
3678; GCN1-NEXT:    v_max_i32_e32 v3, s6, v4
3679; GCN1-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
3680; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3681; GCN1-NEXT:    buffer_wbinvl1_vol
3682; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
3683; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3684; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3685; GCN1-NEXT:    s_cbranch_execnz .LBB86_1
3686; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
3687; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
3688; GCN1-NEXT:    s_setpc_b64 s[30:31]
3689;
3690; GCN2-LABEL: flat_atomic_max_i32_ret_scalar:
3691; GCN2:       ; %bb.0:
3692; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3693; GCN2-NEXT:    v_mov_b32_e32 v0, s4
3694; GCN2-NEXT:    v_mov_b32_e32 v1, s5
3695; GCN2-NEXT:    flat_load_dword v0, v[0:1]
3696; GCN2-NEXT:    v_mov_b32_e32 v1, s4
3697; GCN2-NEXT:    s_mov_b64 s[34:35], 0
3698; GCN2-NEXT:    v_mov_b32_e32 v2, s5
3699; GCN2-NEXT:  .LBB86_1: ; %atomicrmw.start
3700; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
3701; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3702; GCN2-NEXT:    v_mov_b32_e32 v4, v0
3703; GCN2-NEXT:    v_max_i32_e32 v3, s6, v4
3704; GCN2-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
3705; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3706; GCN2-NEXT:    buffer_wbinvl1_vol
3707; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
3708; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3709; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3710; GCN2-NEXT:    s_cbranch_execnz .LBB86_1
3711; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
3712; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
3713; GCN2-NEXT:    s_setpc_b64 s[30:31]
3714;
3715; GCN3-LABEL: flat_atomic_max_i32_ret_scalar:
3716; GCN3:       ; %bb.0:
3717; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3718; GCN3-NEXT:    v_mov_b32_e32 v0, s4
3719; GCN3-NEXT:    v_mov_b32_e32 v1, s5
3720; GCN3-NEXT:    flat_load_dword v0, v[0:1]
3721; GCN3-NEXT:    v_mov_b32_e32 v1, s4
3722; GCN3-NEXT:    s_mov_b64 s[34:35], 0
3723; GCN3-NEXT:    v_mov_b32_e32 v2, s5
3724; GCN3-NEXT:  .LBB86_1: ; %atomicrmw.start
3725; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
3726; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3727; GCN3-NEXT:    v_mov_b32_e32 v4, v0
3728; GCN3-NEXT:    v_max_i32_e32 v3, s6, v4
3729; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
3730; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3731; GCN3-NEXT:    buffer_wbinvl1_vol
3732; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
3733; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3734; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3735; GCN3-NEXT:    s_cbranch_execnz .LBB86_1
3736; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
3737; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
3738; GCN3-NEXT:    s_setpc_b64 s[30:31]
3739  %result = atomicrmw max ptr %ptr, i32 %in seq_cst
3740  ret i32 %result
3741}
3742
; Signed-max atomicrmw (seq_cst) through a flat pointer at a constant offset
; of 4 i32 elements (16 bytes); the atomicrmw result is returned. The function
; uses the amdgpu_gfx calling convention with both arguments marked inreg, and
; the expected code reads them from s4, s5 (pointer) and s6 (value).
; Every run line expects an initial flat_load_dword followed by a
; flat_atomic_cmpswap retry loop around v_max_i32. The bonaire and tonga checks
; add 16 to the address with s_add_u32 and s_addc_u32, while the gfx900 checks
; use a 16-byte immediate offset on both the load and the cmpswap.
3743define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
3744; GCN1-LABEL: flat_atomic_max_i32_ret_offset_scalar:
3745; GCN1:       ; %bb.0:
3746; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3747; GCN1-NEXT:    s_add_u32 s34, s4, 16
3748; GCN1-NEXT:    s_addc_u32 s35, s5, 0
3749; GCN1-NEXT:    v_mov_b32_e32 v1, s34
3750; GCN1-NEXT:    v_mov_b32_e32 v2, s35
3751; GCN1-NEXT:    flat_load_dword v0, v[1:2]
3752; GCN1-NEXT:    s_mov_b64 s[34:35], 0
3753; GCN1-NEXT:  .LBB87_1: ; %atomicrmw.start
3754; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
3755; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3756; GCN1-NEXT:    v_mov_b32_e32 v4, v0
3757; GCN1-NEXT:    v_max_i32_e32 v3, s6, v4
3758; GCN1-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
3759; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3760; GCN1-NEXT:    buffer_wbinvl1_vol
3761; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
3762; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3763; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3764; GCN1-NEXT:    s_cbranch_execnz .LBB87_1
3765; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
3766; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
3767; GCN1-NEXT:    s_setpc_b64 s[30:31]
3768;
3769; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar:
3770; GCN2:       ; %bb.0:
3771; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3772; GCN2-NEXT:    s_add_u32 s34, s4, 16
3773; GCN2-NEXT:    s_addc_u32 s35, s5, 0
3774; GCN2-NEXT:    v_mov_b32_e32 v1, s34
3775; GCN2-NEXT:    v_mov_b32_e32 v2, s35
3776; GCN2-NEXT:    flat_load_dword v0, v[1:2]
3777; GCN2-NEXT:    s_mov_b64 s[34:35], 0
3778; GCN2-NEXT:  .LBB87_1: ; %atomicrmw.start
3779; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
3780; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3781; GCN2-NEXT:    v_mov_b32_e32 v4, v0
3782; GCN2-NEXT:    v_max_i32_e32 v3, s6, v4
3783; GCN2-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
3784; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3785; GCN2-NEXT:    buffer_wbinvl1_vol
3786; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
3787; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3788; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3789; GCN2-NEXT:    s_cbranch_execnz .LBB87_1
3790; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
3791; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
3792; GCN2-NEXT:    s_setpc_b64 s[30:31]
3793;
3794; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar:
3795; GCN3:       ; %bb.0:
3796; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3797; GCN3-NEXT:    v_mov_b32_e32 v0, s4
3798; GCN3-NEXT:    v_mov_b32_e32 v1, s5
3799; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
3800; GCN3-NEXT:    v_mov_b32_e32 v1, s4
3801; GCN3-NEXT:    s_mov_b64 s[34:35], 0
3802; GCN3-NEXT:    v_mov_b32_e32 v2, s5
3803; GCN3-NEXT:  .LBB87_1: ; %atomicrmw.start
3804; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
3805; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3806; GCN3-NEXT:    v_mov_b32_e32 v4, v0
3807; GCN3-NEXT:    v_max_i32_e32 v3, s6, v4
3808; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
3809; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3810; GCN3-NEXT:    buffer_wbinvl1_vol
3811; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
3812; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
3813; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
3814; GCN3-NEXT:    s_cbranch_execnz .LBB87_1
3815; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
3816; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
3817; GCN3-NEXT:    s_setpc_b64 s[30:31]
3818  %gep = getelementptr i32, ptr %out, i32 4
3819  %result = atomicrmw max ptr %gep, i32 %in seq_cst
3820  ret i32 %result
3821}
3822
; Kernel variant of the signed-max atomicrmw (seq_cst). The address is %out
; indexed by the runtime i32 %index and then advanced by 4 more i32 elements
; (16 bytes); the atomicrmw result is unused.
; The checks expect the index to be sign-extended (s_ashr_i32 by 31) and scaled
; with a 64-bit shift left by 2 before being added to the base pointer, followed
; by a flat_load_dword and a flat_atomic_cmpswap retry loop around v_max_i32.
; The bonaire and tonga checks add the constant 16 with scalar adds, while the
; gfx900 checks use a 16-byte immediate offset on the load and the cmpswap.
3823define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %index) {
3824; GCN1-LABEL: atomic_max_i32_addr64_offset:
3825; GCN1:       ; %bb.0: ; %entry
3826; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3827; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3828; GCN1-NEXT:    s_ashr_i32 s5, s3, 31
3829; GCN1-NEXT:    s_mov_b32 s4, s3
3830; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
3831; GCN1-NEXT:    s_add_u32 s0, s0, s4
3832; GCN1-NEXT:    s_addc_u32 s1, s1, s5
3833; GCN1-NEXT:    s_add_u32 s0, s0, 16
3834; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3835; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3836; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3837; GCN1-NEXT:    flat_load_dword v3, v[0:1]
3838; GCN1-NEXT:    s_mov_b64 s[0:1], 0
3839; GCN1-NEXT:  .LBB88_1: ; %atomicrmw.start
3840; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
3841; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3842; GCN1-NEXT:    v_max_i32_e32 v2, s2, v3
3843; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3844; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3845; GCN1-NEXT:    buffer_wbinvl1_vol
3846; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3847; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3848; GCN1-NEXT:    v_mov_b32_e32 v3, v2
3849; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3850; GCN1-NEXT:    s_cbranch_execnz .LBB88_1
3851; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
3852; GCN1-NEXT:    s_endpgm
3853;
3854; GCN2-LABEL: atomic_max_i32_addr64_offset:
3855; GCN2:       ; %bb.0: ; %entry
3856; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3857; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3858; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
3859; GCN2-NEXT:    s_mov_b32 s4, s3
3860; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
3861; GCN2-NEXT:    s_add_u32 s0, s0, s4
3862; GCN2-NEXT:    s_addc_u32 s1, s1, s5
3863; GCN2-NEXT:    s_add_u32 s0, s0, 16
3864; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3865; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3866; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3867; GCN2-NEXT:    flat_load_dword v3, v[0:1]
3868; GCN2-NEXT:    s_mov_b64 s[0:1], 0
3869; GCN2-NEXT:  .LBB88_1: ; %atomicrmw.start
3870; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
3871; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3872; GCN2-NEXT:    v_max_i32_e32 v2, s2, v3
3873; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3874; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3875; GCN2-NEXT:    buffer_wbinvl1_vol
3876; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3877; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3878; GCN2-NEXT:    v_mov_b32_e32 v3, v2
3879; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3880; GCN2-NEXT:    s_cbranch_execnz .LBB88_1
3881; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
3882; GCN2-NEXT:    s_endpgm
3883;
3884; GCN3-LABEL: atomic_max_i32_addr64_offset:
3885; GCN3:       ; %bb.0: ; %entry
3886; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3887; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3888; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
3889; GCN3-NEXT:    s_mov_b32 s4, s3
3890; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
3891; GCN3-NEXT:    s_add_u32 s0, s0, s4
3892; GCN3-NEXT:    s_addc_u32 s1, s1, s5
3893; GCN3-NEXT:    v_mov_b32_e32 v0, s0
3894; GCN3-NEXT:    v_mov_b32_e32 v1, s1
3895; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
3896; GCN3-NEXT:    s_mov_b64 s[0:1], 0
3897; GCN3-NEXT:  .LBB88_1: ; %atomicrmw.start
3898; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
3899; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3900; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
3901; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3902; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3903; GCN3-NEXT:    buffer_wbinvl1_vol
3904; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3905; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3906; GCN3-NEXT:    v_mov_b32_e32 v3, v2
3907; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3908; GCN3-NEXT:    s_cbranch_execnz .LBB88_1
3909; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
3910; GCN3-NEXT:    s_endpgm
3911entry:
3912  %ptr = getelementptr i32, ptr %out, i32 %index
3913  %gep = getelementptr i32, ptr %ptr, i32 4
3914  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
3915  ret void
3916}
3917
; Kernel variant of the signed-max atomicrmw (seq_cst) whose result is used.
; The address is %out indexed by the runtime i32 %index plus 4 more i32
; elements (16 bytes), and the atomicrmw result is stored through %out2.
; The checks expect two kernel argument loads, the same sign-extend and
; shift-by-2 index scaling as the no-return variant, a flat_atomic_cmpswap
; retry loop around v_max_i32, and a flat_store_dword of the loop result after
; exec is restored. Only the gfx900 checks use a 16-byte immediate offset; the
; bonaire and tonga checks add 16 with scalar adds.
3918define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) {
3919; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
3920; GCN1:       ; %bb.0: ; %entry
3921; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
3922; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3923; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
3924; GCN1-NEXT:    s_ashr_i32 s5, s7, 31
3925; GCN1-NEXT:    s_mov_b32 s4, s7
3926; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
3927; GCN1-NEXT:    s_add_u32 s0, s0, s4
3928; GCN1-NEXT:    s_addc_u32 s1, s1, s5
3929; GCN1-NEXT:    s_add_u32 s0, s0, 16
3930; GCN1-NEXT:    s_addc_u32 s1, s1, 0
3931; GCN1-NEXT:    v_mov_b32_e32 v0, s0
3932; GCN1-NEXT:    v_mov_b32_e32 v1, s1
3933; GCN1-NEXT:    flat_load_dword v2, v[0:1]
3934; GCN1-NEXT:    s_mov_b64 s[0:1], 0
3935; GCN1-NEXT:  .LBB89_1: ; %atomicrmw.start
3936; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
3937; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3938; GCN1-NEXT:    v_mov_b32_e32 v3, v2
3939; GCN1-NEXT:    v_max_i32_e32 v2, s6, v3
3940; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3941; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3942; GCN1-NEXT:    buffer_wbinvl1_vol
3943; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3944; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3945; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3946; GCN1-NEXT:    s_cbranch_execnz .LBB89_1
3947; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
3948; GCN1-NEXT:    s_or_b64 exec, exec, s[0:1]
3949; GCN1-NEXT:    v_mov_b32_e32 v0, s2
3950; GCN1-NEXT:    v_mov_b32_e32 v1, s3
3951; GCN1-NEXT:    flat_store_dword v[0:1], v2
3952; GCN1-NEXT:    s_endpgm
3953;
3954; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
3955; GCN2:       ; %bb.0: ; %entry
3956; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3957; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3958; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
3959; GCN2-NEXT:    s_ashr_i32 s5, s7, 31
3960; GCN2-NEXT:    s_mov_b32 s4, s7
3961; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
3962; GCN2-NEXT:    s_add_u32 s0, s0, s4
3963; GCN2-NEXT:    s_addc_u32 s1, s1, s5
3964; GCN2-NEXT:    s_add_u32 s0, s0, 16
3965; GCN2-NEXT:    s_addc_u32 s1, s1, 0
3966; GCN2-NEXT:    v_mov_b32_e32 v0, s0
3967; GCN2-NEXT:    v_mov_b32_e32 v1, s1
3968; GCN2-NEXT:    flat_load_dword v2, v[0:1]
3969; GCN2-NEXT:    s_mov_b64 s[0:1], 0
3970; GCN2-NEXT:  .LBB89_1: ; %atomicrmw.start
3971; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
3972; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3973; GCN2-NEXT:    v_mov_b32_e32 v3, v2
3974; GCN2-NEXT:    v_max_i32_e32 v2, s6, v3
3975; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3976; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3977; GCN2-NEXT:    buffer_wbinvl1_vol
3978; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
3979; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3980; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3981; GCN2-NEXT:    s_cbranch_execnz .LBB89_1
3982; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
3983; GCN2-NEXT:    s_or_b64 exec, exec, s[0:1]
3984; GCN2-NEXT:    v_mov_b32_e32 v0, s2
3985; GCN2-NEXT:    v_mov_b32_e32 v1, s3
3986; GCN2-NEXT:    flat_store_dword v[0:1], v2
3987; GCN2-NEXT:    s_endpgm
3988;
3989; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
3990; GCN3:       ; %bb.0: ; %entry
3991; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3992; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3993; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
3994; GCN3-NEXT:    s_ashr_i32 s5, s7, 31
3995; GCN3-NEXT:    s_mov_b32 s4, s7
3996; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
3997; GCN3-NEXT:    s_add_u32 s0, s0, s4
3998; GCN3-NEXT:    s_addc_u32 s1, s1, s5
3999; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4000; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4001; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16
4002; GCN3-NEXT:    s_mov_b64 s[0:1], 0
4003; GCN3-NEXT:  .LBB89_1: ; %atomicrmw.start
4004; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4005; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4006; GCN3-NEXT:    v_mov_b32_e32 v3, v2
4007; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
4008; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4009; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4010; GCN3-NEXT:    buffer_wbinvl1_vol
4011; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4012; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4013; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4014; GCN3-NEXT:    s_cbranch_execnz .LBB89_1
4015; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4016; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
4017; GCN3-NEXT:    v_mov_b32_e32 v0, s2
4018; GCN3-NEXT:    v_mov_b32_e32 v1, s3
4019; GCN3-NEXT:    flat_store_dword v[0:1], v2
4020; GCN3-NEXT:    s_endpgm
4021entry:
4022  %ptr = getelementptr i32, ptr %out, i32 %index
4023  %gep = getelementptr i32, ptr %ptr, i32 4
4024  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
4025  store i32 %tmp0, ptr %out2
4026  ret void
4027}
4028
; Kernel variant of the signed-max atomicrmw (seq_cst) on %out indexed by the
; runtime i32 %index, with no additional constant offset; the atomicrmw result
; is unused.
; The checks expect the index to be sign-extended and shifted left by 2 before
; being added to the base pointer, then a flat_load_dword and a
; flat_atomic_cmpswap retry loop around v_max_i32. Apart from the kernel
; argument load offset (0x9 on bonaire, 0x24 on tonga and gfx900) the expected
; instruction sequence is the same for all three run lines.
4029define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) {
4030; GCN1-LABEL: atomic_max_i32_addr64:
4031; GCN1:       ; %bb.0: ; %entry
4032; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4033; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4034; GCN1-NEXT:    s_ashr_i32 s5, s3, 31
4035; GCN1-NEXT:    s_mov_b32 s4, s3
4036; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4037; GCN1-NEXT:    s_add_u32 s0, s0, s4
4038; GCN1-NEXT:    s_addc_u32 s1, s1, s5
4039; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4040; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4041; GCN1-NEXT:    flat_load_dword v3, v[0:1]
4042; GCN1-NEXT:    s_mov_b64 s[0:1], 0
4043; GCN1-NEXT:  .LBB90_1: ; %atomicrmw.start
4044; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4045; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4046; GCN1-NEXT:    v_max_i32_e32 v2, s2, v3
4047; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4048; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4049; GCN1-NEXT:    buffer_wbinvl1_vol
4050; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4051; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4052; GCN1-NEXT:    v_mov_b32_e32 v3, v2
4053; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4054; GCN1-NEXT:    s_cbranch_execnz .LBB90_1
4055; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4056; GCN1-NEXT:    s_endpgm
4057;
4058; GCN2-LABEL: atomic_max_i32_addr64:
4059; GCN2:       ; %bb.0: ; %entry
4060; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4061; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4062; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
4063; GCN2-NEXT:    s_mov_b32 s4, s3
4064; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4065; GCN2-NEXT:    s_add_u32 s0, s0, s4
4066; GCN2-NEXT:    s_addc_u32 s1, s1, s5
4067; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4068; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4069; GCN2-NEXT:    flat_load_dword v3, v[0:1]
4070; GCN2-NEXT:    s_mov_b64 s[0:1], 0
4071; GCN2-NEXT:  .LBB90_1: ; %atomicrmw.start
4072; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4073; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4074; GCN2-NEXT:    v_max_i32_e32 v2, s2, v3
4075; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4076; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4077; GCN2-NEXT:    buffer_wbinvl1_vol
4078; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4079; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4080; GCN2-NEXT:    v_mov_b32_e32 v3, v2
4081; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4082; GCN2-NEXT:    s_cbranch_execnz .LBB90_1
4083; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4084; GCN2-NEXT:    s_endpgm
4085;
4086; GCN3-LABEL: atomic_max_i32_addr64:
4087; GCN3:       ; %bb.0: ; %entry
4088; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4089; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4090; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
4091; GCN3-NEXT:    s_mov_b32 s4, s3
4092; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4093; GCN3-NEXT:    s_add_u32 s0, s0, s4
4094; GCN3-NEXT:    s_addc_u32 s1, s1, s5
4095; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4096; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4097; GCN3-NEXT:    flat_load_dword v3, v[0:1]
4098; GCN3-NEXT:    s_mov_b64 s[0:1], 0
4099; GCN3-NEXT:  .LBB90_1: ; %atomicrmw.start
4100; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4101; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4102; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
4103; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4104; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4105; GCN3-NEXT:    buffer_wbinvl1_vol
4106; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4107; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4108; GCN3-NEXT:    v_mov_b32_e32 v3, v2
4109; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4110; GCN3-NEXT:    s_cbranch_execnz .LBB90_1
4111; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4112; GCN3-NEXT:    s_endpgm
4113entry:
4114  %ptr = getelementptr i32, ptr %out, i32 %index
4115  %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst
4116  ret void
4117}
4118
; Kernel variant of the signed-max atomicrmw (seq_cst) on %out indexed by the
; runtime i32 %index, with no additional constant offset; the atomicrmw result
; is stored through %out2.
; The checks expect two kernel argument loads, sign-extend and shift-by-2 index
; scaling, a flat_atomic_cmpswap retry loop around v_max_i32, and a
; flat_store_dword of the loop result after exec is restored. Apart from the
; kernel argument load offsets the expected instruction sequence is the same
; for all three run lines.
4119define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) {
4120; GCN1-LABEL: atomic_max_i32_ret_addr64:
4121; GCN1:       ; %bb.0: ; %entry
4122; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
4123; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4124; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4125; GCN1-NEXT:    s_ashr_i32 s5, s7, 31
4126; GCN1-NEXT:    s_mov_b32 s4, s7
4127; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4128; GCN1-NEXT:    s_add_u32 s0, s0, s4
4129; GCN1-NEXT:    s_addc_u32 s1, s1, s5
4130; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4131; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4132; GCN1-NEXT:    flat_load_dword v2, v[0:1]
4133; GCN1-NEXT:    s_mov_b64 s[0:1], 0
4134; GCN1-NEXT:  .LBB91_1: ; %atomicrmw.start
4135; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4136; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4137; GCN1-NEXT:    v_mov_b32_e32 v3, v2
4138; GCN1-NEXT:    v_max_i32_e32 v2, s6, v3
4139; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4140; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4141; GCN1-NEXT:    buffer_wbinvl1_vol
4142; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4143; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4144; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4145; GCN1-NEXT:    s_cbranch_execnz .LBB91_1
4146; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4147; GCN1-NEXT:    s_or_b64 exec, exec, s[0:1]
4148; GCN1-NEXT:    v_mov_b32_e32 v0, s2
4149; GCN1-NEXT:    v_mov_b32_e32 v1, s3
4150; GCN1-NEXT:    flat_store_dword v[0:1], v2
4151; GCN1-NEXT:    s_endpgm
4152;
4153; GCN2-LABEL: atomic_max_i32_ret_addr64:
4154; GCN2:       ; %bb.0: ; %entry
4155; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4156; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4157; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
4158; GCN2-NEXT:    s_ashr_i32 s5, s7, 31
4159; GCN2-NEXT:    s_mov_b32 s4, s7
4160; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4161; GCN2-NEXT:    s_add_u32 s0, s0, s4
4162; GCN2-NEXT:    s_addc_u32 s1, s1, s5
4163; GCN2-NEXT:    v_mov_b32_e32 v0, s0
4164; GCN2-NEXT:    v_mov_b32_e32 v1, s1
4165; GCN2-NEXT:    flat_load_dword v2, v[0:1]
4166; GCN2-NEXT:    s_mov_b64 s[0:1], 0
4167; GCN2-NEXT:  .LBB91_1: ; %atomicrmw.start
4168; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4169; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4170; GCN2-NEXT:    v_mov_b32_e32 v3, v2
4171; GCN2-NEXT:    v_max_i32_e32 v2, s6, v3
4172; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4173; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4174; GCN2-NEXT:    buffer_wbinvl1_vol
4175; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4176; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4177; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4178; GCN2-NEXT:    s_cbranch_execnz .LBB91_1
4179; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4180; GCN2-NEXT:    s_or_b64 exec, exec, s[0:1]
4181; GCN2-NEXT:    v_mov_b32_e32 v0, s2
4182; GCN2-NEXT:    v_mov_b32_e32 v1, s3
4183; GCN2-NEXT:    flat_store_dword v[0:1], v2
4184; GCN2-NEXT:    s_endpgm
4185;
4186; GCN3-LABEL: atomic_max_i32_ret_addr64:
4187; GCN3:       ; %bb.0: ; %entry
4188; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4189; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4190; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
4191; GCN3-NEXT:    s_ashr_i32 s5, s7, 31
4192; GCN3-NEXT:    s_mov_b32 s4, s7
4193; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4194; GCN3-NEXT:    s_add_u32 s0, s0, s4
4195; GCN3-NEXT:    s_addc_u32 s1, s1, s5
4196; GCN3-NEXT:    v_mov_b32_e32 v0, s0
4197; GCN3-NEXT:    v_mov_b32_e32 v1, s1
4198; GCN3-NEXT:    flat_load_dword v2, v[0:1]
4199; GCN3-NEXT:    s_mov_b64 s[0:1], 0
4200; GCN3-NEXT:  .LBB91_1: ; %atomicrmw.start
4201; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4202; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4203; GCN3-NEXT:    v_mov_b32_e32 v3, v2
4204; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
4205; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4206; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4207; GCN3-NEXT:    buffer_wbinvl1_vol
4208; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4209; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4210; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4211; GCN3-NEXT:    s_cbranch_execnz .LBB91_1
4212; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4213; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
4214; GCN3-NEXT:    v_mov_b32_e32 v0, s2
4215; GCN3-NEXT:    v_mov_b32_e32 v1, s3
4216; GCN3-NEXT:    flat_store_dword v[0:1], v2
4217; GCN3-NEXT:    s_endpgm
4218entry:
4219  %ptr = getelementptr i32, ptr %out, i32 %index
4220  %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst
4221  store i32 %tmp0, ptr %out2
4222  ret void
4223}
4224
; Signed-max atomicrmw (seq_cst) at %out plus 4 i32 elements (16 bytes) with
; !amdgpu.no.remote.memory metadata attached (node !0 is defined outside this
; function); the atomicrmw result is unused.
; The checks show that, with this metadata, the operation is still emitted as a
; flat_load_dword followed by a flat_atomic_cmpswap retry loop around
; v_max_i32 on all three run lines. The bonaire and tonga checks add 16 to the
; address with vector adds, while the gfx900 checks use a 16-byte immediate
; offset on the load and the cmpswap.
4225define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
4226; GCN1-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
4227; GCN1:       ; %bb.0:
4228; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4229; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
4230; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4231; GCN1-NEXT:    flat_load_dword v4, v[0:1]
4232; GCN1-NEXT:    s_mov_b64 s[4:5], 0
4233; GCN1-NEXT:  .LBB92_1: ; %atomicrmw.start
4234; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4235; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4236; GCN1-NEXT:    v_max_i32_e32 v3, v4, v2
4237; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4238; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4239; GCN1-NEXT:    buffer_wbinvl1_vol
4240; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4241; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4242; GCN1-NEXT:    v_mov_b32_e32 v4, v3
4243; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4244; GCN1-NEXT:    s_cbranch_execnz .LBB92_1
4245; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4246; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
4247; GCN1-NEXT:    s_setpc_b64 s[30:31]
4248;
4249; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
4250; GCN2:       ; %bb.0:
4251; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4252; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
4253; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4254; GCN2-NEXT:    flat_load_dword v4, v[0:1]
4255; GCN2-NEXT:    s_mov_b64 s[4:5], 0
4256; GCN2-NEXT:  .LBB92_1: ; %atomicrmw.start
4257; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4258; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4259; GCN2-NEXT:    v_max_i32_e32 v3, v4, v2
4260; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4261; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4262; GCN2-NEXT:    buffer_wbinvl1_vol
4263; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4264; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4265; GCN2-NEXT:    v_mov_b32_e32 v4, v3
4266; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4267; GCN2-NEXT:    s_cbranch_execnz .LBB92_1
4268; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4269; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
4270; GCN2-NEXT:    s_setpc_b64 s[30:31]
4271;
4272; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
4273; GCN3:       ; %bb.0:
4274; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4275; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
4276; GCN3-NEXT:    s_mov_b64 s[4:5], 0
4277; GCN3-NEXT:  .LBB92_1: ; %atomicrmw.start
4278; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4279; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4280; GCN3-NEXT:    v_max_i32_e32 v3, v4, v2
4281; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
4282; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4283; GCN3-NEXT:    buffer_wbinvl1_vol
4284; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4285; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4286; GCN3-NEXT:    v_mov_b32_e32 v4, v3
4287; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4288; GCN3-NEXT:    s_cbranch_execnz .LBB92_1
4289; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4290; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
4291; GCN3-NEXT:    s_setpc_b64 s[30:31]
4292  %gep = getelementptr i32, ptr %out, i64 4
4293  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
4294  ret void
4295}
4296
; Signed-max atomicrmw (seq_cst) at %out plus 4 i32 elements (16 bytes) with
; !amdgpu.no.remote.memory metadata attached (node !0 is defined outside this
; function); the atomicrmw result is returned.
; The checks show that, with this metadata, the operation is still emitted as a
; flat_load_dword followed by a flat_atomic_cmpswap retry loop around
; v_max_i32 on all three run lines. The bonaire and tonga checks compute the
; offset address into v[3:4] and leave the result in v0, while the gfx900
; checks use a 16-byte immediate offset and copy the result from v3 to v0
; before returning.
4297define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
4298; GCN1-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
4299; GCN1:       ; %bb.0:
4300; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4301; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
4302; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4303; GCN1-NEXT:    flat_load_dword v0, v[3:4]
4304; GCN1-NEXT:    s_mov_b64 s[4:5], 0
4305; GCN1-NEXT:  .LBB93_1: ; %atomicrmw.start
4306; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4307; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4308; GCN1-NEXT:    v_mov_b32_e32 v1, v0
4309; GCN1-NEXT:    v_max_i32_e32 v0, v1, v2
4310; GCN1-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4311; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4312; GCN1-NEXT:    buffer_wbinvl1_vol
4313; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4314; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4315; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4316; GCN1-NEXT:    s_cbranch_execnz .LBB93_1
4317; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4318; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
4319; GCN1-NEXT:    s_setpc_b64 s[30:31]
4320;
4321; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
4322; GCN2:       ; %bb.0:
4323; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4324; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
4325; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4326; GCN2-NEXT:    flat_load_dword v0, v[3:4]
4327; GCN2-NEXT:    s_mov_b64 s[4:5], 0
4328; GCN2-NEXT:  .LBB93_1: ; %atomicrmw.start
4329; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4330; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4331; GCN2-NEXT:    v_mov_b32_e32 v1, v0
4332; GCN2-NEXT:    v_max_i32_e32 v0, v1, v2
4333; GCN2-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4334; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4335; GCN2-NEXT:    buffer_wbinvl1_vol
4336; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4337; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4338; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4339; GCN2-NEXT:    s_cbranch_execnz .LBB93_1
4340; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4341; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
4342; GCN2-NEXT:    s_setpc_b64 s[30:31]
4343;
4344; GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
4345; GCN3:       ; %bb.0:
4346; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4347; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
4348; GCN3-NEXT:    s_mov_b64 s[4:5], 0
4349; GCN3-NEXT:  .LBB93_1: ; %atomicrmw.start
4350; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4351; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4352; GCN3-NEXT:    v_mov_b32_e32 v4, v3
4353; GCN3-NEXT:    v_max_i32_e32 v3, v4, v2
4354; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
4355; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4356; GCN3-NEXT:    buffer_wbinvl1_vol
4357; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4358; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4359; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4360; GCN3-NEXT:    s_cbranch_execnz .LBB93_1
4361; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4362; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
4363; GCN3-NEXT:    v_mov_b32_e32 v0, v3
4364; GCN3-NEXT:    s_setpc_b64 s[30:31]
4365  %gep = getelementptr i32, ptr %out, i64 4
4366  %result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
4367  ret i32 %result
4368}
4369
4370; ---------------------------------------------------------------------
4371; atomicrmw umax
4372; ---------------------------------------------------------------------
4373
; Unsigned-max atomicrmw (seq_cst) directly on the flat pointer %ptr with no
; offset; the atomicrmw result is unused.
; All three run lines expect the same sequence, namely a flat_load_dword of the
; current value followed by a flat_atomic_cmpswap retry loop that recomputes
; v_max_u32 of the loaded value and %in until the compare-and-swap returns the
; value it expected.
4374define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) {
4375; GCN1-LABEL: flat_atomic_umax_i32_noret:
4376; GCN1:       ; %bb.0:
4377; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4378; GCN1-NEXT:    flat_load_dword v4, v[0:1]
4379; GCN1-NEXT:    s_mov_b64 s[4:5], 0
4380; GCN1-NEXT:  .LBB94_1: ; %atomicrmw.start
4381; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4382; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4383; GCN1-NEXT:    v_max_u32_e32 v3, v4, v2
4384; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4385; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4386; GCN1-NEXT:    buffer_wbinvl1_vol
4387; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4388; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4389; GCN1-NEXT:    v_mov_b32_e32 v4, v3
4390; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4391; GCN1-NEXT:    s_cbranch_execnz .LBB94_1
4392; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4393; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
4394; GCN1-NEXT:    s_setpc_b64 s[30:31]
4395;
4396; GCN2-LABEL: flat_atomic_umax_i32_noret:
4397; GCN2:       ; %bb.0:
4398; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4399; GCN2-NEXT:    flat_load_dword v4, v[0:1]
4400; GCN2-NEXT:    s_mov_b64 s[4:5], 0
4401; GCN2-NEXT:  .LBB94_1: ; %atomicrmw.start
4402; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4403; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4404; GCN2-NEXT:    v_max_u32_e32 v3, v4, v2
4405; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4406; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4407; GCN2-NEXT:    buffer_wbinvl1_vol
4408; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4409; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4410; GCN2-NEXT:    v_mov_b32_e32 v4, v3
4411; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4412; GCN2-NEXT:    s_cbranch_execnz .LBB94_1
4413; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4414; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
4415; GCN2-NEXT:    s_setpc_b64 s[30:31]
4416;
4417; GCN3-LABEL: flat_atomic_umax_i32_noret:
4418; GCN3:       ; %bb.0:
4419; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4420; GCN3-NEXT:    flat_load_dword v4, v[0:1]
4421; GCN3-NEXT:    s_mov_b64 s[4:5], 0
4422; GCN3-NEXT:  .LBB94_1: ; %atomicrmw.start
4423; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4424; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4425; GCN3-NEXT:    v_max_u32_e32 v3, v4, v2
4426; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4427; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4428; GCN3-NEXT:    buffer_wbinvl1_vol
4429; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4430; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4431; GCN3-NEXT:    v_mov_b32_e32 v4, v3
4432; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4433; GCN3-NEXT:    s_cbranch_execnz .LBB94_1
4434; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4435; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
4436; GCN3-NEXT:    s_setpc_b64 s[30:31]
4437  %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst
4438  ret void
4439}
4440
; umax, result unused, VGPR pointer plus a constant offset.
; The GEP in the body steps 4 x i32 = 16 bytes past %out. The checks expect the
; atomicrmw to become a compare-and-swap loop (flat_load_dword, v_max_u32,
; flat_atomic_cmpswap, repeated until the returned value equals the expected
; one). bonaire and tonga add the 16 to the address before the loop; gfx900
; instead encodes it as offset:16 on the flat instructions.
4441define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) {
4442; GCN1-LABEL: flat_atomic_umax_i32_noret_offset:
4443; GCN1:       ; %bb.0:
4444; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4445; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
4446; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4447; GCN1-NEXT:    flat_load_dword v4, v[0:1]
4448; GCN1-NEXT:    s_mov_b64 s[4:5], 0
4449; GCN1-NEXT:  .LBB95_1: ; %atomicrmw.start
4450; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4451; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4452; GCN1-NEXT:    v_max_u32_e32 v3, v4, v2
4453; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4454; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4455; GCN1-NEXT:    buffer_wbinvl1_vol
4456; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4457; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4458; GCN1-NEXT:    v_mov_b32_e32 v4, v3
4459; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4460; GCN1-NEXT:    s_cbranch_execnz .LBB95_1
4461; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4462; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
4463; GCN1-NEXT:    s_setpc_b64 s[30:31]
4464;
4465; GCN2-LABEL: flat_atomic_umax_i32_noret_offset:
4466; GCN2:       ; %bb.0:
4467; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4468; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
4469; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4470; GCN2-NEXT:    flat_load_dword v4, v[0:1]
4471; GCN2-NEXT:    s_mov_b64 s[4:5], 0
4472; GCN2-NEXT:  .LBB95_1: ; %atomicrmw.start
4473; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4474; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4475; GCN2-NEXT:    v_max_u32_e32 v3, v4, v2
4476; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4477; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4478; GCN2-NEXT:    buffer_wbinvl1_vol
4479; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4480; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4481; GCN2-NEXT:    v_mov_b32_e32 v4, v3
4482; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4483; GCN2-NEXT:    s_cbranch_execnz .LBB95_1
4484; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4485; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
4486; GCN2-NEXT:    s_setpc_b64 s[30:31]
4487;
4488; GCN3-LABEL: flat_atomic_umax_i32_noret_offset:
4489; GCN3:       ; %bb.0:
4490; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4491; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
4492; GCN3-NEXT:    s_mov_b64 s[4:5], 0
4493; GCN3-NEXT:  .LBB95_1: ; %atomicrmw.start
4494; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4495; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4496; GCN3-NEXT:    v_max_u32_e32 v3, v4, v2
4497; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
4498; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4499; GCN3-NEXT:    buffer_wbinvl1_vol
4500; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4501; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4502; GCN3-NEXT:    v_mov_b32_e32 v4, v3
4503; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4504; GCN3-NEXT:    s_cbranch_execnz .LBB95_1
4505; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4506; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
4507; GCN3-NEXT:    s_setpc_b64 s[30:31]
4508  %gep = getelementptr i32, ptr %out, i32 4
4509  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
4510  ret void
4511}
4512
; umax with the old value returned, VGPR pointer, no offset.
; All three targets are expected to emit the same compare-and-swap loop. The
; value produced by the last flat_atomic_cmpswap (v3) is copied to v0 once the
; loop has finished.
4513define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) {
4514; GCN1-LABEL: flat_atomic_umax_i32_ret:
4515; GCN1:       ; %bb.0:
4516; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4517; GCN1-NEXT:    flat_load_dword v3, v[0:1]
4518; GCN1-NEXT:    s_mov_b64 s[4:5], 0
4519; GCN1-NEXT:  .LBB96_1: ; %atomicrmw.start
4520; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4521; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4522; GCN1-NEXT:    v_mov_b32_e32 v4, v3
4523; GCN1-NEXT:    v_max_u32_e32 v3, v4, v2
4524; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4525; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4526; GCN1-NEXT:    buffer_wbinvl1_vol
4527; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4528; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4529; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4530; GCN1-NEXT:    s_cbranch_execnz .LBB96_1
4531; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4532; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
4533; GCN1-NEXT:    v_mov_b32_e32 v0, v3
4534; GCN1-NEXT:    s_setpc_b64 s[30:31]
4535;
4536; GCN2-LABEL: flat_atomic_umax_i32_ret:
4537; GCN2:       ; %bb.0:
4538; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4539; GCN2-NEXT:    flat_load_dword v3, v[0:1]
4540; GCN2-NEXT:    s_mov_b64 s[4:5], 0
4541; GCN2-NEXT:  .LBB96_1: ; %atomicrmw.start
4542; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4543; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4544; GCN2-NEXT:    v_mov_b32_e32 v4, v3
4545; GCN2-NEXT:    v_max_u32_e32 v3, v4, v2
4546; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4547; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4548; GCN2-NEXT:    buffer_wbinvl1_vol
4549; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4550; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4551; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4552; GCN2-NEXT:    s_cbranch_execnz .LBB96_1
4553; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4554; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
4555; GCN2-NEXT:    v_mov_b32_e32 v0, v3
4556; GCN2-NEXT:    s_setpc_b64 s[30:31]
4557;
4558; GCN3-LABEL: flat_atomic_umax_i32_ret:
4559; GCN3:       ; %bb.0:
4560; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4561; GCN3-NEXT:    flat_load_dword v3, v[0:1]
4562; GCN3-NEXT:    s_mov_b64 s[4:5], 0
4563; GCN3-NEXT:  .LBB96_1: ; %atomicrmw.start
4564; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4565; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4566; GCN3-NEXT:    v_mov_b32_e32 v4, v3
4567; GCN3-NEXT:    v_max_u32_e32 v3, v4, v2
4568; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4569; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4570; GCN3-NEXT:    buffer_wbinvl1_vol
4571; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4572; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4573; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4574; GCN3-NEXT:    s_cbranch_execnz .LBB96_1
4575; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4576; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
4577; GCN3-NEXT:    v_mov_b32_e32 v0, v3
4578; GCN3-NEXT:    s_setpc_b64 s[30:31]
4579  %result = atomicrmw umax ptr %ptr, i32 %in seq_cst
4580  ret i32 %result
4581}
4582
; umax with the old value returned, VGPR pointer plus a 16-byte offset
; (GEP of 4 x i32). On bonaire and tonga the checks expect the offset address
; to be built in v[3:4] and the loop result to land in v0 directly. On gfx900
; the address stays in v[0:1] with offset:16, and the result is moved from v3
; to v0 after the loop.
4583define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) {
4584; GCN1-LABEL: flat_atomic_umax_i32_ret_offset:
4585; GCN1:       ; %bb.0:
4586; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4587; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
4588; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4589; GCN1-NEXT:    flat_load_dword v0, v[3:4]
4590; GCN1-NEXT:    s_mov_b64 s[4:5], 0
4591; GCN1-NEXT:  .LBB97_1: ; %atomicrmw.start
4592; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4593; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4594; GCN1-NEXT:    v_mov_b32_e32 v1, v0
4595; GCN1-NEXT:    v_max_u32_e32 v0, v1, v2
4596; GCN1-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4597; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4598; GCN1-NEXT:    buffer_wbinvl1_vol
4599; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4600; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4601; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4602; GCN1-NEXT:    s_cbranch_execnz .LBB97_1
4603; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4604; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
4605; GCN1-NEXT:    s_setpc_b64 s[30:31]
4606;
4607; GCN2-LABEL: flat_atomic_umax_i32_ret_offset:
4608; GCN2:       ; %bb.0:
4609; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4610; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
4611; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4612; GCN2-NEXT:    flat_load_dword v0, v[3:4]
4613; GCN2-NEXT:    s_mov_b64 s[4:5], 0
4614; GCN2-NEXT:  .LBB97_1: ; %atomicrmw.start
4615; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4616; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4617; GCN2-NEXT:    v_mov_b32_e32 v1, v0
4618; GCN2-NEXT:    v_max_u32_e32 v0, v1, v2
4619; GCN2-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4620; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4621; GCN2-NEXT:    buffer_wbinvl1_vol
4622; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4623; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4624; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4625; GCN2-NEXT:    s_cbranch_execnz .LBB97_1
4626; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4627; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
4628; GCN2-NEXT:    s_setpc_b64 s[30:31]
4629;
4630; GCN3-LABEL: flat_atomic_umax_i32_ret_offset:
4631; GCN3:       ; %bb.0:
4632; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4633; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
4634; GCN3-NEXT:    s_mov_b64 s[4:5], 0
4635; GCN3-NEXT:  .LBB97_1: ; %atomicrmw.start
4636; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4637; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4638; GCN3-NEXT:    v_mov_b32_e32 v4, v3
4639; GCN3-NEXT:    v_max_u32_e32 v3, v4, v2
4640; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
4641; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4642; GCN3-NEXT:    buffer_wbinvl1_vol
4643; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4644; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4645; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4646; GCN3-NEXT:    s_cbranch_execnz .LBB97_1
4647; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4648; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
4649; GCN3-NEXT:    v_mov_b32_e32 v0, v3
4650; GCN3-NEXT:    s_setpc_b64 s[30:31]
4651  %gep = getelementptr i32, ptr %out, i32 4
4652  %result = atomicrmw umax ptr %gep, i32 %in seq_cst
4653  ret i32 %result
4654}
4655
; umax, result unused, with both arguments marked inreg under the amdgpu_gfx
; calling convention. The checks show the pointer being copied from s[4:5]
; into v[0:1] and %in used straight from s6 as a v_max_u32 operand. The
; loop-exit mask is accumulated in s[34:35]. All three targets expect the same
; code.
4656define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
4657; GCN1-LABEL: flat_atomic_umax_i32_noret_scalar:
4658; GCN1:       ; %bb.0:
4659; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4660; GCN1-NEXT:    v_mov_b32_e32 v0, s4
4661; GCN1-NEXT:    v_mov_b32_e32 v1, s5
4662; GCN1-NEXT:    flat_load_dword v3, v[0:1]
4663; GCN1-NEXT:    s_mov_b64 s[34:35], 0
4664; GCN1-NEXT:  .LBB98_1: ; %atomicrmw.start
4665; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4666; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4667; GCN1-NEXT:    v_max_u32_e32 v2, s6, v3
4668; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4669; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4670; GCN1-NEXT:    buffer_wbinvl1_vol
4671; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4672; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4673; GCN1-NEXT:    v_mov_b32_e32 v3, v2
4674; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4675; GCN1-NEXT:    s_cbranch_execnz .LBB98_1
4676; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4677; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
4678; GCN1-NEXT:    s_setpc_b64 s[30:31]
4679;
4680; GCN2-LABEL: flat_atomic_umax_i32_noret_scalar:
4681; GCN2:       ; %bb.0:
4682; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4683; GCN2-NEXT:    v_mov_b32_e32 v0, s4
4684; GCN2-NEXT:    v_mov_b32_e32 v1, s5
4685; GCN2-NEXT:    flat_load_dword v3, v[0:1]
4686; GCN2-NEXT:    s_mov_b64 s[34:35], 0
4687; GCN2-NEXT:  .LBB98_1: ; %atomicrmw.start
4688; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4689; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4690; GCN2-NEXT:    v_max_u32_e32 v2, s6, v3
4691; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4692; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4693; GCN2-NEXT:    buffer_wbinvl1_vol
4694; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4695; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4696; GCN2-NEXT:    v_mov_b32_e32 v3, v2
4697; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4698; GCN2-NEXT:    s_cbranch_execnz .LBB98_1
4699; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4700; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
4701; GCN2-NEXT:    s_setpc_b64 s[30:31]
4702;
4703; GCN3-LABEL: flat_atomic_umax_i32_noret_scalar:
4704; GCN3:       ; %bb.0:
4705; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4706; GCN3-NEXT:    v_mov_b32_e32 v0, s4
4707; GCN3-NEXT:    v_mov_b32_e32 v1, s5
4708; GCN3-NEXT:    flat_load_dword v3, v[0:1]
4709; GCN3-NEXT:    s_mov_b64 s[34:35], 0
4710; GCN3-NEXT:  .LBB98_1: ; %atomicrmw.start
4711; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4712; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4713; GCN3-NEXT:    v_max_u32_e32 v2, s6, v3
4714; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4715; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4716; GCN3-NEXT:    buffer_wbinvl1_vol
4717; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4718; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4719; GCN3-NEXT:    v_mov_b32_e32 v3, v2
4720; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4721; GCN3-NEXT:    s_cbranch_execnz .LBB98_1
4722; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4723; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
4724; GCN3-NEXT:    s_setpc_b64 s[30:31]
4725  %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst
4726  ret void
4727}
4728
; umax, result unused, inreg pointer plus a 16-byte offset (GEP of 4 x i32).
; bonaire and tonga are expected to perform the 64-bit add with scalar
; instructions (s_add_u32 / s_addc_u32 into s[34:35]) before copying the
; address to v[0:1]; s[34:35] is then reused as the loop-exit mask. gfx900
; copies s[4:5] unchanged and uses offset:16 on the flat instructions.
4729define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
4730; GCN1-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
4731; GCN1:       ; %bb.0:
4732; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4733; GCN1-NEXT:    s_add_u32 s34, s4, 16
4734; GCN1-NEXT:    s_addc_u32 s35, s5, 0
4735; GCN1-NEXT:    v_mov_b32_e32 v0, s34
4736; GCN1-NEXT:    v_mov_b32_e32 v1, s35
4737; GCN1-NEXT:    flat_load_dword v3, v[0:1]
4738; GCN1-NEXT:    s_mov_b64 s[34:35], 0
4739; GCN1-NEXT:  .LBB99_1: ; %atomicrmw.start
4740; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4741; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4742; GCN1-NEXT:    v_max_u32_e32 v2, s6, v3
4743; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4744; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4745; GCN1-NEXT:    buffer_wbinvl1_vol
4746; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4747; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4748; GCN1-NEXT:    v_mov_b32_e32 v3, v2
4749; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4750; GCN1-NEXT:    s_cbranch_execnz .LBB99_1
4751; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4752; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
4753; GCN1-NEXT:    s_setpc_b64 s[30:31]
4754;
4755; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
4756; GCN2:       ; %bb.0:
4757; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4758; GCN2-NEXT:    s_add_u32 s34, s4, 16
4759; GCN2-NEXT:    s_addc_u32 s35, s5, 0
4760; GCN2-NEXT:    v_mov_b32_e32 v0, s34
4761; GCN2-NEXT:    v_mov_b32_e32 v1, s35
4762; GCN2-NEXT:    flat_load_dword v3, v[0:1]
4763; GCN2-NEXT:    s_mov_b64 s[34:35], 0
4764; GCN2-NEXT:  .LBB99_1: ; %atomicrmw.start
4765; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4766; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4767; GCN2-NEXT:    v_max_u32_e32 v2, s6, v3
4768; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4769; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4770; GCN2-NEXT:    buffer_wbinvl1_vol
4771; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4772; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4773; GCN2-NEXT:    v_mov_b32_e32 v3, v2
4774; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4775; GCN2-NEXT:    s_cbranch_execnz .LBB99_1
4776; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4777; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
4778; GCN2-NEXT:    s_setpc_b64 s[30:31]
4779;
4780; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
4781; GCN3:       ; %bb.0:
4782; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4783; GCN3-NEXT:    v_mov_b32_e32 v0, s4
4784; GCN3-NEXT:    v_mov_b32_e32 v1, s5
4785; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
4786; GCN3-NEXT:    s_mov_b64 s[34:35], 0
4787; GCN3-NEXT:  .LBB99_1: ; %atomicrmw.start
4788; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4789; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4790; GCN3-NEXT:    v_max_u32_e32 v2, s6, v3
4791; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4792; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4793; GCN3-NEXT:    buffer_wbinvl1_vol
4794; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4795; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4796; GCN3-NEXT:    v_mov_b32_e32 v3, v2
4797; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4798; GCN3-NEXT:    s_cbranch_execnz .LBB99_1
4799; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4800; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
4801; GCN3-NEXT:    s_setpc_b64 s[30:31]
4802  %gep = getelementptr i32, ptr %out, i32 4
4803  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
4804  ret void
4805}
4806
; umax with the old value returned, inreg arguments, no offset.
; The checks expect the pointer to be copied out of s[4:5] twice: once into
; v[0:1] for the initial load (whose result overwrites v0) and once into
; v[1:2] for the loop. The cmpswap result is written to v0, so no copy follows
; the loop. All three targets expect the same code.
4807define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
4808; GCN1-LABEL: flat_atomic_umax_i32_ret_scalar:
4809; GCN1:       ; %bb.0:
4810; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4811; GCN1-NEXT:    v_mov_b32_e32 v0, s4
4812; GCN1-NEXT:    v_mov_b32_e32 v1, s5
4813; GCN1-NEXT:    flat_load_dword v0, v[0:1]
4814; GCN1-NEXT:    v_mov_b32_e32 v1, s4
4815; GCN1-NEXT:    s_mov_b64 s[34:35], 0
4816; GCN1-NEXT:    v_mov_b32_e32 v2, s5
4817; GCN1-NEXT:  .LBB100_1: ; %atomicrmw.start
4818; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4819; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4820; GCN1-NEXT:    v_mov_b32_e32 v4, v0
4821; GCN1-NEXT:    v_max_u32_e32 v3, s6, v4
4822; GCN1-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4823; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4824; GCN1-NEXT:    buffer_wbinvl1_vol
4825; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
4826; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4827; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4828; GCN1-NEXT:    s_cbranch_execnz .LBB100_1
4829; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4830; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
4831; GCN1-NEXT:    s_setpc_b64 s[30:31]
4832;
4833; GCN2-LABEL: flat_atomic_umax_i32_ret_scalar:
4834; GCN2:       ; %bb.0:
4835; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4836; GCN2-NEXT:    v_mov_b32_e32 v0, s4
4837; GCN2-NEXT:    v_mov_b32_e32 v1, s5
4838; GCN2-NEXT:    flat_load_dword v0, v[0:1]
4839; GCN2-NEXT:    v_mov_b32_e32 v1, s4
4840; GCN2-NEXT:    s_mov_b64 s[34:35], 0
4841; GCN2-NEXT:    v_mov_b32_e32 v2, s5
4842; GCN2-NEXT:  .LBB100_1: ; %atomicrmw.start
4843; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4844; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4845; GCN2-NEXT:    v_mov_b32_e32 v4, v0
4846; GCN2-NEXT:    v_max_u32_e32 v3, s6, v4
4847; GCN2-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4848; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4849; GCN2-NEXT:    buffer_wbinvl1_vol
4850; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
4851; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4852; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4853; GCN2-NEXT:    s_cbranch_execnz .LBB100_1
4854; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4855; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
4856; GCN2-NEXT:    s_setpc_b64 s[30:31]
4857;
4858; GCN3-LABEL: flat_atomic_umax_i32_ret_scalar:
4859; GCN3:       ; %bb.0:
4860; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4861; GCN3-NEXT:    v_mov_b32_e32 v0, s4
4862; GCN3-NEXT:    v_mov_b32_e32 v1, s5
4863; GCN3-NEXT:    flat_load_dword v0, v[0:1]
4864; GCN3-NEXT:    v_mov_b32_e32 v1, s4
4865; GCN3-NEXT:    s_mov_b64 s[34:35], 0
4866; GCN3-NEXT:    v_mov_b32_e32 v2, s5
4867; GCN3-NEXT:  .LBB100_1: ; %atomicrmw.start
4868; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4869; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4870; GCN3-NEXT:    v_mov_b32_e32 v4, v0
4871; GCN3-NEXT:    v_max_u32_e32 v3, s6, v4
4872; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4873; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4874; GCN3-NEXT:    buffer_wbinvl1_vol
4875; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
4876; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4877; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4878; GCN3-NEXT:    s_cbranch_execnz .LBB100_1
4879; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4880; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
4881; GCN3-NEXT:    s_setpc_b64 s[30:31]
4882  %result = atomicrmw umax ptr %ptr, i32 %in seq_cst
4883  ret i32 %result
4884}
4885
; umax with the old value returned, inreg pointer plus a 16-byte offset
; (GEP of 4 x i32). bonaire and tonga are expected to compute the address with
; s_add_u32 / s_addc_u32 and hold it in v[1:2] for both the load and the loop.
; gfx900 uses offset:16 and copies s[4:5] into VGPRs twice (v[0:1] for the
; initial load, v[1:2] for the loop). In every variant the cmpswap result is
; written to v0.
4886define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
4887; GCN1-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
4888; GCN1:       ; %bb.0:
4889; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4890; GCN1-NEXT:    s_add_u32 s34, s4, 16
4891; GCN1-NEXT:    s_addc_u32 s35, s5, 0
4892; GCN1-NEXT:    v_mov_b32_e32 v1, s34
4893; GCN1-NEXT:    v_mov_b32_e32 v2, s35
4894; GCN1-NEXT:    flat_load_dword v0, v[1:2]
4895; GCN1-NEXT:    s_mov_b64 s[34:35], 0
4896; GCN1-NEXT:  .LBB101_1: ; %atomicrmw.start
4897; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4898; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4899; GCN1-NEXT:    v_mov_b32_e32 v4, v0
4900; GCN1-NEXT:    v_max_u32_e32 v3, s6, v4
4901; GCN1-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4902; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4903; GCN1-NEXT:    buffer_wbinvl1_vol
4904; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
4905; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4906; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4907; GCN1-NEXT:    s_cbranch_execnz .LBB101_1
4908; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4909; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
4910; GCN1-NEXT:    s_setpc_b64 s[30:31]
4911;
4912; GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
4913; GCN2:       ; %bb.0:
4914; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4915; GCN2-NEXT:    s_add_u32 s34, s4, 16
4916; GCN2-NEXT:    s_addc_u32 s35, s5, 0
4917; GCN2-NEXT:    v_mov_b32_e32 v1, s34
4918; GCN2-NEXT:    v_mov_b32_e32 v2, s35
4919; GCN2-NEXT:    flat_load_dword v0, v[1:2]
4920; GCN2-NEXT:    s_mov_b64 s[34:35], 0
4921; GCN2-NEXT:  .LBB101_1: ; %atomicrmw.start
4922; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
4923; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4924; GCN2-NEXT:    v_mov_b32_e32 v4, v0
4925; GCN2-NEXT:    v_max_u32_e32 v3, s6, v4
4926; GCN2-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4927; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4928; GCN2-NEXT:    buffer_wbinvl1_vol
4929; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
4930; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4931; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4932; GCN2-NEXT:    s_cbranch_execnz .LBB101_1
4933; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
4934; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
4935; GCN2-NEXT:    s_setpc_b64 s[30:31]
4936;
4937; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
4938; GCN3:       ; %bb.0:
4939; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4940; GCN3-NEXT:    v_mov_b32_e32 v0, s4
4941; GCN3-NEXT:    v_mov_b32_e32 v1, s5
4942; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
4943; GCN3-NEXT:    v_mov_b32_e32 v1, s4
4944; GCN3-NEXT:    s_mov_b64 s[34:35], 0
4945; GCN3-NEXT:    v_mov_b32_e32 v2, s5
4946; GCN3-NEXT:  .LBB101_1: ; %atomicrmw.start
4947; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
4948; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4949; GCN3-NEXT:    v_mov_b32_e32 v4, v0
4950; GCN3-NEXT:    v_max_u32_e32 v3, s6, v4
4951; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
4952; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4953; GCN3-NEXT:    buffer_wbinvl1_vol
4954; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
4955; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4956; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4957; GCN3-NEXT:    s_cbranch_execnz .LBB101_1
4958; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
4959; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
4960; GCN3-NEXT:    s_setpc_b64 s[30:31]
4961  %gep = getelementptr i32, ptr %out, i32 4
4962  %result = atomicrmw umax ptr %gep, i32 %in seq_cst
4963  ret i32 %result
4964}
4965
; Kernel form: umax, result unused, on the i32 at %out[%index + 4].
; The checks expect the arguments to be fetched with a single s_load_dwordx4
; from s[4:5] (immediate 0x9 on bonaire, 0x24 on tonga and gfx900). %index
; (s3) is sign-extended with s_ashr_i32 ..., 31, scaled by 4 with
; s_lshl_b64 ..., 2 and added to %out with scalar adds. The constant 16 is a
; further scalar add on bonaire and tonga, and offset:16 on gfx900. After the
; compare-and-swap loop the kernel goes straight to s_endpgm.
4966define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 %index) {
4967; GCN1-LABEL: atomic_umax_i32_addr64_offset:
4968; GCN1:       ; %bb.0: ; %entry
4969; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4970; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
4971; GCN1-NEXT:    s_ashr_i32 s5, s3, 31
4972; GCN1-NEXT:    s_mov_b32 s4, s3
4973; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4974; GCN1-NEXT:    s_add_u32 s0, s0, s4
4975; GCN1-NEXT:    s_addc_u32 s1, s1, s5
4976; GCN1-NEXT:    s_add_u32 s0, s0, 16
4977; GCN1-NEXT:    s_addc_u32 s1, s1, 0
4978; GCN1-NEXT:    v_mov_b32_e32 v0, s0
4979; GCN1-NEXT:    v_mov_b32_e32 v1, s1
4980; GCN1-NEXT:    flat_load_dword v3, v[0:1]
4981; GCN1-NEXT:    s_mov_b64 s[0:1], 0
4982; GCN1-NEXT:  .LBB102_1: ; %atomicrmw.start
4983; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
4984; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4985; GCN1-NEXT:    v_max_u32_e32 v2, s2, v3
4986; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4987; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4988; GCN1-NEXT:    buffer_wbinvl1_vol
4989; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4990; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4991; GCN1-NEXT:    v_mov_b32_e32 v3, v2
4992; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4993; GCN1-NEXT:    s_cbranch_execnz .LBB102_1
4994; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
4995; GCN1-NEXT:    s_endpgm
4996;
4997; GCN2-LABEL: atomic_umax_i32_addr64_offset:
4998; GCN2:       ; %bb.0: ; %entry
4999; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5000; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5001; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
5002; GCN2-NEXT:    s_mov_b32 s4, s3
5003; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5004; GCN2-NEXT:    s_add_u32 s0, s0, s4
5005; GCN2-NEXT:    s_addc_u32 s1, s1, s5
5006; GCN2-NEXT:    s_add_u32 s0, s0, 16
5007; GCN2-NEXT:    s_addc_u32 s1, s1, 0
5008; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5009; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5010; GCN2-NEXT:    flat_load_dword v3, v[0:1]
5011; GCN2-NEXT:    s_mov_b64 s[0:1], 0
5012; GCN2-NEXT:  .LBB102_1: ; %atomicrmw.start
5013; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5014; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5015; GCN2-NEXT:    v_max_u32_e32 v2, s2, v3
5016; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5017; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5018; GCN2-NEXT:    buffer_wbinvl1_vol
5019; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5020; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5021; GCN2-NEXT:    v_mov_b32_e32 v3, v2
5022; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5023; GCN2-NEXT:    s_cbranch_execnz .LBB102_1
5024; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5025; GCN2-NEXT:    s_endpgm
5026;
5027; GCN3-LABEL: atomic_umax_i32_addr64_offset:
5028; GCN3:       ; %bb.0: ; %entry
5029; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5030; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5031; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
5032; GCN3-NEXT:    s_mov_b32 s4, s3
5033; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5034; GCN3-NEXT:    s_add_u32 s0, s0, s4
5035; GCN3-NEXT:    s_addc_u32 s1, s1, s5
5036; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5037; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5038; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
5039; GCN3-NEXT:    s_mov_b64 s[0:1], 0
5040; GCN3-NEXT:  .LBB102_1: ; %atomicrmw.start
5041; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5042; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5043; GCN3-NEXT:    v_max_u32_e32 v2, s2, v3
5044; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5045; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5046; GCN3-NEXT:    buffer_wbinvl1_vol
5047; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5048; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5049; GCN3-NEXT:    v_mov_b32_e32 v3, v2
5050; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5051; GCN3-NEXT:    s_cbranch_execnz .LBB102_1
5052; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5053; GCN3-NEXT:    s_endpgm
5054entry:
5055  %ptr = getelementptr i32, ptr %out, i32 %index
5056  %gep = getelementptr i32, ptr %ptr, i32 4
5057  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
5058  ret void
5059}
5060
; Kernel form with the old value stored to %out2: umax on the i32 at
; %out[%index + 4].
; The checks expect %in and %index to be loaded into s[6:7] with
; s_load_dwordx2 and the two pointers into s[0:3] with s_load_dwordx4. The
; address is formed as sign-extended %index (s7) * 4 + %out, plus 16 either as
; a scalar add (bonaire, tonga) or as offset:16 (gfx900). After the
; compare-and-swap loop the accumulated mask in s[0:1] is ORed back into exec
; and the value left in v2 is written through s[2:3] with flat_store_dword.
5061define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) {
5062; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
5063; GCN1:       ; %bb.0: ; %entry
5064; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
5065; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5066; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5067; GCN1-NEXT:    s_ashr_i32 s5, s7, 31
5068; GCN1-NEXT:    s_mov_b32 s4, s7
5069; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5070; GCN1-NEXT:    s_add_u32 s0, s0, s4
5071; GCN1-NEXT:    s_addc_u32 s1, s1, s5
5072; GCN1-NEXT:    s_add_u32 s0, s0, 16
5073; GCN1-NEXT:    s_addc_u32 s1, s1, 0
5074; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5075; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5076; GCN1-NEXT:    flat_load_dword v2, v[0:1]
5077; GCN1-NEXT:    s_mov_b64 s[0:1], 0
5078; GCN1-NEXT:  .LBB103_1: ; %atomicrmw.start
5079; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5080; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5081; GCN1-NEXT:    v_mov_b32_e32 v3, v2
5082; GCN1-NEXT:    v_max_u32_e32 v2, s6, v3
5083; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5084; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5085; GCN1-NEXT:    buffer_wbinvl1_vol
5086; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5087; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5088; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5089; GCN1-NEXT:    s_cbranch_execnz .LBB103_1
5090; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5091; GCN1-NEXT:    s_or_b64 exec, exec, s[0:1]
5092; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5093; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5094; GCN1-NEXT:    flat_store_dword v[0:1], v2
5095; GCN1-NEXT:    s_endpgm
5096;
5097; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
5098; GCN2:       ; %bb.0: ; %entry
5099; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5100; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5101; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5102; GCN2-NEXT:    s_ashr_i32 s5, s7, 31
5103; GCN2-NEXT:    s_mov_b32 s4, s7
5104; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5105; GCN2-NEXT:    s_add_u32 s0, s0, s4
5106; GCN2-NEXT:    s_addc_u32 s1, s1, s5
5107; GCN2-NEXT:    s_add_u32 s0, s0, 16
5108; GCN2-NEXT:    s_addc_u32 s1, s1, 0
5109; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5110; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5111; GCN2-NEXT:    flat_load_dword v2, v[0:1]
5112; GCN2-NEXT:    s_mov_b64 s[0:1], 0
5113; GCN2-NEXT:  .LBB103_1: ; %atomicrmw.start
5114; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5115; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5116; GCN2-NEXT:    v_mov_b32_e32 v3, v2
5117; GCN2-NEXT:    v_max_u32_e32 v2, s6, v3
5118; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5119; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5120; GCN2-NEXT:    buffer_wbinvl1_vol
5121; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5122; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5123; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5124; GCN2-NEXT:    s_cbranch_execnz .LBB103_1
5125; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5126; GCN2-NEXT:    s_or_b64 exec, exec, s[0:1]
5127; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5128; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5129; GCN2-NEXT:    flat_store_dword v[0:1], v2
5130; GCN2-NEXT:    s_endpgm
5131;
5132; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
5133; GCN3:       ; %bb.0: ; %entry
5134; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5135; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5136; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5137; GCN3-NEXT:    s_ashr_i32 s5, s7, 31
5138; GCN3-NEXT:    s_mov_b32 s4, s7
5139; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5140; GCN3-NEXT:    s_add_u32 s0, s0, s4
5141; GCN3-NEXT:    s_addc_u32 s1, s1, s5
5142; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5143; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5144; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16
5145; GCN3-NEXT:    s_mov_b64 s[0:1], 0
5146; GCN3-NEXT:  .LBB103_1: ; %atomicrmw.start
5147; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5148; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5149; GCN3-NEXT:    v_mov_b32_e32 v3, v2
5150; GCN3-NEXT:    v_max_u32_e32 v2, s6, v3
5151; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5152; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5153; GCN3-NEXT:    buffer_wbinvl1_vol
5154; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5155; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5156; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5157; GCN3-NEXT:    s_cbranch_execnz .LBB103_1
5158; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5159; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
5160; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5161; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5162; GCN3-NEXT:    flat_store_dword v[0:1], v2
5163; GCN3-NEXT:    s_endpgm
5164entry:
5165  %ptr = getelementptr i32, ptr %out, i32 %index
5166  %gep = getelementptr i32, ptr %ptr, i32 4
5167  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
5168  store i32 %tmp0, ptr %out2
5169  ret void
5170}
5171
; Kernel test: seq_cst "atomicrmw umax" on the i32 element at %out[%index]
; (no constant offset); the value the atomicrmw returns is stored to %out2.
; The checks for all three targets expect a flat_load_dword followed by a
; v_max_u32 / flat_atomic_cmpswap loop that repeats until the value returned
; by the cmpswap equals the value it was compared against.
5172define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) {
5173; GCN1-LABEL: atomic_umax_i32_ret_addr64:
5174; GCN1:       ; %bb.0: ; %entry
5175; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
5176; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5177; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
5178; GCN1-NEXT:    s_ashr_i32 s5, s7, 31
5179; GCN1-NEXT:    s_mov_b32 s4, s7
5180; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5181; GCN1-NEXT:    s_add_u32 s0, s0, s4
5182; GCN1-NEXT:    s_addc_u32 s1, s1, s5
5183; GCN1-NEXT:    v_mov_b32_e32 v0, s0
5184; GCN1-NEXT:    v_mov_b32_e32 v1, s1
5185; GCN1-NEXT:    flat_load_dword v2, v[0:1]
5186; GCN1-NEXT:    s_mov_b64 s[0:1], 0
5187; GCN1-NEXT:  .LBB104_1: ; %atomicrmw.start
5188; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5189; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5190; GCN1-NEXT:    v_mov_b32_e32 v3, v2
5191; GCN1-NEXT:    v_max_u32_e32 v2, s6, v3
5192; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5193; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5194; GCN1-NEXT:    buffer_wbinvl1_vol
5195; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5196; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5197; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5198; GCN1-NEXT:    s_cbranch_execnz .LBB104_1
5199; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5200; GCN1-NEXT:    s_or_b64 exec, exec, s[0:1]
5201; GCN1-NEXT:    v_mov_b32_e32 v0, s2
5202; GCN1-NEXT:    v_mov_b32_e32 v1, s3
5203; GCN1-NEXT:    flat_store_dword v[0:1], v2
5204; GCN1-NEXT:    s_endpgm
5205;
5206; GCN2-LABEL: atomic_umax_i32_ret_addr64:
5207; GCN2:       ; %bb.0: ; %entry
5208; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5209; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5210; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
5211; GCN2-NEXT:    s_ashr_i32 s5, s7, 31
5212; GCN2-NEXT:    s_mov_b32 s4, s7
5213; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5214; GCN2-NEXT:    s_add_u32 s0, s0, s4
5215; GCN2-NEXT:    s_addc_u32 s1, s1, s5
5216; GCN2-NEXT:    v_mov_b32_e32 v0, s0
5217; GCN2-NEXT:    v_mov_b32_e32 v1, s1
5218; GCN2-NEXT:    flat_load_dword v2, v[0:1]
5219; GCN2-NEXT:    s_mov_b64 s[0:1], 0
5220; GCN2-NEXT:  .LBB104_1: ; %atomicrmw.start
5221; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5222; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5223; GCN2-NEXT:    v_mov_b32_e32 v3, v2
5224; GCN2-NEXT:    v_max_u32_e32 v2, s6, v3
5225; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5226; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5227; GCN2-NEXT:    buffer_wbinvl1_vol
5228; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5229; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5230; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5231; GCN2-NEXT:    s_cbranch_execnz .LBB104_1
5232; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5233; GCN2-NEXT:    s_or_b64 exec, exec, s[0:1]
5234; GCN2-NEXT:    v_mov_b32_e32 v0, s2
5235; GCN2-NEXT:    v_mov_b32_e32 v1, s3
5236; GCN2-NEXT:    flat_store_dword v[0:1], v2
5237; GCN2-NEXT:    s_endpgm
5238;
5239; GCN3-LABEL: atomic_umax_i32_ret_addr64:
5240; GCN3:       ; %bb.0: ; %entry
5241; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5242; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5243; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
5244; GCN3-NEXT:    s_ashr_i32 s5, s7, 31
5245; GCN3-NEXT:    s_mov_b32 s4, s7
5246; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5247; GCN3-NEXT:    s_add_u32 s0, s0, s4
5248; GCN3-NEXT:    s_addc_u32 s1, s1, s5
5249; GCN3-NEXT:    v_mov_b32_e32 v0, s0
5250; GCN3-NEXT:    v_mov_b32_e32 v1, s1
5251; GCN3-NEXT:    flat_load_dword v2, v[0:1]
5252; GCN3-NEXT:    s_mov_b64 s[0:1], 0
5253; GCN3-NEXT:  .LBB104_1: ; %atomicrmw.start
5254; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5255; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5256; GCN3-NEXT:    v_mov_b32_e32 v3, v2
5257; GCN3-NEXT:    v_max_u32_e32 v2, s6, v3
5258; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5259; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5260; GCN3-NEXT:    buffer_wbinvl1_vol
5261; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5262; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5263; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5264; GCN3-NEXT:    s_cbranch_execnz .LBB104_1
5265; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5266; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
5267; GCN3-NEXT:    v_mov_b32_e32 v0, s2
5268; GCN3-NEXT:    v_mov_b32_e32 v1, s3
5269; GCN3-NEXT:    flat_store_dword v[0:1], v2
5270; GCN3-NEXT:    s_endpgm
5271entry:
5272  %ptr = getelementptr i32, ptr %out, i32 %index
5273  %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst
5274  store i32 %tmp0, ptr %out2
5275  ret void
5276}
5277
; Result-unused seq_cst "atomicrmw umax" at %out + 4 i32 elements (16 bytes),
; tagged with !amdgpu.no.remote.memory (metadata node !0, defined outside this
; function). The checked output is a v_max_u32 / flat_atomic_cmpswap loop on
; every target. GCN1 and GCN2 add 16 to the pointer with a v_add/v_addc pair,
; while GCN3 uses the offset:16 field of the flat instructions.
5278define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
5279; GCN1-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
5280; GCN1:       ; %bb.0:
5281; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5282; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
5283; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5284; GCN1-NEXT:    flat_load_dword v4, v[0:1]
5285; GCN1-NEXT:    s_mov_b64 s[4:5], 0
5286; GCN1-NEXT:  .LBB105_1: ; %atomicrmw.start
5287; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5288; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5289; GCN1-NEXT:    v_max_u32_e32 v3, v4, v2
5290; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5291; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5292; GCN1-NEXT:    buffer_wbinvl1_vol
5293; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5294; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5295; GCN1-NEXT:    v_mov_b32_e32 v4, v3
5296; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5297; GCN1-NEXT:    s_cbranch_execnz .LBB105_1
5298; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5299; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
5300; GCN1-NEXT:    s_setpc_b64 s[30:31]
5301;
5302; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
5303; GCN2:       ; %bb.0:
5304; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5305; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
5306; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5307; GCN2-NEXT:    flat_load_dword v4, v[0:1]
5308; GCN2-NEXT:    s_mov_b64 s[4:5], 0
5309; GCN2-NEXT:  .LBB105_1: ; %atomicrmw.start
5310; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5311; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5312; GCN2-NEXT:    v_max_u32_e32 v3, v4, v2
5313; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5314; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5315; GCN2-NEXT:    buffer_wbinvl1_vol
5316; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5317; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5318; GCN2-NEXT:    v_mov_b32_e32 v4, v3
5319; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5320; GCN2-NEXT:    s_cbranch_execnz .LBB105_1
5321; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5322; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
5323; GCN2-NEXT:    s_setpc_b64 s[30:31]
5324;
5325; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
5326; GCN3:       ; %bb.0:
5327; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5328; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
5329; GCN3-NEXT:    s_mov_b64 s[4:5], 0
5330; GCN3-NEXT:  .LBB105_1: ; %atomicrmw.start
5331; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5332; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5333; GCN3-NEXT:    v_max_u32_e32 v3, v4, v2
5334; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
5335; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5336; GCN3-NEXT:    buffer_wbinvl1_vol
5337; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5338; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5339; GCN3-NEXT:    v_mov_b32_e32 v4, v3
5340; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5341; GCN3-NEXT:    s_cbranch_execnz .LBB105_1
5342; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5343; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
5344; GCN3-NEXT:    s_setpc_b64 s[30:31]
5345  %gep = getelementptr i32, ptr %out, i64 4
5346  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
5347  ret void
5348}
5349
; Value-returning seq_cst "atomicrmw umax" at %out + 4 i32 elements (16 bytes),
; tagged with !amdgpu.no.remote.memory (metadata node !0, defined outside this
; function). The checked output is a v_max_u32 / flat_atomic_cmpswap loop on
; every target. In the GCN1 and GCN2 checks the offset pointer is built in
; v[3:4] and the loop leaves its result in v0; in the GCN3 checks the loop
; uses offset:16 and the result is copied from v3 to v0 after the loop.
5350define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
5351; GCN1-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
5352; GCN1:       ; %bb.0:
5353; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5354; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
5355; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
5356; GCN1-NEXT:    flat_load_dword v0, v[3:4]
5357; GCN1-NEXT:    s_mov_b64 s[4:5], 0
5358; GCN1-NEXT:  .LBB106_1: ; %atomicrmw.start
5359; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5360; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5361; GCN1-NEXT:    v_mov_b32_e32 v1, v0
5362; GCN1-NEXT:    v_max_u32_e32 v0, v1, v2
5363; GCN1-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5364; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5365; GCN1-NEXT:    buffer_wbinvl1_vol
5366; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
5367; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5368; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5369; GCN1-NEXT:    s_cbranch_execnz .LBB106_1
5370; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5371; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
5372; GCN1-NEXT:    s_setpc_b64 s[30:31]
5373;
5374; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
5375; GCN2:       ; %bb.0:
5376; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5377; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
5378; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
5379; GCN2-NEXT:    flat_load_dword v0, v[3:4]
5380; GCN2-NEXT:    s_mov_b64 s[4:5], 0
5381; GCN2-NEXT:  .LBB106_1: ; %atomicrmw.start
5382; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5383; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5384; GCN2-NEXT:    v_mov_b32_e32 v1, v0
5385; GCN2-NEXT:    v_max_u32_e32 v0, v1, v2
5386; GCN2-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5387; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5388; GCN2-NEXT:    buffer_wbinvl1_vol
5389; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
5390; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5391; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5392; GCN2-NEXT:    s_cbranch_execnz .LBB106_1
5393; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5394; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
5395; GCN2-NEXT:    s_setpc_b64 s[30:31]
5396;
5397; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
5398; GCN3:       ; %bb.0:
5399; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5400; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
5401; GCN3-NEXT:    s_mov_b64 s[4:5], 0
5402; GCN3-NEXT:  .LBB106_1: ; %atomicrmw.start
5403; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5404; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5405; GCN3-NEXT:    v_mov_b32_e32 v4, v3
5406; GCN3-NEXT:    v_max_u32_e32 v3, v4, v2
5407; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
5408; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5409; GCN3-NEXT:    buffer_wbinvl1_vol
5410; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5411; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5412; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5413; GCN3-NEXT:    s_cbranch_execnz .LBB106_1
5414; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5415; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
5416; GCN3-NEXT:    v_mov_b32_e32 v0, v3
5417; GCN3-NEXT:    s_setpc_b64 s[30:31]
5418  %gep = getelementptr i32, ptr %out, i64 4
5419  %result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
5420  ret i32 %result
5421}
5422
5423; ---------------------------------------------------------------------
5424; atomicrmw umin
5425; ---------------------------------------------------------------------
5426
; Result-unused seq_cst "atomicrmw umin" directly on %ptr (no offset).
; The checks are the same on all three targets: a flat_load_dword of the
; current value, then a v_min_u32 / flat_atomic_cmpswap loop that repeats
; until the value returned by the cmpswap equals the value compared against.
5427define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) {
5428; GCN1-LABEL: flat_atomic_umin_i32_noret:
5429; GCN1:       ; %bb.0:
5430; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5431; GCN1-NEXT:    flat_load_dword v4, v[0:1]
5432; GCN1-NEXT:    s_mov_b64 s[4:5], 0
5433; GCN1-NEXT:  .LBB107_1: ; %atomicrmw.start
5434; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5435; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5436; GCN1-NEXT:    v_min_u32_e32 v3, v4, v2
5437; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5438; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5439; GCN1-NEXT:    buffer_wbinvl1_vol
5440; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5441; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5442; GCN1-NEXT:    v_mov_b32_e32 v4, v3
5443; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5444; GCN1-NEXT:    s_cbranch_execnz .LBB107_1
5445; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5446; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
5447; GCN1-NEXT:    s_setpc_b64 s[30:31]
5448;
5449; GCN2-LABEL: flat_atomic_umin_i32_noret:
5450; GCN2:       ; %bb.0:
5451; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5452; GCN2-NEXT:    flat_load_dword v4, v[0:1]
5453; GCN2-NEXT:    s_mov_b64 s[4:5], 0
5454; GCN2-NEXT:  .LBB107_1: ; %atomicrmw.start
5455; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5456; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5457; GCN2-NEXT:    v_min_u32_e32 v3, v4, v2
5458; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5459; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5460; GCN2-NEXT:    buffer_wbinvl1_vol
5461; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5462; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5463; GCN2-NEXT:    v_mov_b32_e32 v4, v3
5464; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5465; GCN2-NEXT:    s_cbranch_execnz .LBB107_1
5466; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5467; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
5468; GCN2-NEXT:    s_setpc_b64 s[30:31]
5469;
5470; GCN3-LABEL: flat_atomic_umin_i32_noret:
5471; GCN3:       ; %bb.0:
5472; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5473; GCN3-NEXT:    flat_load_dword v4, v[0:1]
5474; GCN3-NEXT:    s_mov_b64 s[4:5], 0
5475; GCN3-NEXT:  .LBB107_1: ; %atomicrmw.start
5476; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5477; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5478; GCN3-NEXT:    v_min_u32_e32 v3, v4, v2
5479; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5480; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5481; GCN3-NEXT:    buffer_wbinvl1_vol
5482; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5483; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5484; GCN3-NEXT:    v_mov_b32_e32 v4, v3
5485; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5486; GCN3-NEXT:    s_cbranch_execnz .LBB107_1
5487; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5488; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
5489; GCN3-NEXT:    s_setpc_b64 s[30:31]
5490  %tmp0 = atomicrmw umin ptr %ptr, i32 %in seq_cst
5491  ret void
5492}
5493
; Result-unused seq_cst "atomicrmw umin" at %out + 4 i32 elements (16 bytes).
; The checked output is a v_min_u32 / flat_atomic_cmpswap loop on every
; target. GCN1 and GCN2 add 16 to the pointer with a v_add/v_addc pair, while
; GCN3 uses the offset:16 field of the flat instructions.
5494define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) {
5495; GCN1-LABEL: flat_atomic_umin_i32_noret_offset:
5496; GCN1:       ; %bb.0:
5497; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5498; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
5499; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5500; GCN1-NEXT:    flat_load_dword v4, v[0:1]
5501; GCN1-NEXT:    s_mov_b64 s[4:5], 0
5502; GCN1-NEXT:  .LBB108_1: ; %atomicrmw.start
5503; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5504; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5505; GCN1-NEXT:    v_min_u32_e32 v3, v4, v2
5506; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5507; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5508; GCN1-NEXT:    buffer_wbinvl1_vol
5509; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5510; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5511; GCN1-NEXT:    v_mov_b32_e32 v4, v3
5512; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5513; GCN1-NEXT:    s_cbranch_execnz .LBB108_1
5514; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5515; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
5516; GCN1-NEXT:    s_setpc_b64 s[30:31]
5517;
5518; GCN2-LABEL: flat_atomic_umin_i32_noret_offset:
5519; GCN2:       ; %bb.0:
5520; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5521; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
5522; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5523; GCN2-NEXT:    flat_load_dword v4, v[0:1]
5524; GCN2-NEXT:    s_mov_b64 s[4:5], 0
5525; GCN2-NEXT:  .LBB108_1: ; %atomicrmw.start
5526; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5527; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5528; GCN2-NEXT:    v_min_u32_e32 v3, v4, v2
5529; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5530; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5531; GCN2-NEXT:    buffer_wbinvl1_vol
5532; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5533; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5534; GCN2-NEXT:    v_mov_b32_e32 v4, v3
5535; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5536; GCN2-NEXT:    s_cbranch_execnz .LBB108_1
5537; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5538; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
5539; GCN2-NEXT:    s_setpc_b64 s[30:31]
5540;
5541; GCN3-LABEL: flat_atomic_umin_i32_noret_offset:
5542; GCN3:       ; %bb.0:
5543; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5544; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
5545; GCN3-NEXT:    s_mov_b64 s[4:5], 0
5546; GCN3-NEXT:  .LBB108_1: ; %atomicrmw.start
5547; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5548; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5549; GCN3-NEXT:    v_min_u32_e32 v3, v4, v2
5550; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
5551; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5552; GCN3-NEXT:    buffer_wbinvl1_vol
5553; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5554; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5555; GCN3-NEXT:    v_mov_b32_e32 v4, v3
5556; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5557; GCN3-NEXT:    s_cbranch_execnz .LBB108_1
5558; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5559; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
5560; GCN3-NEXT:    s_setpc_b64 s[30:31]
5561  %gep = getelementptr i32, ptr %out, i32 4
5562  %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst
5563  ret void
5564}
5565
; Value-returning seq_cst "atomicrmw umin" directly on %ptr (no offset).
; The checks are the same on all three targets: a flat_load_dword, then a
; v_min_u32 / flat_atomic_cmpswap loop, and finally a copy of the loop result
; from v3 into v0 before returning.
5566define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) {
5567; GCN1-LABEL: flat_atomic_umin_i32_ret:
5568; GCN1:       ; %bb.0:
5569; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5570; GCN1-NEXT:    flat_load_dword v3, v[0:1]
5571; GCN1-NEXT:    s_mov_b64 s[4:5], 0
5572; GCN1-NEXT:  .LBB109_1: ; %atomicrmw.start
5573; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5574; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5575; GCN1-NEXT:    v_mov_b32_e32 v4, v3
5576; GCN1-NEXT:    v_min_u32_e32 v3, v4, v2
5577; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5578; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5579; GCN1-NEXT:    buffer_wbinvl1_vol
5580; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5581; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5582; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5583; GCN1-NEXT:    s_cbranch_execnz .LBB109_1
5584; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5585; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
5586; GCN1-NEXT:    v_mov_b32_e32 v0, v3
5587; GCN1-NEXT:    s_setpc_b64 s[30:31]
5588;
5589; GCN2-LABEL: flat_atomic_umin_i32_ret:
5590; GCN2:       ; %bb.0:
5591; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5592; GCN2-NEXT:    flat_load_dword v3, v[0:1]
5593; GCN2-NEXT:    s_mov_b64 s[4:5], 0
5594; GCN2-NEXT:  .LBB109_1: ; %atomicrmw.start
5595; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5596; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5597; GCN2-NEXT:    v_mov_b32_e32 v4, v3
5598; GCN2-NEXT:    v_min_u32_e32 v3, v4, v2
5599; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5600; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5601; GCN2-NEXT:    buffer_wbinvl1_vol
5602; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5603; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5604; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5605; GCN2-NEXT:    s_cbranch_execnz .LBB109_1
5606; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5607; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
5608; GCN2-NEXT:    v_mov_b32_e32 v0, v3
5609; GCN2-NEXT:    s_setpc_b64 s[30:31]
5610;
5611; GCN3-LABEL: flat_atomic_umin_i32_ret:
5612; GCN3:       ; %bb.0:
5613; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5614; GCN3-NEXT:    flat_load_dword v3, v[0:1]
5615; GCN3-NEXT:    s_mov_b64 s[4:5], 0
5616; GCN3-NEXT:  .LBB109_1: ; %atomicrmw.start
5617; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5618; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5619; GCN3-NEXT:    v_mov_b32_e32 v4, v3
5620; GCN3-NEXT:    v_min_u32_e32 v3, v4, v2
5621; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5622; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5623; GCN3-NEXT:    buffer_wbinvl1_vol
5624; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5625; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5626; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5627; GCN3-NEXT:    s_cbranch_execnz .LBB109_1
5628; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5629; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
5630; GCN3-NEXT:    v_mov_b32_e32 v0, v3
5631; GCN3-NEXT:    s_setpc_b64 s[30:31]
5632  %result = atomicrmw umin ptr %ptr, i32 %in seq_cst
5633  ret i32 %result
5634}
5635
; Value-returning seq_cst "atomicrmw umin" at %out + 4 i32 elements (16 bytes).
; The checked output is a v_min_u32 / flat_atomic_cmpswap loop on every
; target. In the GCN1 and GCN2 checks the offset pointer is built in v[3:4]
; and the loop leaves its result in v0; in the GCN3 checks the loop uses
; offset:16 and the result is copied from v3 to v0 after the loop.
5636define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) {
5637; GCN1-LABEL: flat_atomic_umin_i32_ret_offset:
5638; GCN1:       ; %bb.0:
5639; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5640; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
5641; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
5642; GCN1-NEXT:    flat_load_dword v0, v[3:4]
5643; GCN1-NEXT:    s_mov_b64 s[4:5], 0
5644; GCN1-NEXT:  .LBB110_1: ; %atomicrmw.start
5645; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5646; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5647; GCN1-NEXT:    v_mov_b32_e32 v1, v0
5648; GCN1-NEXT:    v_min_u32_e32 v0, v1, v2
5649; GCN1-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5650; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5651; GCN1-NEXT:    buffer_wbinvl1_vol
5652; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
5653; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5654; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5655; GCN1-NEXT:    s_cbranch_execnz .LBB110_1
5656; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5657; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
5658; GCN1-NEXT:    s_setpc_b64 s[30:31]
5659;
5660; GCN2-LABEL: flat_atomic_umin_i32_ret_offset:
5661; GCN2:       ; %bb.0:
5662; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5663; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
5664; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
5665; GCN2-NEXT:    flat_load_dword v0, v[3:4]
5666; GCN2-NEXT:    s_mov_b64 s[4:5], 0
5667; GCN2-NEXT:  .LBB110_1: ; %atomicrmw.start
5668; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5669; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5670; GCN2-NEXT:    v_mov_b32_e32 v1, v0
5671; GCN2-NEXT:    v_min_u32_e32 v0, v1, v2
5672; GCN2-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5673; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5674; GCN2-NEXT:    buffer_wbinvl1_vol
5675; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
5676; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5677; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5678; GCN2-NEXT:    s_cbranch_execnz .LBB110_1
5679; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5680; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
5681; GCN2-NEXT:    s_setpc_b64 s[30:31]
5682;
5683; GCN3-LABEL: flat_atomic_umin_i32_ret_offset:
5684; GCN3:       ; %bb.0:
5685; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5686; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
5687; GCN3-NEXT:    s_mov_b64 s[4:5], 0
5688; GCN3-NEXT:  .LBB110_1: ; %atomicrmw.start
5689; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5690; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5691; GCN3-NEXT:    v_mov_b32_e32 v4, v3
5692; GCN3-NEXT:    v_min_u32_e32 v3, v4, v2
5693; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
5694; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5695; GCN3-NEXT:    buffer_wbinvl1_vol
5696; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5697; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5698; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5699; GCN3-NEXT:    s_cbranch_execnz .LBB110_1
5700; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5701; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
5702; GCN3-NEXT:    v_mov_b32_e32 v0, v3
5703; GCN3-NEXT:    s_setpc_b64 s[30:31]
5704  %gep = getelementptr i32, ptr %out, i32 4
5705  %result = atomicrmw umin ptr %gep, i32 %in seq_cst
5706  ret i32 %result
5707}
5708
; Result-unused seq_cst "atomicrmw umin" where both the pointer and the
; operand are "inreg" arguments of an amdgpu_gfx function. In the checks the
; pointer arrives in s4/s5 and is copied into v[0:1], %in is used straight
; from s6 as the scalar operand of v_min_u32, and the loop-exit mask is
; accumulated in s[34:35]. The output is the same on all three targets.
5709define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
5710; GCN1-LABEL: flat_atomic_umin_i32_noret_scalar:
5711; GCN1:       ; %bb.0:
5712; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5713; GCN1-NEXT:    v_mov_b32_e32 v0, s4
5714; GCN1-NEXT:    v_mov_b32_e32 v1, s5
5715; GCN1-NEXT:    flat_load_dword v3, v[0:1]
5716; GCN1-NEXT:    s_mov_b64 s[34:35], 0
5717; GCN1-NEXT:  .LBB111_1: ; %atomicrmw.start
5718; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5719; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5720; GCN1-NEXT:    v_min_u32_e32 v2, s6, v3
5721; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5722; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5723; GCN1-NEXT:    buffer_wbinvl1_vol
5724; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5725; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5726; GCN1-NEXT:    v_mov_b32_e32 v3, v2
5727; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5728; GCN1-NEXT:    s_cbranch_execnz .LBB111_1
5729; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5730; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
5731; GCN1-NEXT:    s_setpc_b64 s[30:31]
5732;
5733; GCN2-LABEL: flat_atomic_umin_i32_noret_scalar:
5734; GCN2:       ; %bb.0:
5735; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5736; GCN2-NEXT:    v_mov_b32_e32 v0, s4
5737; GCN2-NEXT:    v_mov_b32_e32 v1, s5
5738; GCN2-NEXT:    flat_load_dword v3, v[0:1]
5739; GCN2-NEXT:    s_mov_b64 s[34:35], 0
5740; GCN2-NEXT:  .LBB111_1: ; %atomicrmw.start
5741; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5742; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5743; GCN2-NEXT:    v_min_u32_e32 v2, s6, v3
5744; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5745; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5746; GCN2-NEXT:    buffer_wbinvl1_vol
5747; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5748; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5749; GCN2-NEXT:    v_mov_b32_e32 v3, v2
5750; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5751; GCN2-NEXT:    s_cbranch_execnz .LBB111_1
5752; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5753; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
5754; GCN2-NEXT:    s_setpc_b64 s[30:31]
5755;
5756; GCN3-LABEL: flat_atomic_umin_i32_noret_scalar:
5757; GCN3:       ; %bb.0:
5758; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5759; GCN3-NEXT:    v_mov_b32_e32 v0, s4
5760; GCN3-NEXT:    v_mov_b32_e32 v1, s5
5761; GCN3-NEXT:    flat_load_dword v3, v[0:1]
5762; GCN3-NEXT:    s_mov_b64 s[34:35], 0
5763; GCN3-NEXT:  .LBB111_1: ; %atomicrmw.start
5764; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5765; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5766; GCN3-NEXT:    v_min_u32_e32 v2, s6, v3
5767; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5768; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5769; GCN3-NEXT:    buffer_wbinvl1_vol
5770; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5771; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5772; GCN3-NEXT:    v_mov_b32_e32 v3, v2
5773; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5774; GCN3-NEXT:    s_cbranch_execnz .LBB111_1
5775; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5776; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
5777; GCN3-NEXT:    s_setpc_b64 s[30:31]
5778  %tmp0 = atomicrmw umin ptr %ptr, i32 %in seq_cst
5779  ret void
5780}
5781
; Result-unused seq_cst "atomicrmw umin" at %out + 4 i32 elements (16 bytes),
; with "inreg" pointer and operand in an amdgpu_gfx function. In the GCN1 and
; GCN2 checks the offset is applied with s_add_u32/s_addc_u32 into s34/s35
; before the address is copied into v[0:1]; in the GCN3 checks s4/s5 are
; copied unchanged and the flat instructions carry offset:16. The operand is
; read from s6 and the loop-exit mask lives in s[34:35] on every target.
5782define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
5783; GCN1-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
5784; GCN1:       ; %bb.0:
5785; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5786; GCN1-NEXT:    s_add_u32 s34, s4, 16
5787; GCN1-NEXT:    s_addc_u32 s35, s5, 0
5788; GCN1-NEXT:    v_mov_b32_e32 v0, s34
5789; GCN1-NEXT:    v_mov_b32_e32 v1, s35
5790; GCN1-NEXT:    flat_load_dword v3, v[0:1]
5791; GCN1-NEXT:    s_mov_b64 s[34:35], 0
5792; GCN1-NEXT:  .LBB112_1: ; %atomicrmw.start
5793; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5794; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5795; GCN1-NEXT:    v_min_u32_e32 v2, s6, v3
5796; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5797; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5798; GCN1-NEXT:    buffer_wbinvl1_vol
5799; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5800; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5801; GCN1-NEXT:    v_mov_b32_e32 v3, v2
5802; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5803; GCN1-NEXT:    s_cbranch_execnz .LBB112_1
5804; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5805; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
5806; GCN1-NEXT:    s_setpc_b64 s[30:31]
5807;
5808; GCN2-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
5809; GCN2:       ; %bb.0:
5810; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5811; GCN2-NEXT:    s_add_u32 s34, s4, 16
5812; GCN2-NEXT:    s_addc_u32 s35, s5, 0
5813; GCN2-NEXT:    v_mov_b32_e32 v0, s34
5814; GCN2-NEXT:    v_mov_b32_e32 v1, s35
5815; GCN2-NEXT:    flat_load_dword v3, v[0:1]
5816; GCN2-NEXT:    s_mov_b64 s[34:35], 0
5817; GCN2-NEXT:  .LBB112_1: ; %atomicrmw.start
5818; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5819; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5820; GCN2-NEXT:    v_min_u32_e32 v2, s6, v3
5821; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5822; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5823; GCN2-NEXT:    buffer_wbinvl1_vol
5824; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5825; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5826; GCN2-NEXT:    v_mov_b32_e32 v3, v2
5827; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5828; GCN2-NEXT:    s_cbranch_execnz .LBB112_1
5829; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5830; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
5831; GCN2-NEXT:    s_setpc_b64 s[30:31]
5832;
5833; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
5834; GCN3:       ; %bb.0:
5835; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5836; GCN3-NEXT:    v_mov_b32_e32 v0, s4
5837; GCN3-NEXT:    v_mov_b32_e32 v1, s5
5838; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
5839; GCN3-NEXT:    s_mov_b64 s[34:35], 0
5840; GCN3-NEXT:  .LBB112_1: ; %atomicrmw.start
5841; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5842; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5843; GCN3-NEXT:    v_min_u32_e32 v2, s6, v3
5844; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5845; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5846; GCN3-NEXT:    buffer_wbinvl1_vol
5847; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5848; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5849; GCN3-NEXT:    v_mov_b32_e32 v3, v2
5850; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5851; GCN3-NEXT:    s_cbranch_execnz .LBB112_1
5852; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5853; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
5854; GCN3-NEXT:    s_setpc_b64 s[30:31]
5855  %gep = getelementptr i32, ptr %out, i32 4
5856  %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst
5857  ret void
5858}
5859
; Test: value-returning seq_cst `atomicrmw umin` on a flat pointer, with the
; pointer and the operand both passed `inreg` under the amdgpu_gfx convention.
; For all three check prefixes the expected code is an initial flat_load_dword
; followed by a v_min_u32 + flat_atomic_cmpswap retry loop (%atomicrmw.start);
; the pointer is copied from s4/s5, the operand is read from s6, and the loop's
; exec bookkeeping lives in s[34:35]. The result is left in v0.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
5860define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
5861; GCN1-LABEL: flat_atomic_umin_i32_ret_scalar:
5862; GCN1:       ; %bb.0:
5863; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5864; GCN1-NEXT:    v_mov_b32_e32 v0, s4
5865; GCN1-NEXT:    v_mov_b32_e32 v1, s5
5866; GCN1-NEXT:    flat_load_dword v0, v[0:1]
5867; GCN1-NEXT:    v_mov_b32_e32 v1, s4
5868; GCN1-NEXT:    s_mov_b64 s[34:35], 0
5869; GCN1-NEXT:    v_mov_b32_e32 v2, s5
5870; GCN1-NEXT:  .LBB113_1: ; %atomicrmw.start
5871; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5872; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5873; GCN1-NEXT:    v_mov_b32_e32 v4, v0
5874; GCN1-NEXT:    v_min_u32_e32 v3, s6, v4
5875; GCN1-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5876; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5877; GCN1-NEXT:    buffer_wbinvl1_vol
5878; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
5879; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5880; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5881; GCN1-NEXT:    s_cbranch_execnz .LBB113_1
5882; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5883; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
5884; GCN1-NEXT:    s_setpc_b64 s[30:31]
5885;
5886; GCN2-LABEL: flat_atomic_umin_i32_ret_scalar:
5887; GCN2:       ; %bb.0:
5888; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5889; GCN2-NEXT:    v_mov_b32_e32 v0, s4
5890; GCN2-NEXT:    v_mov_b32_e32 v1, s5
5891; GCN2-NEXT:    flat_load_dword v0, v[0:1]
5892; GCN2-NEXT:    v_mov_b32_e32 v1, s4
5893; GCN2-NEXT:    s_mov_b64 s[34:35], 0
5894; GCN2-NEXT:    v_mov_b32_e32 v2, s5
5895; GCN2-NEXT:  .LBB113_1: ; %atomicrmw.start
5896; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5897; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5898; GCN2-NEXT:    v_mov_b32_e32 v4, v0
5899; GCN2-NEXT:    v_min_u32_e32 v3, s6, v4
5900; GCN2-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5901; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5902; GCN2-NEXT:    buffer_wbinvl1_vol
5903; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
5904; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5905; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5906; GCN2-NEXT:    s_cbranch_execnz .LBB113_1
5907; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5908; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
5909; GCN2-NEXT:    s_setpc_b64 s[30:31]
5910;
5911; GCN3-LABEL: flat_atomic_umin_i32_ret_scalar:
5912; GCN3:       ; %bb.0:
5913; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5914; GCN3-NEXT:    v_mov_b32_e32 v0, s4
5915; GCN3-NEXT:    v_mov_b32_e32 v1, s5
5916; GCN3-NEXT:    flat_load_dword v0, v[0:1]
5917; GCN3-NEXT:    v_mov_b32_e32 v1, s4
5918; GCN3-NEXT:    s_mov_b64 s[34:35], 0
5919; GCN3-NEXT:    v_mov_b32_e32 v2, s5
5920; GCN3-NEXT:  .LBB113_1: ; %atomicrmw.start
5921; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
5922; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5923; GCN3-NEXT:    v_mov_b32_e32 v4, v0
5924; GCN3-NEXT:    v_min_u32_e32 v3, s6, v4
5925; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5926; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5927; GCN3-NEXT:    buffer_wbinvl1_vol
5928; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
5929; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5930; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5931; GCN3-NEXT:    s_cbranch_execnz .LBB113_1
5932; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
5933; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
5934; GCN3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the value produced by the atomicrmw is the function result.
5935  %result = atomicrmw umin ptr %ptr, i32 %in seq_cst
5936  ret i32 %result
5937}
5938
; Test: value-returning seq_cst `atomicrmw umin` at a constant offset from an
; `inreg` flat pointer (amdgpu_gfx convention); the operand is also `inreg`.
; The expected code is a flat_load_dword followed by a v_min_u32 +
; flat_atomic_cmpswap retry loop. The GCN1 and GCN2 assertions form the address
; with s_add_u32/s_addc_u32 of 16 before moving it to VGPRs, whereas the GCN3
; assertions keep the base pointer and use `offset:16` on both the load and the
; cmpswap. The result is left in v0.
5939define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
5940; GCN1-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
5941; GCN1:       ; %bb.0:
5942; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5943; GCN1-NEXT:    s_add_u32 s34, s4, 16
5944; GCN1-NEXT:    s_addc_u32 s35, s5, 0
5945; GCN1-NEXT:    v_mov_b32_e32 v1, s34
5946; GCN1-NEXT:    v_mov_b32_e32 v2, s35
5947; GCN1-NEXT:    flat_load_dword v0, v[1:2]
5948; GCN1-NEXT:    s_mov_b64 s[34:35], 0
5949; GCN1-NEXT:  .LBB114_1: ; %atomicrmw.start
5950; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
5951; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5952; GCN1-NEXT:    v_mov_b32_e32 v4, v0
5953; GCN1-NEXT:    v_min_u32_e32 v3, s6, v4
5954; GCN1-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5955; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5956; GCN1-NEXT:    buffer_wbinvl1_vol
5957; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
5958; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5959; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5960; GCN1-NEXT:    s_cbranch_execnz .LBB114_1
5961; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
5962; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
5963; GCN1-NEXT:    s_setpc_b64 s[30:31]
5964;
5965; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
5966; GCN2:       ; %bb.0:
5967; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5968; GCN2-NEXT:    s_add_u32 s34, s4, 16
5969; GCN2-NEXT:    s_addc_u32 s35, s5, 0
5970; GCN2-NEXT:    v_mov_b32_e32 v1, s34
5971; GCN2-NEXT:    v_mov_b32_e32 v2, s35
5972; GCN2-NEXT:    flat_load_dword v0, v[1:2]
5973; GCN2-NEXT:    s_mov_b64 s[34:35], 0
5974; GCN2-NEXT:  .LBB114_1: ; %atomicrmw.start
5975; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
5976; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5977; GCN2-NEXT:    v_mov_b32_e32 v4, v0
5978; GCN2-NEXT:    v_min_u32_e32 v3, s6, v4
5979; GCN2-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5980; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5981; GCN2-NEXT:    buffer_wbinvl1_vol
5982; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
5983; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5984; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5985; GCN2-NEXT:    s_cbranch_execnz .LBB114_1
5986; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
5987; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
5988; GCN2-NEXT:    s_setpc_b64 s[30:31]
5989;
5990; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
5991; GCN3:       ; %bb.0:
5992; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5993; GCN3-NEXT:    v_mov_b32_e32 v0, s4
5994; GCN3-NEXT:    v_mov_b32_e32 v1, s5
5995; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
5996; GCN3-NEXT:    v_mov_b32_e32 v1, s4
5997; GCN3-NEXT:    s_mov_b64 s[34:35], 0
5998; GCN3-NEXT:    v_mov_b32_e32 v2, s5
5999; GCN3-NEXT:  .LBB114_1: ; %atomicrmw.start
6000; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6001; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6002; GCN3-NEXT:    v_mov_b32_e32 v4, v0
6003; GCN3-NEXT:    v_min_u32_e32 v3, s6, v4
6004; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
6005; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6006; GCN3-NEXT:    buffer_wbinvl1_vol
6007; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
6008; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6009; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6010; GCN3-NEXT:    s_cbranch_execnz .LBB114_1
6011; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6012; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
6013; GCN3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %gep steps 4 i32 elements past %out, i.e. the 16 bytes that
; appear in the assertions above; the atomicrmw result is returned.
6014  %gep = getelementptr i32, ptr %out, i32 4
6015  %result = atomicrmw umin ptr %gep, i32 %in seq_cst
6016  ret i32 %result
6017}
6018
; Test: seq_cst `atomicrmw umin` with an unused result, at a constant offset
; from a flat pointer, where the instruction carries the
; !amdgpu.no.remote.memory annotation. Arguments arrive in VGPRs (pointer in
; v[0:1], operand in v2).
; Even with the annotation, the assertions for all three prefixes expect a
; flat_load_dword followed by a v_min_u32 + flat_atomic_cmpswap retry loop.
; GCN1 and GCN2 add 16 to the pointer explicitly (v_add_i32 / v_add_u32 plus
; v_addc_u32); GCN3 uses `offset:16` on the memory instructions instead.
; NOTE(review): the !0 node is defined outside this part of the file;
; presumably it is an empty metadata tuple -- confirm at the end of the file.
6019define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
6020; GCN1-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
6021; GCN1:       ; %bb.0:
6022; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6023; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
6024; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6025; GCN1-NEXT:    flat_load_dword v4, v[0:1]
6026; GCN1-NEXT:    s_mov_b64 s[4:5], 0
6027; GCN1-NEXT:  .LBB115_1: ; %atomicrmw.start
6028; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6029; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6030; GCN1-NEXT:    v_min_u32_e32 v3, v4, v2
6031; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6032; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6033; GCN1-NEXT:    buffer_wbinvl1_vol
6034; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6035; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6036; GCN1-NEXT:    v_mov_b32_e32 v4, v3
6037; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6038; GCN1-NEXT:    s_cbranch_execnz .LBB115_1
6039; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6040; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
6041; GCN1-NEXT:    s_setpc_b64 s[30:31]
6042;
6043; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
6044; GCN2:       ; %bb.0:
6045; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6046; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
6047; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6048; GCN2-NEXT:    flat_load_dword v4, v[0:1]
6049; GCN2-NEXT:    s_mov_b64 s[4:5], 0
6050; GCN2-NEXT:  .LBB115_1: ; %atomicrmw.start
6051; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6052; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6053; GCN2-NEXT:    v_min_u32_e32 v3, v4, v2
6054; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6055; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6056; GCN2-NEXT:    buffer_wbinvl1_vol
6057; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6058; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6059; GCN2-NEXT:    v_mov_b32_e32 v4, v3
6060; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6061; GCN2-NEXT:    s_cbranch_execnz .LBB115_1
6062; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6063; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
6064; GCN2-NEXT:    s_setpc_b64 s[30:31]
6065;
6066; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
6067; GCN3:       ; %bb.0:
6068; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6069; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
6070; GCN3-NEXT:    s_mov_b64 s[4:5], 0
6071; GCN3-NEXT:  .LBB115_1: ; %atomicrmw.start
6072; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6073; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6074; GCN3-NEXT:    v_min_u32_e32 v3, v4, v2
6075; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
6076; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6077; GCN3-NEXT:    buffer_wbinvl1_vol
6078; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6079; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6080; GCN3-NEXT:    v_mov_b32_e32 v4, v3
6081; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6082; GCN3-NEXT:    s_cbranch_execnz .LBB115_1
6083; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6084; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
6085; GCN3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %gep steps 4 i32 elements (16 bytes) past %out; the atomicrmw
; result %tmp0 is never used, which makes this the "noret" form.
6086  %gep = getelementptr i32, ptr %out, i64 4
6087  %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
6088  ret void
6089}
6090
; Test: value-returning seq_cst `atomicrmw umin` at a constant offset from a
; flat pointer, where the instruction carries the !amdgpu.no.remote.memory
; annotation. Arguments arrive in VGPRs (pointer in v[0:1], operand in v2).
; The assertions for all three prefixes expect a flat_load_dword followed by a
; v_min_u32 + flat_atomic_cmpswap retry loop. GCN1 and GCN2 compute the address
; into v[3:4] with an explicit add of 16 and produce the result directly in v0;
; GCN3 uses `offset:16` and copies the result from v3 into v0 before returning.
; NOTE(review): the !0 node is defined outside this part of the file;
; presumably it is an empty metadata tuple -- confirm at the end of the file.
6091define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
6092; GCN1-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
6093; GCN1:       ; %bb.0:
6094; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6095; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
6096; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
6097; GCN1-NEXT:    flat_load_dword v0, v[3:4]
6098; GCN1-NEXT:    s_mov_b64 s[4:5], 0
6099; GCN1-NEXT:  .LBB116_1: ; %atomicrmw.start
6100; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6101; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6102; GCN1-NEXT:    v_mov_b32_e32 v1, v0
6103; GCN1-NEXT:    v_min_u32_e32 v0, v1, v2
6104; GCN1-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6105; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6106; GCN1-NEXT:    buffer_wbinvl1_vol
6107; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
6108; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6109; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6110; GCN1-NEXT:    s_cbranch_execnz .LBB116_1
6111; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6112; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
6113; GCN1-NEXT:    s_setpc_b64 s[30:31]
6114;
6115; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
6116; GCN2:       ; %bb.0:
6117; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6118; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
6119; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
6120; GCN2-NEXT:    flat_load_dword v0, v[3:4]
6121; GCN2-NEXT:    s_mov_b64 s[4:5], 0
6122; GCN2-NEXT:  .LBB116_1: ; %atomicrmw.start
6123; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6124; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6125; GCN2-NEXT:    v_mov_b32_e32 v1, v0
6126; GCN2-NEXT:    v_min_u32_e32 v0, v1, v2
6127; GCN2-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6128; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6129; GCN2-NEXT:    buffer_wbinvl1_vol
6130; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
6131; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6132; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6133; GCN2-NEXT:    s_cbranch_execnz .LBB116_1
6134; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6135; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
6136; GCN2-NEXT:    s_setpc_b64 s[30:31]
6137;
6138; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
6139; GCN3:       ; %bb.0:
6140; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6141; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
6142; GCN3-NEXT:    s_mov_b64 s[4:5], 0
6143; GCN3-NEXT:  .LBB116_1: ; %atomicrmw.start
6144; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6145; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6146; GCN3-NEXT:    v_mov_b32_e32 v4, v3
6147; GCN3-NEXT:    v_min_u32_e32 v3, v4, v2
6148; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
6149; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6150; GCN3-NEXT:    buffer_wbinvl1_vol
6151; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6152; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6153; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6154; GCN3-NEXT:    s_cbranch_execnz .LBB116_1
6155; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6156; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
6157; GCN3-NEXT:    v_mov_b32_e32 v0, v3
6158; GCN3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %gep steps 4 i32 elements (16 bytes) past %out; the value
; produced by the atomicrmw is the function result.
6159  %gep = getelementptr i32, ptr %out, i64 4
6160  %result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
6161  ret i32 %result
6162}
6163
6164; ---------------------------------------------------------------------
6165; atomicrmw min
6166; ---------------------------------------------------------------------
6167
; Test: seq_cst signed `atomicrmw min` on a flat pointer with the result
; unused. Arguments arrive in VGPRs (pointer in v[0:1], operand in v2).
; The assertions are the same instruction sequence for all three prefixes: a
; flat_load_dword of the current value, then a retry loop (%atomicrmw.start)
; that computes v_min_i32 and attempts flat_atomic_cmpswap until the returned
; value equals the expected one, tracking finished lanes in s[4:5].
6168define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) {
6169; GCN1-LABEL: flat_atomic_min_i32_noret:
6170; GCN1:       ; %bb.0:
6171; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6172; GCN1-NEXT:    flat_load_dword v4, v[0:1]
6173; GCN1-NEXT:    s_mov_b64 s[4:5], 0
6174; GCN1-NEXT:  .LBB117_1: ; %atomicrmw.start
6175; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6176; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6177; GCN1-NEXT:    v_min_i32_e32 v3, v4, v2
6178; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6179; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6180; GCN1-NEXT:    buffer_wbinvl1_vol
6181; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6182; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6183; GCN1-NEXT:    v_mov_b32_e32 v4, v3
6184; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6185; GCN1-NEXT:    s_cbranch_execnz .LBB117_1
6186; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6187; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
6188; GCN1-NEXT:    s_setpc_b64 s[30:31]
6189;
6190; GCN2-LABEL: flat_atomic_min_i32_noret:
6191; GCN2:       ; %bb.0:
6192; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6193; GCN2-NEXT:    flat_load_dword v4, v[0:1]
6194; GCN2-NEXT:    s_mov_b64 s[4:5], 0
6195; GCN2-NEXT:  .LBB117_1: ; %atomicrmw.start
6196; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6197; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6198; GCN2-NEXT:    v_min_i32_e32 v3, v4, v2
6199; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6200; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6201; GCN2-NEXT:    buffer_wbinvl1_vol
6202; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6203; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6204; GCN2-NEXT:    v_mov_b32_e32 v4, v3
6205; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6206; GCN2-NEXT:    s_cbranch_execnz .LBB117_1
6207; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6208; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
6209; GCN2-NEXT:    s_setpc_b64 s[30:31]
6210;
6211; GCN3-LABEL: flat_atomic_min_i32_noret:
6212; GCN3:       ; %bb.0:
6213; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6214; GCN3-NEXT:    flat_load_dword v4, v[0:1]
6215; GCN3-NEXT:    s_mov_b64 s[4:5], 0
6216; GCN3-NEXT:  .LBB117_1: ; %atomicrmw.start
6217; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6218; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6219; GCN3-NEXT:    v_min_i32_e32 v3, v4, v2
6220; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6221; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6222; GCN3-NEXT:    buffer_wbinvl1_vol
6223; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6224; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6225; GCN3-NEXT:    v_mov_b32_e32 v4, v3
6226; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6227; GCN3-NEXT:    s_cbranch_execnz .LBB117_1
6228; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6229; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
6230; GCN3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the atomicrmw result %tmp0 is never used ("noret" form).
6231  %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst
6232  ret void
6233}
6234
; Test: seq_cst signed `atomicrmw min` with an unused result, at a constant
; offset from a flat pointer. Arguments arrive in VGPRs (pointer in v[0:1],
; operand in v2).
; The expected code is a flat_load_dword followed by a v_min_i32 +
; flat_atomic_cmpswap retry loop. GCN1 and GCN2 add 16 to the pointer in place
; (v_add_i32 / v_add_u32 plus v_addc_u32); GCN3 leaves the pointer untouched
; and uses `offset:16` on both the load and the cmpswap.
6235define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) {
6236; GCN1-LABEL: flat_atomic_min_i32_noret_offset:
6237; GCN1:       ; %bb.0:
6238; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6239; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
6240; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6241; GCN1-NEXT:    flat_load_dword v4, v[0:1]
6242; GCN1-NEXT:    s_mov_b64 s[4:5], 0
6243; GCN1-NEXT:  .LBB118_1: ; %atomicrmw.start
6244; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6245; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6246; GCN1-NEXT:    v_min_i32_e32 v3, v4, v2
6247; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6248; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6249; GCN1-NEXT:    buffer_wbinvl1_vol
6250; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6251; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6252; GCN1-NEXT:    v_mov_b32_e32 v4, v3
6253; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6254; GCN1-NEXT:    s_cbranch_execnz .LBB118_1
6255; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6256; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
6257; GCN1-NEXT:    s_setpc_b64 s[30:31]
6258;
6259; GCN2-LABEL: flat_atomic_min_i32_noret_offset:
6260; GCN2:       ; %bb.0:
6261; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6262; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
6263; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6264; GCN2-NEXT:    flat_load_dword v4, v[0:1]
6265; GCN2-NEXT:    s_mov_b64 s[4:5], 0
6266; GCN2-NEXT:  .LBB118_1: ; %atomicrmw.start
6267; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6268; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6269; GCN2-NEXT:    v_min_i32_e32 v3, v4, v2
6270; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6271; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6272; GCN2-NEXT:    buffer_wbinvl1_vol
6273; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6274; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6275; GCN2-NEXT:    v_mov_b32_e32 v4, v3
6276; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6277; GCN2-NEXT:    s_cbranch_execnz .LBB118_1
6278; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6279; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
6280; GCN2-NEXT:    s_setpc_b64 s[30:31]
6281;
6282; GCN3-LABEL: flat_atomic_min_i32_noret_offset:
6283; GCN3:       ; %bb.0:
6284; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6285; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
6286; GCN3-NEXT:    s_mov_b64 s[4:5], 0
6287; GCN3-NEXT:  .LBB118_1: ; %atomicrmw.start
6288; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6289; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6290; GCN3-NEXT:    v_min_i32_e32 v3, v4, v2
6291; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
6292; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6293; GCN3-NEXT:    buffer_wbinvl1_vol
6294; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6295; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6296; GCN3-NEXT:    v_mov_b32_e32 v4, v3
6297; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6298; GCN3-NEXT:    s_cbranch_execnz .LBB118_1
6299; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6300; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
6301; GCN3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %gep steps 4 i32 elements (16 bytes) past %out; the atomicrmw
; result %tmp0 is never used ("noret" form).
6302  %gep = getelementptr i32, ptr %out, i32 4
6303  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
6304  ret void
6305}
6306
; Test: value-returning seq_cst signed `atomicrmw min` on a flat pointer.
; Arguments arrive in VGPRs (pointer in v[0:1], operand in v2).
; The assertions are the same instruction sequence for all three prefixes: a
; flat_load_dword into v3, a v_min_i32 + flat_atomic_cmpswap retry loop that
; keeps the previously observed value in v4, and a final copy of v3 into v0
; (the return register) once the loop exits.
6307define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) {
6308; GCN1-LABEL: flat_atomic_min_i32_ret:
6309; GCN1:       ; %bb.0:
6310; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6311; GCN1-NEXT:    flat_load_dword v3, v[0:1]
6312; GCN1-NEXT:    s_mov_b64 s[4:5], 0
6313; GCN1-NEXT:  .LBB119_1: ; %atomicrmw.start
6314; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6315; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6316; GCN1-NEXT:    v_mov_b32_e32 v4, v3
6317; GCN1-NEXT:    v_min_i32_e32 v3, v4, v2
6318; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6319; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6320; GCN1-NEXT:    buffer_wbinvl1_vol
6321; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6322; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6323; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6324; GCN1-NEXT:    s_cbranch_execnz .LBB119_1
6325; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6326; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
6327; GCN1-NEXT:    v_mov_b32_e32 v0, v3
6328; GCN1-NEXT:    s_setpc_b64 s[30:31]
6329;
6330; GCN2-LABEL: flat_atomic_min_i32_ret:
6331; GCN2:       ; %bb.0:
6332; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6333; GCN2-NEXT:    flat_load_dword v3, v[0:1]
6334; GCN2-NEXT:    s_mov_b64 s[4:5], 0
6335; GCN2-NEXT:  .LBB119_1: ; %atomicrmw.start
6336; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6337; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6338; GCN2-NEXT:    v_mov_b32_e32 v4, v3
6339; GCN2-NEXT:    v_min_i32_e32 v3, v4, v2
6340; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6341; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6342; GCN2-NEXT:    buffer_wbinvl1_vol
6343; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6344; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6345; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6346; GCN2-NEXT:    s_cbranch_execnz .LBB119_1
6347; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6348; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
6349; GCN2-NEXT:    v_mov_b32_e32 v0, v3
6350; GCN2-NEXT:    s_setpc_b64 s[30:31]
6351;
6352; GCN3-LABEL: flat_atomic_min_i32_ret:
6353; GCN3:       ; %bb.0:
6354; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6355; GCN3-NEXT:    flat_load_dword v3, v[0:1]
6356; GCN3-NEXT:    s_mov_b64 s[4:5], 0
6357; GCN3-NEXT:  .LBB119_1: ; %atomicrmw.start
6358; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6359; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6360; GCN3-NEXT:    v_mov_b32_e32 v4, v3
6361; GCN3-NEXT:    v_min_i32_e32 v3, v4, v2
6362; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6363; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6364; GCN3-NEXT:    buffer_wbinvl1_vol
6365; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6366; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6367; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6368; GCN3-NEXT:    s_cbranch_execnz .LBB119_1
6369; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6370; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
6371; GCN3-NEXT:    v_mov_b32_e32 v0, v3
6372; GCN3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the value produced by the atomicrmw is the function result.
6373  %result = atomicrmw min ptr %ptr, i32 %in seq_cst
6374  ret i32 %result
6375}
6376
; Test: value-returning seq_cst signed `atomicrmw min` at a constant offset
; from a flat pointer. Arguments arrive in VGPRs (pointer in v[0:1], operand
; in v2).
; The expected code is a flat_load_dword followed by a v_min_i32 +
; flat_atomic_cmpswap retry loop. GCN1 and GCN2 compute the address into v[3:4]
; with an explicit add of 16 and produce the result directly in v0; GCN3 uses
; `offset:16` and copies the result from v3 into v0 before returning.
6377define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) {
6378; GCN1-LABEL: flat_atomic_min_i32_ret_offset:
6379; GCN1:       ; %bb.0:
6380; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6381; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
6382; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
6383; GCN1-NEXT:    flat_load_dword v0, v[3:4]
6384; GCN1-NEXT:    s_mov_b64 s[4:5], 0
6385; GCN1-NEXT:  .LBB120_1: ; %atomicrmw.start
6386; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6387; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6388; GCN1-NEXT:    v_mov_b32_e32 v1, v0
6389; GCN1-NEXT:    v_min_i32_e32 v0, v1, v2
6390; GCN1-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6391; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6392; GCN1-NEXT:    buffer_wbinvl1_vol
6393; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
6394; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6395; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6396; GCN1-NEXT:    s_cbranch_execnz .LBB120_1
6397; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6398; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
6399; GCN1-NEXT:    s_setpc_b64 s[30:31]
6400;
6401; GCN2-LABEL: flat_atomic_min_i32_ret_offset:
6402; GCN2:       ; %bb.0:
6403; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6404; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
6405; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
6406; GCN2-NEXT:    flat_load_dword v0, v[3:4]
6407; GCN2-NEXT:    s_mov_b64 s[4:5], 0
6408; GCN2-NEXT:  .LBB120_1: ; %atomicrmw.start
6409; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6410; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6411; GCN2-NEXT:    v_mov_b32_e32 v1, v0
6412; GCN2-NEXT:    v_min_i32_e32 v0, v1, v2
6413; GCN2-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6414; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6415; GCN2-NEXT:    buffer_wbinvl1_vol
6416; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
6417; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6418; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6419; GCN2-NEXT:    s_cbranch_execnz .LBB120_1
6420; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6421; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
6422; GCN2-NEXT:    s_setpc_b64 s[30:31]
6423;
6424; GCN3-LABEL: flat_atomic_min_i32_ret_offset:
6425; GCN3:       ; %bb.0:
6426; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6427; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
6428; GCN3-NEXT:    s_mov_b64 s[4:5], 0
6429; GCN3-NEXT:  .LBB120_1: ; %atomicrmw.start
6430; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6431; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6432; GCN3-NEXT:    v_mov_b32_e32 v4, v3
6433; GCN3-NEXT:    v_min_i32_e32 v3, v4, v2
6434; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
6435; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6436; GCN3-NEXT:    buffer_wbinvl1_vol
6437; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6438; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6439; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6440; GCN3-NEXT:    s_cbranch_execnz .LBB120_1
6441; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6442; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
6443; GCN3-NEXT:    v_mov_b32_e32 v0, v3
6444; GCN3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %gep steps 4 i32 elements (16 bytes) past %out; the value
; produced by the atomicrmw is the function result.
6445  %gep = getelementptr i32, ptr %out, i32 4
6446  %result = atomicrmw min ptr %gep, i32 %in seq_cst
6447  ret i32 %result
6448}
6449
; Test: seq_cst signed `atomicrmw min` with an unused result, where the flat
; pointer and the operand are both passed `inreg` under the amdgpu_gfx
; convention.
; The assertions are the same instruction sequence for all three prefixes: the
; pointer is copied from s4/s5 into v[0:1], the current value is loaded with
; flat_load_dword, and a retry loop computes v_min_i32 against s6 and attempts
; flat_atomic_cmpswap, tracking finished lanes in s[34:35].
6450define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
6451; GCN1-LABEL: flat_atomic_min_i32_noret_scalar:
6452; GCN1:       ; %bb.0:
6453; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6454; GCN1-NEXT:    v_mov_b32_e32 v0, s4
6455; GCN1-NEXT:    v_mov_b32_e32 v1, s5
6456; GCN1-NEXT:    flat_load_dword v3, v[0:1]
6457; GCN1-NEXT:    s_mov_b64 s[34:35], 0
6458; GCN1-NEXT:  .LBB121_1: ; %atomicrmw.start
6459; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6460; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6461; GCN1-NEXT:    v_min_i32_e32 v2, s6, v3
6462; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6463; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6464; GCN1-NEXT:    buffer_wbinvl1_vol
6465; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6466; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6467; GCN1-NEXT:    v_mov_b32_e32 v3, v2
6468; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6469; GCN1-NEXT:    s_cbranch_execnz .LBB121_1
6470; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6471; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
6472; GCN1-NEXT:    s_setpc_b64 s[30:31]
6473;
6474; GCN2-LABEL: flat_atomic_min_i32_noret_scalar:
6475; GCN2:       ; %bb.0:
6476; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6477; GCN2-NEXT:    v_mov_b32_e32 v0, s4
6478; GCN2-NEXT:    v_mov_b32_e32 v1, s5
6479; GCN2-NEXT:    flat_load_dword v3, v[0:1]
6480; GCN2-NEXT:    s_mov_b64 s[34:35], 0
6481; GCN2-NEXT:  .LBB121_1: ; %atomicrmw.start
6482; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6483; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6484; GCN2-NEXT:    v_min_i32_e32 v2, s6, v3
6485; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6486; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6487; GCN2-NEXT:    buffer_wbinvl1_vol
6488; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6489; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6490; GCN2-NEXT:    v_mov_b32_e32 v3, v2
6491; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6492; GCN2-NEXT:    s_cbranch_execnz .LBB121_1
6493; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6494; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
6495; GCN2-NEXT:    s_setpc_b64 s[30:31]
6496;
6497; GCN3-LABEL: flat_atomic_min_i32_noret_scalar:
6498; GCN3:       ; %bb.0:
6499; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6500; GCN3-NEXT:    v_mov_b32_e32 v0, s4
6501; GCN3-NEXT:    v_mov_b32_e32 v1, s5
6502; GCN3-NEXT:    flat_load_dword v3, v[0:1]
6503; GCN3-NEXT:    s_mov_b64 s[34:35], 0
6504; GCN3-NEXT:  .LBB121_1: ; %atomicrmw.start
6505; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6506; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6507; GCN3-NEXT:    v_min_i32_e32 v2, s6, v3
6508; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6509; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6510; GCN3-NEXT:    buffer_wbinvl1_vol
6511; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6512; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6513; GCN3-NEXT:    v_mov_b32_e32 v3, v2
6514; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6515; GCN3-NEXT:    s_cbranch_execnz .LBB121_1
6516; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6517; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
6518; GCN3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the atomicrmw result %tmp0 is never used ("noret" form).
6519  %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst
6520  ret void
6521}
6522
; Test: seq_cst signed `atomicrmw min` with an unused result, at a constant
; offset from an `inreg` flat pointer (amdgpu_gfx convention); the operand is
; also `inreg`.
; The expected code is a flat_load_dword followed by a v_min_i32 (against s6) +
; flat_atomic_cmpswap retry loop. The GCN1 and GCN2 assertions form the address
; with s_add_u32/s_addc_u32 of 16 into s[34:35] before moving it to v[0:1] (and
; then reuse s[34:35] as the loop mask); the GCN3 assertions copy s4/s5
; unchanged and use `offset:16` on both the load and the cmpswap.
6523define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
6524; GCN1-LABEL: flat_atomic_min_i32_noret_offset_scalar:
6525; GCN1:       ; %bb.0:
6526; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6527; GCN1-NEXT:    s_add_u32 s34, s4, 16
6528; GCN1-NEXT:    s_addc_u32 s35, s5, 0
6529; GCN1-NEXT:    v_mov_b32_e32 v0, s34
6530; GCN1-NEXT:    v_mov_b32_e32 v1, s35
6531; GCN1-NEXT:    flat_load_dword v3, v[0:1]
6532; GCN1-NEXT:    s_mov_b64 s[34:35], 0
6533; GCN1-NEXT:  .LBB122_1: ; %atomicrmw.start
6534; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6535; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6536; GCN1-NEXT:    v_min_i32_e32 v2, s6, v3
6537; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6538; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6539; GCN1-NEXT:    buffer_wbinvl1_vol
6540; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6541; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6542; GCN1-NEXT:    v_mov_b32_e32 v3, v2
6543; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6544; GCN1-NEXT:    s_cbranch_execnz .LBB122_1
6545; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6546; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
6547; GCN1-NEXT:    s_setpc_b64 s[30:31]
6548;
6549; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar:
6550; GCN2:       ; %bb.0:
6551; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6552; GCN2-NEXT:    s_add_u32 s34, s4, 16
6553; GCN2-NEXT:    s_addc_u32 s35, s5, 0
6554; GCN2-NEXT:    v_mov_b32_e32 v0, s34
6555; GCN2-NEXT:    v_mov_b32_e32 v1, s35
6556; GCN2-NEXT:    flat_load_dword v3, v[0:1]
6557; GCN2-NEXT:    s_mov_b64 s[34:35], 0
6558; GCN2-NEXT:  .LBB122_1: ; %atomicrmw.start
6559; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6560; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6561; GCN2-NEXT:    v_min_i32_e32 v2, s6, v3
6562; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6563; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6564; GCN2-NEXT:    buffer_wbinvl1_vol
6565; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6566; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6567; GCN2-NEXT:    v_mov_b32_e32 v3, v2
6568; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6569; GCN2-NEXT:    s_cbranch_execnz .LBB122_1
6570; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6571; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
6572; GCN2-NEXT:    s_setpc_b64 s[30:31]
6573;
6574; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar:
6575; GCN3:       ; %bb.0:
6576; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6577; GCN3-NEXT:    v_mov_b32_e32 v0, s4
6578; GCN3-NEXT:    v_mov_b32_e32 v1, s5
6579; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
6580; GCN3-NEXT:    s_mov_b64 s[34:35], 0
6581; GCN3-NEXT:  .LBB122_1: ; %atomicrmw.start
6582; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6583; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6584; GCN3-NEXT:    v_min_i32_e32 v2, s6, v3
6585; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6586; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6587; GCN3-NEXT:    buffer_wbinvl1_vol
6588; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6589; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6590; GCN3-NEXT:    v_mov_b32_e32 v3, v2
6591; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6592; GCN3-NEXT:    s_cbranch_execnz .LBB122_1
6593; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6594; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
6595; GCN3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %gep steps 4 i32 elements (16 bytes) past %out; the atomicrmw
; result %tmp0 is never used ("noret" form).
6596  %gep = getelementptr i32, ptr %out, i32 4
6597  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
6598  ret void
6599}
6600
; amdgpu_gfx test with inreg pointer and operand (s[4:5] and s6 in the
; checks). A seq_cst signed-min atomicrmw is applied directly to %ptr and the
; previous memory value is returned.
; The expected code is identical for all three runs. It loads the initial
; value into v0, then loops on v_min_i32 plus flat_atomic_cmpswap with the
; cmpswap result written back to v0, and no extra copy is emitted between the
; loop exit and s_setpc_b64.
6601define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
6602; GCN1-LABEL: flat_atomic_min_i32_ret_scalar:
6603; GCN1:       ; %bb.0:
6604; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6605; GCN1-NEXT:    v_mov_b32_e32 v0, s4
6606; GCN1-NEXT:    v_mov_b32_e32 v1, s5
6607; GCN1-NEXT:    flat_load_dword v0, v[0:1]
6608; GCN1-NEXT:    v_mov_b32_e32 v1, s4
6609; GCN1-NEXT:    s_mov_b64 s[34:35], 0
6610; GCN1-NEXT:    v_mov_b32_e32 v2, s5
6611; GCN1-NEXT:  .LBB123_1: ; %atomicrmw.start
6612; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6613; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6614; GCN1-NEXT:    v_mov_b32_e32 v4, v0
6615; GCN1-NEXT:    v_min_i32_e32 v3, s6, v4
6616; GCN1-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6617; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6618; GCN1-NEXT:    buffer_wbinvl1_vol
6619; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
6620; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6621; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6622; GCN1-NEXT:    s_cbranch_execnz .LBB123_1
6623; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6624; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
6625; GCN1-NEXT:    s_setpc_b64 s[30:31]
6626;
6627; GCN2-LABEL: flat_atomic_min_i32_ret_scalar:
6628; GCN2:       ; %bb.0:
6629; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6630; GCN2-NEXT:    v_mov_b32_e32 v0, s4
6631; GCN2-NEXT:    v_mov_b32_e32 v1, s5
6632; GCN2-NEXT:    flat_load_dword v0, v[0:1]
6633; GCN2-NEXT:    v_mov_b32_e32 v1, s4
6634; GCN2-NEXT:    s_mov_b64 s[34:35], 0
6635; GCN2-NEXT:    v_mov_b32_e32 v2, s5
6636; GCN2-NEXT:  .LBB123_1: ; %atomicrmw.start
6637; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6638; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6639; GCN2-NEXT:    v_mov_b32_e32 v4, v0
6640; GCN2-NEXT:    v_min_i32_e32 v3, s6, v4
6641; GCN2-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6642; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6643; GCN2-NEXT:    buffer_wbinvl1_vol
6644; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
6645; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6646; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6647; GCN2-NEXT:    s_cbranch_execnz .LBB123_1
6648; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6649; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
6650; GCN2-NEXT:    s_setpc_b64 s[30:31]
6651;
6652; GCN3-LABEL: flat_atomic_min_i32_ret_scalar:
6653; GCN3:       ; %bb.0:
6654; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6655; GCN3-NEXT:    v_mov_b32_e32 v0, s4
6656; GCN3-NEXT:    v_mov_b32_e32 v1, s5
6657; GCN3-NEXT:    flat_load_dword v0, v[0:1]
6658; GCN3-NEXT:    v_mov_b32_e32 v1, s4
6659; GCN3-NEXT:    s_mov_b64 s[34:35], 0
6660; GCN3-NEXT:    v_mov_b32_e32 v2, s5
6661; GCN3-NEXT:  .LBB123_1: ; %atomicrmw.start
6662; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6663; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6664; GCN3-NEXT:    v_mov_b32_e32 v4, v0
6665; GCN3-NEXT:    v_min_i32_e32 v3, s6, v4
6666; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6667; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6668; GCN3-NEXT:    buffer_wbinvl1_vol
6669; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
6670; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6671; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6672; GCN3-NEXT:    s_cbranch_execnz .LBB123_1
6673; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6674; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
6675; GCN3-NEXT:    s_setpc_b64 s[30:31]
; %result (the value that was in memory before the update) is the return value.
6676  %result = atomicrmw min ptr %ptr, i32 %in seq_cst
6677  ret i32 %result
6678}
6679
; amdgpu_gfx test with inreg pointer and operand (s[4:5] and s6 in the
; checks). A seq_cst signed-min atomicrmw is applied four i32 elements past
; %out and the previous memory value is returned.
; All runs expect a flat_load_dword plus flat_atomic_cmpswap retry loop whose
; result lands in v0. The bonaire and tonga runs form the address with
; s_add_u32/s_addc_u32 by 16, the gfx900 run keeps the base pointer and uses
; offset 16 on both the load and the cmpswap.
6680define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
6681; GCN1-LABEL: flat_atomic_min_i32_ret_offset_scalar:
6682; GCN1:       ; %bb.0:
6683; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6684; GCN1-NEXT:    s_add_u32 s34, s4, 16
6685; GCN1-NEXT:    s_addc_u32 s35, s5, 0
6686; GCN1-NEXT:    v_mov_b32_e32 v1, s34
6687; GCN1-NEXT:    v_mov_b32_e32 v2, s35
6688; GCN1-NEXT:    flat_load_dword v0, v[1:2]
6689; GCN1-NEXT:    s_mov_b64 s[34:35], 0
6690; GCN1-NEXT:  .LBB124_1: ; %atomicrmw.start
6691; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6692; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6693; GCN1-NEXT:    v_mov_b32_e32 v4, v0
6694; GCN1-NEXT:    v_min_i32_e32 v3, s6, v4
6695; GCN1-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6696; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6697; GCN1-NEXT:    buffer_wbinvl1_vol
6698; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
6699; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6700; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6701; GCN1-NEXT:    s_cbranch_execnz .LBB124_1
6702; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6703; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
6704; GCN1-NEXT:    s_setpc_b64 s[30:31]
6705;
6706; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar:
6707; GCN2:       ; %bb.0:
6708; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6709; GCN2-NEXT:    s_add_u32 s34, s4, 16
6710; GCN2-NEXT:    s_addc_u32 s35, s5, 0
6711; GCN2-NEXT:    v_mov_b32_e32 v1, s34
6712; GCN2-NEXT:    v_mov_b32_e32 v2, s35
6713; GCN2-NEXT:    flat_load_dword v0, v[1:2]
6714; GCN2-NEXT:    s_mov_b64 s[34:35], 0
6715; GCN2-NEXT:  .LBB124_1: ; %atomicrmw.start
6716; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6717; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6718; GCN2-NEXT:    v_mov_b32_e32 v4, v0
6719; GCN2-NEXT:    v_min_i32_e32 v3, s6, v4
6720; GCN2-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6721; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6722; GCN2-NEXT:    buffer_wbinvl1_vol
6723; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
6724; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6725; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6726; GCN2-NEXT:    s_cbranch_execnz .LBB124_1
6727; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6728; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
6729; GCN2-NEXT:    s_setpc_b64 s[30:31]
6730;
6731; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar:
6732; GCN3:       ; %bb.0:
6733; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6734; GCN3-NEXT:    v_mov_b32_e32 v0, s4
6735; GCN3-NEXT:    v_mov_b32_e32 v1, s5
6736; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
6737; GCN3-NEXT:    v_mov_b32_e32 v1, s4
6738; GCN3-NEXT:    s_mov_b64 s[34:35], 0
6739; GCN3-NEXT:    v_mov_b32_e32 v2, s5
6740; GCN3-NEXT:  .LBB124_1: ; %atomicrmw.start
6741; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6742; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6743; GCN3-NEXT:    v_mov_b32_e32 v4, v0
6744; GCN3-NEXT:    v_min_i32_e32 v3, s6, v4
6745; GCN3-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
6746; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6747; GCN3-NEXT:    buffer_wbinvl1_vol
6748; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
6749; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6750; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6751; GCN3-NEXT:    s_cbranch_execnz .LBB124_1
6752; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6753; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
6754; GCN3-NEXT:    s_setpc_b64 s[30:31]
; Four i32 elements past %out is the 16-byte displacement seen in the checks.
; %result (the pre-update memory value) is the return value.
6755  %gep = getelementptr i32, ptr %out, i32 4
6756  %result = atomicrmw min ptr %gep, i32 %in seq_cst
6757  ret i32 %result
6758}
6759
; amdgpu_kernel test. A seq_cst signed-min atomicrmw is applied at
; %out + %index elements + 4 elements, and the old value is discarded.
; The checks load all arguments with one s_load_dwordx4 from s[4:5] (offset
; 0x9 on bonaire, 0x24 on tonga and gfx900), sign-extend the index with
; s_ashr_i32, scale it with s_lshl_b64 by 2 and add it to the base. The
; constant 16 bytes are added with scalar adds on bonaire and tonga and
; folded into offset 16 on gfx900. The operation itself is a
; flat_atomic_cmpswap retry loop and the kernel ends with s_endpgm.
6760define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %index) {
6761; GCN1-LABEL: atomic_min_i32_addr64_offset:
6762; GCN1:       ; %bb.0: ; %entry
6763; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6764; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6765; GCN1-NEXT:    s_ashr_i32 s5, s3, 31
6766; GCN1-NEXT:    s_mov_b32 s4, s3
6767; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6768; GCN1-NEXT:    s_add_u32 s0, s0, s4
6769; GCN1-NEXT:    s_addc_u32 s1, s1, s5
6770; GCN1-NEXT:    s_add_u32 s0, s0, 16
6771; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6772; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6773; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6774; GCN1-NEXT:    flat_load_dword v3, v[0:1]
6775; GCN1-NEXT:    s_mov_b64 s[0:1], 0
6776; GCN1-NEXT:  .LBB125_1: ; %atomicrmw.start
6777; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6778; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6779; GCN1-NEXT:    v_min_i32_e32 v2, s2, v3
6780; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6781; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6782; GCN1-NEXT:    buffer_wbinvl1_vol
6783; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6784; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6785; GCN1-NEXT:    v_mov_b32_e32 v3, v2
6786; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6787; GCN1-NEXT:    s_cbranch_execnz .LBB125_1
6788; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6789; GCN1-NEXT:    s_endpgm
6790;
6791; GCN2-LABEL: atomic_min_i32_addr64_offset:
6792; GCN2:       ; %bb.0: ; %entry
6793; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6794; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6795; GCN2-NEXT:    s_ashr_i32 s5, s3, 31
6796; GCN2-NEXT:    s_mov_b32 s4, s3
6797; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6798; GCN2-NEXT:    s_add_u32 s0, s0, s4
6799; GCN2-NEXT:    s_addc_u32 s1, s1, s5
6800; GCN2-NEXT:    s_add_u32 s0, s0, 16
6801; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6802; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6803; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6804; GCN2-NEXT:    flat_load_dword v3, v[0:1]
6805; GCN2-NEXT:    s_mov_b64 s[0:1], 0
6806; GCN2-NEXT:  .LBB125_1: ; %atomicrmw.start
6807; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6808; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6809; GCN2-NEXT:    v_min_i32_e32 v2, s2, v3
6810; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6811; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6812; GCN2-NEXT:    buffer_wbinvl1_vol
6813; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6814; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6815; GCN2-NEXT:    v_mov_b32_e32 v3, v2
6816; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6817; GCN2-NEXT:    s_cbranch_execnz .LBB125_1
6818; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6819; GCN2-NEXT:    s_endpgm
6820;
6821; GCN3-LABEL: atomic_min_i32_addr64_offset:
6822; GCN3:       ; %bb.0: ; %entry
6823; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6824; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6825; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
6826; GCN3-NEXT:    s_mov_b32 s4, s3
6827; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6828; GCN3-NEXT:    s_add_u32 s0, s0, s4
6829; GCN3-NEXT:    s_addc_u32 s1, s1, s5
6830; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6831; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6832; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
6833; GCN3-NEXT:    s_mov_b64 s[0:1], 0
6834; GCN3-NEXT:  .LBB125_1: ; %atomicrmw.start
6835; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6836; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6837; GCN3-NEXT:    v_min_i32_e32 v2, s2, v3
6838; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6839; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6840; GCN3-NEXT:    buffer_wbinvl1_vol
6841; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6842; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6843; GCN3-NEXT:    v_mov_b32_e32 v3, v2
6844; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6845; GCN3-NEXT:    s_cbranch_execnz .LBB125_1
6846; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6847; GCN3-NEXT:    s_endpgm
; The entry label below is what produces the %entry annotation on %bb.0 in
; the checks. Variable index first, then the constant 4-element offset.
6848entry:
6849  %ptr = getelementptr i32, ptr %out, i32 %index
6850  %gep = getelementptr i32, ptr %ptr, i32 4
6851  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
6852  ret void
6853}
6854
; amdgpu_kernel test. A seq_cst signed-min atomicrmw is applied at
; %out + %index elements + 4 elements, and the previous memory value is
; stored to %out2.
; The checks load the arguments with s_load_dwordx2 plus s_load_dwordx4 from
; s[4:5], build the indexed address with s_ashr_i32, s_lshl_b64 and scalar
; adds, and run a flat_atomic_cmpswap retry loop. The 16-byte constant is a
; scalar add on bonaire and tonga and offset 16 on gfx900. After the loop
; exec is restored and the result in v2 is written with flat_store_dword to
; the address taken from s[2:3].
6855define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) {
6856; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
6857; GCN1:       ; %bb.0: ; %entry
6858; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
6859; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6860; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6861; GCN1-NEXT:    s_ashr_i32 s5, s7, 31
6862; GCN1-NEXT:    s_mov_b32 s4, s7
6863; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6864; GCN1-NEXT:    s_add_u32 s0, s0, s4
6865; GCN1-NEXT:    s_addc_u32 s1, s1, s5
6866; GCN1-NEXT:    s_add_u32 s0, s0, 16
6867; GCN1-NEXT:    s_addc_u32 s1, s1, 0
6868; GCN1-NEXT:    v_mov_b32_e32 v0, s0
6869; GCN1-NEXT:    v_mov_b32_e32 v1, s1
6870; GCN1-NEXT:    flat_load_dword v2, v[0:1]
6871; GCN1-NEXT:    s_mov_b64 s[0:1], 0
6872; GCN1-NEXT:  .LBB126_1: ; %atomicrmw.start
6873; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6874; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6875; GCN1-NEXT:    v_mov_b32_e32 v3, v2
6876; GCN1-NEXT:    v_min_i32_e32 v2, s6, v3
6877; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6878; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6879; GCN1-NEXT:    buffer_wbinvl1_vol
6880; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6881; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6882; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6883; GCN1-NEXT:    s_cbranch_execnz .LBB126_1
6884; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6885; GCN1-NEXT:    s_or_b64 exec, exec, s[0:1]
6886; GCN1-NEXT:    v_mov_b32_e32 v0, s2
6887; GCN1-NEXT:    v_mov_b32_e32 v1, s3
6888; GCN1-NEXT:    flat_store_dword v[0:1], v2
6889; GCN1-NEXT:    s_endpgm
6890;
6891; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
6892; GCN2:       ; %bb.0: ; %entry
6893; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6894; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6895; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6896; GCN2-NEXT:    s_ashr_i32 s5, s7, 31
6897; GCN2-NEXT:    s_mov_b32 s4, s7
6898; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6899; GCN2-NEXT:    s_add_u32 s0, s0, s4
6900; GCN2-NEXT:    s_addc_u32 s1, s1, s5
6901; GCN2-NEXT:    s_add_u32 s0, s0, 16
6902; GCN2-NEXT:    s_addc_u32 s1, s1, 0
6903; GCN2-NEXT:    v_mov_b32_e32 v0, s0
6904; GCN2-NEXT:    v_mov_b32_e32 v1, s1
6905; GCN2-NEXT:    flat_load_dword v2, v[0:1]
6906; GCN2-NEXT:    s_mov_b64 s[0:1], 0
6907; GCN2-NEXT:  .LBB126_1: ; %atomicrmw.start
6908; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
6909; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6910; GCN2-NEXT:    v_mov_b32_e32 v3, v2
6911; GCN2-NEXT:    v_min_i32_e32 v2, s6, v3
6912; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6913; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6914; GCN2-NEXT:    buffer_wbinvl1_vol
6915; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6916; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6917; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6918; GCN2-NEXT:    s_cbranch_execnz .LBB126_1
6919; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
6920; GCN2-NEXT:    s_or_b64 exec, exec, s[0:1]
6921; GCN2-NEXT:    v_mov_b32_e32 v0, s2
6922; GCN2-NEXT:    v_mov_b32_e32 v1, s3
6923; GCN2-NEXT:    flat_store_dword v[0:1], v2
6924; GCN2-NEXT:    s_endpgm
6925;
6926; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
6927; GCN3:       ; %bb.0: ; %entry
6928; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6929; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6930; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
6931; GCN3-NEXT:    s_ashr_i32 s5, s7, 31
6932; GCN3-NEXT:    s_mov_b32 s4, s7
6933; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6934; GCN3-NEXT:    s_add_u32 s0, s0, s4
6935; GCN3-NEXT:    s_addc_u32 s1, s1, s5
6936; GCN3-NEXT:    v_mov_b32_e32 v0, s0
6937; GCN3-NEXT:    v_mov_b32_e32 v1, s1
6938; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16
6939; GCN3-NEXT:    s_mov_b64 s[0:1], 0
6940; GCN3-NEXT:  .LBB126_1: ; %atomicrmw.start
6941; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
6942; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6943; GCN3-NEXT:    v_mov_b32_e32 v3, v2
6944; GCN3-NEXT:    v_min_i32_e32 v2, s6, v3
6945; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6946; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6947; GCN3-NEXT:    buffer_wbinvl1_vol
6948; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6949; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6950; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6951; GCN3-NEXT:    s_cbranch_execnz .LBB126_1
6952; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
6953; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
6954; GCN3-NEXT:    v_mov_b32_e32 v0, s2
6955; GCN3-NEXT:    v_mov_b32_e32 v1, s3
6956; GCN3-NEXT:    flat_store_dword v[0:1], v2
6957; GCN3-NEXT:    s_endpgm
; Variable index first, then the constant 4-element offset. The plain store
; of %tmp0 is what keeps the returning form of the atomic alive.
6958entry:
6959  %ptr = getelementptr i32, ptr %out, i32 %index
6960  %gep = getelementptr i32, ptr %ptr, i32 4
6961  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
6962  store i32 %tmp0, ptr %out2
6963  ret void
6964}
6965
; amdgpu_kernel test. A seq_cst signed-min atomicrmw is applied directly to
; %out with no index and no offset, and the old value is discarded.
; The checks load the pointer with s_load_dwordx2 and the operand with
; s_load_dword from s[4:5] (offsets 0x9 and 0xb on bonaire, 0x24 and 0x2c on
; tonga and gfx900), then run the same flat_load_dword plus
; flat_atomic_cmpswap retry loop on every run and finish with s_endpgm.
6966define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
6967; GCN1-LABEL: atomic_min_i32:
6968; GCN1:       ; %bb.0: ; %entry
6969; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x9
6970; GCN1-NEXT:    s_load_dword s2, s[4:5], 0xb
6971; GCN1-NEXT:    s_mov_b64 s[0:1], 0
6972; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
6973; GCN1-NEXT:    v_mov_b32_e32 v0, s6
6974; GCN1-NEXT:    v_mov_b32_e32 v1, s7
6975; GCN1-NEXT:    flat_load_dword v3, v[0:1]
6976; GCN1-NEXT:  .LBB127_1: ; %atomicrmw.start
6977; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
6978; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6979; GCN1-NEXT:    v_min_i32_e32 v2, s2, v3
6980; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6981; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6982; GCN1-NEXT:    buffer_wbinvl1_vol
6983; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6984; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6985; GCN1-NEXT:    v_mov_b32_e32 v3, v2
6986; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6987; GCN1-NEXT:    s_cbranch_execnz .LBB127_1
6988; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
6989; GCN1-NEXT:    s_endpgm
6990;
6991; GCN2-LABEL: atomic_min_i32:
6992; GCN2:       ; %bb.0: ; %entry
6993; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
6994; GCN2-NEXT:    s_load_dword s2, s[4:5], 0x2c
6995; GCN2-NEXT:    s_mov_b64 s[0:1], 0
6996; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
6997; GCN2-NEXT:    v_mov_b32_e32 v0, s6
6998; GCN2-NEXT:    v_mov_b32_e32 v1, s7
6999; GCN2-NEXT:    flat_load_dword v3, v[0:1]
7000; GCN2-NEXT:  .LBB127_1: ; %atomicrmw.start
7001; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
7002; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7003; GCN2-NEXT:    v_min_i32_e32 v2, s2, v3
7004; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7005; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7006; GCN2-NEXT:    buffer_wbinvl1_vol
7007; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7008; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7009; GCN2-NEXT:    v_mov_b32_e32 v3, v2
7010; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7011; GCN2-NEXT:    s_cbranch_execnz .LBB127_1
7012; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
7013; GCN2-NEXT:    s_endpgm
7014;
7015; GCN3-LABEL: atomic_min_i32:
7016; GCN3:       ; %bb.0: ; %entry
7017; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
7018; GCN3-NEXT:    s_load_dword s2, s[4:5], 0x2c
7019; GCN3-NEXT:    s_mov_b64 s[0:1], 0
7020; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7021; GCN3-NEXT:    v_mov_b32_e32 v0, s6
7022; GCN3-NEXT:    v_mov_b32_e32 v1, s7
7023; GCN3-NEXT:    flat_load_dword v3, v[0:1]
7024; GCN3-NEXT:  .LBB127_1: ; %atomicrmw.start
7025; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
7026; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7027; GCN3-NEXT:    v_min_i32_e32 v2, s2, v3
7028; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7029; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7030; GCN3-NEXT:    buffer_wbinvl1_vol
7031; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7032; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7033; GCN3-NEXT:    v_mov_b32_e32 v3, v2
7034; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7035; GCN3-NEXT:    s_cbranch_execnz .LBB127_1
7036; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
7037; GCN3-NEXT:    s_endpgm
; %tmp0 is never read, so this exercises the no-return form.
7038entry:
7039  %tmp0 = atomicrmw min ptr %out, i32 %in seq_cst
7040  ret void
7041}
7042
; amdgpu_kernel test. A seq_cst signed-min atomicrmw is applied at
; %out + %index elements with no constant offset, and the previous memory
; value is stored to %out2.
; With no constant displacement the expected address computation and loop are
; the same on all three runs, only the s_load offsets differ (0xd and 0x9 on
; bonaire, 0x34 and 0x24 on tonga and gfx900). After the flat_atomic_cmpswap
; retry loop exec is restored and v2 is written with flat_store_dword.
7043define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) {
7044; GCN1-LABEL: atomic_min_i32_ret_addr64:
7045; GCN1:       ; %bb.0: ; %entry
7046; GCN1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
7047; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7048; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
7049; GCN1-NEXT:    s_ashr_i32 s5, s7, 31
7050; GCN1-NEXT:    s_mov_b32 s4, s7
7051; GCN1-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
7052; GCN1-NEXT:    s_add_u32 s0, s0, s4
7053; GCN1-NEXT:    s_addc_u32 s1, s1, s5
7054; GCN1-NEXT:    v_mov_b32_e32 v0, s0
7055; GCN1-NEXT:    v_mov_b32_e32 v1, s1
7056; GCN1-NEXT:    flat_load_dword v2, v[0:1]
7057; GCN1-NEXT:    s_mov_b64 s[0:1], 0
7058; GCN1-NEXT:  .LBB128_1: ; %atomicrmw.start
7059; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
7060; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7061; GCN1-NEXT:    v_mov_b32_e32 v3, v2
7062; GCN1-NEXT:    v_min_i32_e32 v2, s6, v3
7063; GCN1-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7064; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7065; GCN1-NEXT:    buffer_wbinvl1_vol
7066; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7067; GCN1-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7068; GCN1-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7069; GCN1-NEXT:    s_cbranch_execnz .LBB128_1
7070; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
7071; GCN1-NEXT:    s_or_b64 exec, exec, s[0:1]
7072; GCN1-NEXT:    v_mov_b32_e32 v0, s2
7073; GCN1-NEXT:    v_mov_b32_e32 v1, s3
7074; GCN1-NEXT:    flat_store_dword v[0:1], v2
7075; GCN1-NEXT:    s_endpgm
7076;
7077; GCN2-LABEL: atomic_min_i32_ret_addr64:
7078; GCN2:       ; %bb.0: ; %entry
7079; GCN2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
7080; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7081; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
7082; GCN2-NEXT:    s_ashr_i32 s5, s7, 31
7083; GCN2-NEXT:    s_mov_b32 s4, s7
7084; GCN2-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
7085; GCN2-NEXT:    s_add_u32 s0, s0, s4
7086; GCN2-NEXT:    s_addc_u32 s1, s1, s5
7087; GCN2-NEXT:    v_mov_b32_e32 v0, s0
7088; GCN2-NEXT:    v_mov_b32_e32 v1, s1
7089; GCN2-NEXT:    flat_load_dword v2, v[0:1]
7090; GCN2-NEXT:    s_mov_b64 s[0:1], 0
7091; GCN2-NEXT:  .LBB128_1: ; %atomicrmw.start
7092; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
7093; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7094; GCN2-NEXT:    v_mov_b32_e32 v3, v2
7095; GCN2-NEXT:    v_min_i32_e32 v2, s6, v3
7096; GCN2-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7097; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7098; GCN2-NEXT:    buffer_wbinvl1_vol
7099; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7100; GCN2-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7101; GCN2-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7102; GCN2-NEXT:    s_cbranch_execnz .LBB128_1
7103; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
7104; GCN2-NEXT:    s_or_b64 exec, exec, s[0:1]
7105; GCN2-NEXT:    v_mov_b32_e32 v0, s2
7106; GCN2-NEXT:    v_mov_b32_e32 v1, s3
7107; GCN2-NEXT:    flat_store_dword v[0:1], v2
7108; GCN2-NEXT:    s_endpgm
7109;
7110; GCN3-LABEL: atomic_min_i32_ret_addr64:
7111; GCN3:       ; %bb.0: ; %entry
7112; GCN3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
7113; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7114; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
7115; GCN3-NEXT:    s_ashr_i32 s5, s7, 31
7116; GCN3-NEXT:    s_mov_b32 s4, s7
7117; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
7118; GCN3-NEXT:    s_add_u32 s0, s0, s4
7119; GCN3-NEXT:    s_addc_u32 s1, s1, s5
7120; GCN3-NEXT:    v_mov_b32_e32 v0, s0
7121; GCN3-NEXT:    v_mov_b32_e32 v1, s1
7122; GCN3-NEXT:    flat_load_dword v2, v[0:1]
7123; GCN3-NEXT:    s_mov_b64 s[0:1], 0
7124; GCN3-NEXT:  .LBB128_1: ; %atomicrmw.start
7125; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
7126; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7127; GCN3-NEXT:    v_mov_b32_e32 v3, v2
7128; GCN3-NEXT:    v_min_i32_e32 v2, s6, v3
7129; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7130; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7131; GCN3-NEXT:    buffer_wbinvl1_vol
7132; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7133; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7134; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7135; GCN3-NEXT:    s_cbranch_execnz .LBB128_1
7136; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
7137; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
7138; GCN3-NEXT:    v_mov_b32_e32 v0, s2
7139; GCN3-NEXT:    v_mov_b32_e32 v1, s3
7140; GCN3-NEXT:    flat_store_dword v[0:1], v2
7141; GCN3-NEXT:    s_endpgm
; Only the variable index is applied here. The plain store of %tmp0 is what
; keeps the returning form of the atomic alive.
7142entry:
7143  %ptr = getelementptr i32, ptr %out, i32 %index
7144  %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst
7145  store i32 %tmp0, ptr %out2
7146  ret void
7147}
7148
; Default calling convention test with non-inreg arguments (the checks use
; v[0:1] for the pointer and v2 for the operand). A seq_cst signed-min
; atomicrmw tagged with !amdgpu.no.remote.memory is applied four i32 elements
; past %out and the old value is discarded.
; The expected code is still a flat_load_dword plus flat_atomic_cmpswap retry
; loop on all three runs. The bonaire and tonga runs add 16 to the address
; with a vector add/addc pair, the gfx900 run uses offset 16.
; The !0 metadata node is defined outside this part of the file.
7149define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7150; GCN1-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
7151; GCN1:       ; %bb.0:
7152; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7153; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
7154; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7155; GCN1-NEXT:    flat_load_dword v4, v[0:1]
7156; GCN1-NEXT:    s_mov_b64 s[4:5], 0
7157; GCN1-NEXT:  .LBB129_1: ; %atomicrmw.start
7158; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
7159; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7160; GCN1-NEXT:    v_min_i32_e32 v3, v4, v2
7161; GCN1-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7162; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7163; GCN1-NEXT:    buffer_wbinvl1_vol
7164; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7165; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7166; GCN1-NEXT:    v_mov_b32_e32 v4, v3
7167; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7168; GCN1-NEXT:    s_cbranch_execnz .LBB129_1
7169; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
7170; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
7171; GCN1-NEXT:    s_setpc_b64 s[30:31]
7172;
7173; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
7174; GCN2:       ; %bb.0:
7175; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7176; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7177; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7178; GCN2-NEXT:    flat_load_dword v4, v[0:1]
7179; GCN2-NEXT:    s_mov_b64 s[4:5], 0
7180; GCN2-NEXT:  .LBB129_1: ; %atomicrmw.start
7181; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
7182; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7183; GCN2-NEXT:    v_min_i32_e32 v3, v4, v2
7184; GCN2-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7185; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7186; GCN2-NEXT:    buffer_wbinvl1_vol
7187; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7188; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7189; GCN2-NEXT:    v_mov_b32_e32 v4, v3
7190; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7191; GCN2-NEXT:    s_cbranch_execnz .LBB129_1
7192; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
7193; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
7194; GCN2-NEXT:    s_setpc_b64 s[30:31]
7195;
7196; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
7197; GCN3:       ; %bb.0:
7198; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7199; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
7200; GCN3-NEXT:    s_mov_b64 s[4:5], 0
7201; GCN3-NEXT:  .LBB129_1: ; %atomicrmw.start
7202; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
7203; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7204; GCN3-NEXT:    v_min_i32_e32 v3, v4, v2
7205; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
7206; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7207; GCN3-NEXT:    buffer_wbinvl1_vol
7208; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7209; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7210; GCN3-NEXT:    v_mov_b32_e32 v4, v3
7211; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7212; GCN3-NEXT:    s_cbranch_execnz .LBB129_1
7213; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
7214; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
7215; GCN3-NEXT:    s_setpc_b64 s[30:31]
; Four i32 elements past %out is the 16-byte displacement seen in the checks
; (this test spells the gep index as i64). %tmp0 is never read.
7216  %gep = getelementptr i32, ptr %out, i64 4
7217  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7218  ret void
7219}
7220
; Default calling convention test with non-inreg arguments (the checks use
; v[0:1] for the pointer and v2 for the operand). A seq_cst signed-min
; atomicrmw tagged with !amdgpu.no.remote.memory is applied four i32 elements
; past %out and the previous memory value is returned.
; All runs expect a flat_load_dword plus flat_atomic_cmpswap retry loop.
; The bonaire and tonga runs move the offset address into v[3:4] and loop
; with the result in v0. The gfx900 run keeps v[0:1] as the base with
; offset 16, accumulates the result in v3 and copies it to v0 after the loop.
; The !0 metadata node is defined outside this part of the file.
7221define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7222; GCN1-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
7223; GCN1:       ; %bb.0:
7224; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7225; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
7226; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
7227; GCN1-NEXT:    flat_load_dword v0, v[3:4]
7228; GCN1-NEXT:    s_mov_b64 s[4:5], 0
7229; GCN1-NEXT:  .LBB130_1: ; %atomicrmw.start
7230; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
7231; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7232; GCN1-NEXT:    v_mov_b32_e32 v1, v0
7233; GCN1-NEXT:    v_min_i32_e32 v0, v1, v2
7234; GCN1-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7235; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7236; GCN1-NEXT:    buffer_wbinvl1_vol
7237; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
7238; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7239; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7240; GCN1-NEXT:    s_cbranch_execnz .LBB130_1
7241; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
7242; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
7243; GCN1-NEXT:    s_setpc_b64 s[30:31]
7244;
7245; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
7246; GCN2:       ; %bb.0:
7247; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7248; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
7249; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
7250; GCN2-NEXT:    flat_load_dword v0, v[3:4]
7251; GCN2-NEXT:    s_mov_b64 s[4:5], 0
7252; GCN2-NEXT:  .LBB130_1: ; %atomicrmw.start
7253; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
7254; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7255; GCN2-NEXT:    v_mov_b32_e32 v1, v0
7256; GCN2-NEXT:    v_min_i32_e32 v0, v1, v2
7257; GCN2-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7258; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7259; GCN2-NEXT:    buffer_wbinvl1_vol
7260; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
7261; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7262; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7263; GCN2-NEXT:    s_cbranch_execnz .LBB130_1
7264; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
7265; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
7266; GCN2-NEXT:    s_setpc_b64 s[30:31]
7267;
7268; GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
7269; GCN3:       ; %bb.0:
7270; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7271; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
7272; GCN3-NEXT:    s_mov_b64 s[4:5], 0
7273; GCN3-NEXT:  .LBB130_1: ; %atomicrmw.start
7274; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
7275; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7276; GCN3-NEXT:    v_mov_b32_e32 v4, v3
7277; GCN3-NEXT:    v_min_i32_e32 v3, v4, v2
7278; GCN3-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
7279; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7280; GCN3-NEXT:    buffer_wbinvl1_vol
7281; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7282; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7283; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7284; GCN3-NEXT:    s_cbranch_execnz .LBB130_1
7285; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
7286; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
7287; GCN3-NEXT:    v_mov_b32_e32 v0, v3
7288; GCN3-NEXT:    s_setpc_b64 s[30:31]
; Four i32 elements past %out is the 16-byte displacement seen in the checks
; (this test spells the gep index as i64). %result is the return value.
7289  %gep = getelementptr i32, ptr %out, i64 4
7290  %result = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7291  ret i32 %result
7292}
7293
7294; ---------------------------------------------------------------------
7295; atomicrmw uinc_wrap
7296; ---------------------------------------------------------------------
7297
; seq_cst `atomicrmw uinc_wrap` through a plain `ptr` with the result unused.
; All three RUN configurations (bonaire, tonga, gfx900) expect a single
; no-return `flat_atomic_inc` (no glc), followed by
; `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol`.
7298define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) {
7299; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret:
7300; GCN1:       ; %bb.0:
7301; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7302; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
7303; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7304; GCN1-NEXT:    buffer_wbinvl1_vol
7305; GCN1-NEXT:    s_setpc_b64 s[30:31]
7306;
7307; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret:
7308; GCN2:       ; %bb.0:
7309; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7310; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
7311; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7312; GCN2-NEXT:    buffer_wbinvl1_vol
7313; GCN2-NEXT:    s_setpc_b64 s[30:31]
7314;
7315; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret:
7316; GCN3:       ; %bb.0:
7317; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7318; GCN3-NEXT:    flat_atomic_inc v[0:1], v2
7319; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7320; GCN3-NEXT:    buffer_wbinvl1_vol
7321; GCN3-NEXT:    s_setpc_b64 s[30:31]
7322  %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
7323  ret void
7324}
7325
; No-return uinc_wrap through a GEP of 4 x i32 (16 bytes). The gfx900 checks
; expect the displacement folded into `offset:16`; the bonaire and tonga
; checks expect an explicit 64-bit add of 16 to v[0:1] (v_add + v_addc)
; ahead of the `flat_atomic_inc`.
7326define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) {
7327; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
7328; GCN1:       ; %bb.0:
7329; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7330; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
7331; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7332; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
7333; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7334; GCN1-NEXT:    buffer_wbinvl1_vol
7335; GCN1-NEXT:    s_setpc_b64 s[30:31]
7336;
7337; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
7338; GCN2:       ; %bb.0:
7339; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7340; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7341; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7342; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
7343; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7344; GCN2-NEXT:    buffer_wbinvl1_vol
7345; GCN2-NEXT:    s_setpc_b64 s[30:31]
7346;
7347; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
7348; GCN3:       ; %bb.0:
7349; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7350; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
7351; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7352; GCN3-NEXT:    buffer_wbinvl1_vol
7353; GCN3-NEXT:    s_setpc_b64 s[30:31]
7354  %gep = getelementptr i32, ptr %out, i32 4
7355  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
7356  ret void
7357}
7358
; Returning form of uinc_wrap: the value produced by the atomicrmw is the
; function result, so every target expects `flat_atomic_inc ... glc` with the
; destination in v0.
7359define i32 @flat_atomic_uinc_wrap_i32_ret(ptr %ptr, i32 %in) {
7360; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret:
7361; GCN1:       ; %bb.0:
7362; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7363; GCN1-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7364; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7365; GCN1-NEXT:    buffer_wbinvl1_vol
7366; GCN1-NEXT:    s_setpc_b64 s[30:31]
7367;
7368; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret:
7369; GCN2:       ; %bb.0:
7370; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7371; GCN2-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7372; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7373; GCN2-NEXT:    buffer_wbinvl1_vol
7374; GCN2-NEXT:    s_setpc_b64 s[30:31]
7375;
7376; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret:
7377; GCN3:       ; %bb.0:
7378; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7379; GCN3-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7380; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7381; GCN3-NEXT:    buffer_wbinvl1_vol
7382; GCN3-NEXT:    s_setpc_b64 s[30:31]
7383  %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
7384  ret i32 %result
7385}
7386
; Returning uinc_wrap at a 16-byte GEP offset. Expected code is the `glc`
; form of `flat_atomic_inc`; gfx900 uses `offset:16`, while bonaire and tonga
; first add 16 to the address pair v[0:1].
7387define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) {
7388; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
7389; GCN1:       ; %bb.0:
7390; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7391; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
7392; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7393; GCN1-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7394; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7395; GCN1-NEXT:    buffer_wbinvl1_vol
7396; GCN1-NEXT:    s_setpc_b64 s[30:31]
7397;
7398; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
7399; GCN2:       ; %bb.0:
7400; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7401; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7402; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7403; GCN2-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7404; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7405; GCN2-NEXT:    buffer_wbinvl1_vol
7406; GCN2-NEXT:    s_setpc_b64 s[30:31]
7407;
7408; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
7409; GCN3:       ; %bb.0:
7410; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7411; GCN3-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:16 glc
7412; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7413; GCN3-NEXT:    buffer_wbinvl1_vol
7414; GCN3-NEXT:    s_setpc_b64 s[30:31]
7415  %gep = getelementptr i32, ptr %out, i32 4
7416  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
7417  ret i32 %result
7418}
7419
; amdgpu_gfx variant with `inreg` arguments and an unused result. The checks
; expect the pointer (s4, s5) and the operand (s6) to be copied into v0-v2
; before a no-return `flat_atomic_inc`.
7420define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
7421; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
7422; GCN1:       ; %bb.0:
7423; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7424; GCN1-NEXT:    v_mov_b32_e32 v0, s4
7425; GCN1-NEXT:    v_mov_b32_e32 v1, s5
7426; GCN1-NEXT:    v_mov_b32_e32 v2, s6
7427; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
7428; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7429; GCN1-NEXT:    buffer_wbinvl1_vol
7430; GCN1-NEXT:    s_setpc_b64 s[30:31]
7431;
7432; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
7433; GCN2:       ; %bb.0:
7434; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7435; GCN2-NEXT:    v_mov_b32_e32 v0, s4
7436; GCN2-NEXT:    v_mov_b32_e32 v1, s5
7437; GCN2-NEXT:    v_mov_b32_e32 v2, s6
7438; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
7439; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7440; GCN2-NEXT:    buffer_wbinvl1_vol
7441; GCN2-NEXT:    s_setpc_b64 s[30:31]
7442;
7443; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
7444; GCN3:       ; %bb.0:
7445; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7446; GCN3-NEXT:    v_mov_b32_e32 v0, s4
7447; GCN3-NEXT:    v_mov_b32_e32 v1, s5
7448; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7449; GCN3-NEXT:    flat_atomic_inc v[0:1], v2
7450; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7451; GCN3-NEXT:    buffer_wbinvl1_vol
7452; GCN3-NEXT:    s_setpc_b64 s[30:31]
7453  %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
7454  ret void
7455}
7456
; amdgpu_gfx/`inreg` no-return uinc_wrap at a 16-byte GEP offset. On bonaire
; and tonga the checks expect the add on the scalar side
; (s_add_u32/s_addc_u32 into s34/s35) before the copy to v0/v1; on gfx900
; s4/s5 are copied unchanged and the instruction carries `offset:16`.
7457define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
7458; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
7459; GCN1:       ; %bb.0:
7460; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7461; GCN1-NEXT:    s_add_u32 s34, s4, 16
7462; GCN1-NEXT:    s_addc_u32 s35, s5, 0
7463; GCN1-NEXT:    v_mov_b32_e32 v0, s34
7464; GCN1-NEXT:    v_mov_b32_e32 v1, s35
7465; GCN1-NEXT:    v_mov_b32_e32 v2, s6
7466; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
7467; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7468; GCN1-NEXT:    buffer_wbinvl1_vol
7469; GCN1-NEXT:    s_setpc_b64 s[30:31]
7470;
7471; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
7472; GCN2:       ; %bb.0:
7473; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7474; GCN2-NEXT:    s_add_u32 s34, s4, 16
7475; GCN2-NEXT:    s_addc_u32 s35, s5, 0
7476; GCN2-NEXT:    v_mov_b32_e32 v0, s34
7477; GCN2-NEXT:    v_mov_b32_e32 v1, s35
7478; GCN2-NEXT:    v_mov_b32_e32 v2, s6
7479; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
7480; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7481; GCN2-NEXT:    buffer_wbinvl1_vol
7482; GCN2-NEXT:    s_setpc_b64 s[30:31]
7483;
7484; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
7485; GCN3:       ; %bb.0:
7486; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7487; GCN3-NEXT:    v_mov_b32_e32 v0, s4
7488; GCN3-NEXT:    v_mov_b32_e32 v1, s5
7489; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7490; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
7491; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7492; GCN3-NEXT:    buffer_wbinvl1_vol
7493; GCN3-NEXT:    s_setpc_b64 s[30:31]
7494  %gep = getelementptr i32, ptr %out, i32 4
7495  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
7496  ret void
7497}
7498
; amdgpu_gfx/`inreg` returning uinc_wrap. The checks expect s4-s6 to be copied
; into v0-v2 and then `flat_atomic_inc ... glc` with the result written to v0.
7499define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
7500; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
7501; GCN1:       ; %bb.0:
7502; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7503; GCN1-NEXT:    v_mov_b32_e32 v0, s4
7504; GCN1-NEXT:    v_mov_b32_e32 v1, s5
7505; GCN1-NEXT:    v_mov_b32_e32 v2, s6
7506; GCN1-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7507; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7508; GCN1-NEXT:    buffer_wbinvl1_vol
7509; GCN1-NEXT:    s_setpc_b64 s[30:31]
7510;
7511; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
7512; GCN2:       ; %bb.0:
7513; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7514; GCN2-NEXT:    v_mov_b32_e32 v0, s4
7515; GCN2-NEXT:    v_mov_b32_e32 v1, s5
7516; GCN2-NEXT:    v_mov_b32_e32 v2, s6
7517; GCN2-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7518; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7519; GCN2-NEXT:    buffer_wbinvl1_vol
7520; GCN2-NEXT:    s_setpc_b64 s[30:31]
7521;
7522; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
7523; GCN3:       ; %bb.0:
7524; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7525; GCN3-NEXT:    v_mov_b32_e32 v0, s4
7526; GCN3-NEXT:    v_mov_b32_e32 v1, s5
7527; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7528; GCN3-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7529; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7530; GCN3-NEXT:    buffer_wbinvl1_vol
7531; GCN3-NEXT:    s_setpc_b64 s[30:31]
7532  %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
7533  ret i32 %result
7534}
7535
; amdgpu_gfx/`inreg` returning uinc_wrap at a 16-byte GEP offset. bonaire and
; tonga expect a scalar 64-bit add into s34/s35 before the copy to VGPRs;
; gfx900 expects `offset:16` on the instruction. All use the `glc` form.
7536define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
7537; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
7538; GCN1:       ; %bb.0:
7539; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7540; GCN1-NEXT:    s_add_u32 s34, s4, 16
7541; GCN1-NEXT:    s_addc_u32 s35, s5, 0
7542; GCN1-NEXT:    v_mov_b32_e32 v0, s34
7543; GCN1-NEXT:    v_mov_b32_e32 v1, s35
7544; GCN1-NEXT:    v_mov_b32_e32 v2, s6
7545; GCN1-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7546; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7547; GCN1-NEXT:    buffer_wbinvl1_vol
7548; GCN1-NEXT:    s_setpc_b64 s[30:31]
7549;
7550; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
7551; GCN2:       ; %bb.0:
7552; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7553; GCN2-NEXT:    s_add_u32 s34, s4, 16
7554; GCN2-NEXT:    s_addc_u32 s35, s5, 0
7555; GCN2-NEXT:    v_mov_b32_e32 v0, s34
7556; GCN2-NEXT:    v_mov_b32_e32 v1, s35
7557; GCN2-NEXT:    v_mov_b32_e32 v2, s6
7558; GCN2-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7559; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7560; GCN2-NEXT:    buffer_wbinvl1_vol
7561; GCN2-NEXT:    s_setpc_b64 s[30:31]
7562;
7563; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
7564; GCN3:       ; %bb.0:
7565; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7566; GCN3-NEXT:    v_mov_b32_e32 v0, s4
7567; GCN3-NEXT:    v_mov_b32_e32 v1, s5
7568; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7569; GCN3-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:16 glc
7570; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7571; GCN3-NEXT:    buffer_wbinvl1_vol
7572; GCN3-NEXT:    s_setpc_b64 s[30:31]
7573  %gep = getelementptr i32, ptr %out, i32 4
7574  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
7575  ret i32 %result
7576}
7577
; No-return uinc_wrap at a 16-byte offset with `!amdgpu.no.remote.memory`
; attached to the atomicrmw. The expected output on every target is still a
; single `flat_atomic_inc` (no compare-and-swap loop appears in the checks).
7578define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7579; GCN1-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7580; GCN1:       ; %bb.0:
7581; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7582; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
7583; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7584; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
7585; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7586; GCN1-NEXT:    buffer_wbinvl1_vol
7587; GCN1-NEXT:    s_setpc_b64 s[30:31]
7588;
7589; GCN2-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7590; GCN2:       ; %bb.0:
7591; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7592; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7593; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7594; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
7595; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7596; GCN2-NEXT:    buffer_wbinvl1_vol
7597; GCN2-NEXT:    s_setpc_b64 s[30:31]
7598;
7599; GCN3-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7600; GCN3:       ; %bb.0:
7601; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7602; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
7603; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7604; GCN3-NEXT:    buffer_wbinvl1_vol
7605; GCN3-NEXT:    s_setpc_b64 s[30:31]
7606  %gep = getelementptr i32, ptr %out, i64 4
7607  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7608  ret void
7609}
7610
; Returning uinc_wrap at a 16-byte offset with `!amdgpu.no.remote.memory`
; attached. Every target is expected to emit the `glc` form of
; `flat_atomic_inc` directly, with the result in v0.
7611define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7612; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7613; GCN1:       ; %bb.0:
7614; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7615; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
7616; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7617; GCN1-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7618; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7619; GCN1-NEXT:    buffer_wbinvl1_vol
7620; GCN1-NEXT:    s_setpc_b64 s[30:31]
7621;
7622; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7623; GCN2:       ; %bb.0:
7624; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7625; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7626; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7627; GCN2-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
7628; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7629; GCN2-NEXT:    buffer_wbinvl1_vol
7630; GCN2-NEXT:    s_setpc_b64 s[30:31]
7631;
7632; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7633; GCN3:       ; %bb.0:
7634; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7635; GCN3-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:16 glc
7636; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7637; GCN3-NEXT:    buffer_wbinvl1_vol
7638; GCN3-NEXT:    s_setpc_b64 s[30:31]
7639  %gep = getelementptr i32, ptr %out, i64 4
7640  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7641  ret i32 %result
7642}
7643
7644; ---------------------------------------------------------------------
7645; atomicrmw udec_wrap
7646; ---------------------------------------------------------------------
7647
; seq_cst `atomicrmw udec_wrap` through a plain `ptr` with the result unused.
; All three RUN configurations (bonaire, tonga, gfx900) expect a single
; no-return `flat_atomic_dec` (no glc), followed by
; `s_waitcnt vmcnt(0) lgkmcnt(0)` and `buffer_wbinvl1_vol`.
7648define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
7649; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret:
7650; GCN1:       ; %bb.0:
7651; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7652; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7653; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7654; GCN1-NEXT:    buffer_wbinvl1_vol
7655; GCN1-NEXT:    s_setpc_b64 s[30:31]
7656;
7657; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret:
7658; GCN2:       ; %bb.0:
7659; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7660; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7661; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7662; GCN2-NEXT:    buffer_wbinvl1_vol
7663; GCN2-NEXT:    s_setpc_b64 s[30:31]
7664;
7665; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret:
7666; GCN3:       ; %bb.0:
7667; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7668; GCN3-NEXT:    flat_atomic_dec v[0:1], v2
7669; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7670; GCN3-NEXT:    buffer_wbinvl1_vol
7671; GCN3-NEXT:    s_setpc_b64 s[30:31]
7672  %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
7673  ret void
7674}
7675
; No-return udec_wrap through a GEP of 4 x i32 (16 bytes). The gfx900 checks
; expect the displacement folded into `offset:16`; the bonaire and tonga
; checks expect an explicit 64-bit add of 16 to v[0:1] (v_add + v_addc)
; ahead of the `flat_atomic_dec`.
7676define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
7677; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
7678; GCN1:       ; %bb.0:
7679; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7680; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
7681; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7682; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7683; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7684; GCN1-NEXT:    buffer_wbinvl1_vol
7685; GCN1-NEXT:    s_setpc_b64 s[30:31]
7686;
7687; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
7688; GCN2:       ; %bb.0:
7689; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7690; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7691; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7692; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7693; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7694; GCN2-NEXT:    buffer_wbinvl1_vol
7695; GCN2-NEXT:    s_setpc_b64 s[30:31]
7696;
7697; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
7698; GCN3:       ; %bb.0:
7699; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7700; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:16
7701; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7702; GCN3-NEXT:    buffer_wbinvl1_vol
7703; GCN3-NEXT:    s_setpc_b64 s[30:31]
7704  %gep = getelementptr i32, ptr %out, i32 4
7705  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
7706  ret void
7707}
7708
; Returning form of udec_wrap: the value produced by the atomicrmw is the
; function result, so every target expects `flat_atomic_dec ... glc` with the
; destination in v0.
7709define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
7710; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret:
7711; GCN1:       ; %bb.0:
7712; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7713; GCN1-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7714; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7715; GCN1-NEXT:    buffer_wbinvl1_vol
7716; GCN1-NEXT:    s_setpc_b64 s[30:31]
7717;
7718; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret:
7719; GCN2:       ; %bb.0:
7720; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7721; GCN2-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7722; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7723; GCN2-NEXT:    buffer_wbinvl1_vol
7724; GCN2-NEXT:    s_setpc_b64 s[30:31]
7725;
7726; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret:
7727; GCN3:       ; %bb.0:
7728; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7729; GCN3-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7730; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7731; GCN3-NEXT:    buffer_wbinvl1_vol
7732; GCN3-NEXT:    s_setpc_b64 s[30:31]
7733  %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
7734  ret i32 %result
7735}
7736
; Returning udec_wrap at a 16-byte GEP offset. Expected code is the `glc`
; form of `flat_atomic_dec`; gfx900 uses `offset:16`, while bonaire and tonga
; first add 16 to the address pair v[0:1].
7737define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
7738; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
7739; GCN1:       ; %bb.0:
7740; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7741; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
7742; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7743; GCN1-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7744; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7745; GCN1-NEXT:    buffer_wbinvl1_vol
7746; GCN1-NEXT:    s_setpc_b64 s[30:31]
7747;
7748; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
7749; GCN2:       ; %bb.0:
7750; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7751; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7752; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7753; GCN2-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7754; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7755; GCN2-NEXT:    buffer_wbinvl1_vol
7756; GCN2-NEXT:    s_setpc_b64 s[30:31]
7757;
7758; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
7759; GCN3:       ; %bb.0:
7760; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7761; GCN3-NEXT:    flat_atomic_dec v0, v[0:1], v2 offset:16 glc
7762; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7763; GCN3-NEXT:    buffer_wbinvl1_vol
7764; GCN3-NEXT:    s_setpc_b64 s[30:31]
7765  %gep = getelementptr i32, ptr %out, i32 4
7766  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
7767  ret i32 %result
7768}
7769
; amdgpu_gfx variant with `inreg` arguments and an unused result. The checks
; expect the pointer (s4, s5) and the operand (s6) to be copied into v0-v2
; before a no-return `flat_atomic_dec`.
7770define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
7771; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
7772; GCN1:       ; %bb.0:
7773; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7774; GCN1-NEXT:    v_mov_b32_e32 v0, s4
7775; GCN1-NEXT:    v_mov_b32_e32 v1, s5
7776; GCN1-NEXT:    v_mov_b32_e32 v2, s6
7777; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7778; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7779; GCN1-NEXT:    buffer_wbinvl1_vol
7780; GCN1-NEXT:    s_setpc_b64 s[30:31]
7781;
7782; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
7783; GCN2:       ; %bb.0:
7784; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7785; GCN2-NEXT:    v_mov_b32_e32 v0, s4
7786; GCN2-NEXT:    v_mov_b32_e32 v1, s5
7787; GCN2-NEXT:    v_mov_b32_e32 v2, s6
7788; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7789; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7790; GCN2-NEXT:    buffer_wbinvl1_vol
7791; GCN2-NEXT:    s_setpc_b64 s[30:31]
7792;
7793; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
7794; GCN3:       ; %bb.0:
7795; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7796; GCN3-NEXT:    v_mov_b32_e32 v0, s4
7797; GCN3-NEXT:    v_mov_b32_e32 v1, s5
7798; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7799; GCN3-NEXT:    flat_atomic_dec v[0:1], v2
7800; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7801; GCN3-NEXT:    buffer_wbinvl1_vol
7802; GCN3-NEXT:    s_setpc_b64 s[30:31]
7803  %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
7804  ret void
7805}
7806
; amdgpu_gfx/`inreg` no-return udec_wrap at a 16-byte GEP offset. On bonaire
; and tonga the checks expect the add on the scalar side
; (s_add_u32/s_addc_u32 into s34/s35) before the copy to v0/v1; on gfx900
; s4/s5 are copied unchanged and the instruction carries `offset:16`.
7807define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
7808; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
7809; GCN1:       ; %bb.0:
7810; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7811; GCN1-NEXT:    s_add_u32 s34, s4, 16
7812; GCN1-NEXT:    s_addc_u32 s35, s5, 0
7813; GCN1-NEXT:    v_mov_b32_e32 v0, s34
7814; GCN1-NEXT:    v_mov_b32_e32 v1, s35
7815; GCN1-NEXT:    v_mov_b32_e32 v2, s6
7816; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7817; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7818; GCN1-NEXT:    buffer_wbinvl1_vol
7819; GCN1-NEXT:    s_setpc_b64 s[30:31]
7820;
7821; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
7822; GCN2:       ; %bb.0:
7823; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7824; GCN2-NEXT:    s_add_u32 s34, s4, 16
7825; GCN2-NEXT:    s_addc_u32 s35, s5, 0
7826; GCN2-NEXT:    v_mov_b32_e32 v0, s34
7827; GCN2-NEXT:    v_mov_b32_e32 v1, s35
7828; GCN2-NEXT:    v_mov_b32_e32 v2, s6
7829; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7830; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7831; GCN2-NEXT:    buffer_wbinvl1_vol
7832; GCN2-NEXT:    s_setpc_b64 s[30:31]
7833;
7834; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
7835; GCN3:       ; %bb.0:
7836; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7837; GCN3-NEXT:    v_mov_b32_e32 v0, s4
7838; GCN3-NEXT:    v_mov_b32_e32 v1, s5
7839; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7840; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:16
7841; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7842; GCN3-NEXT:    buffer_wbinvl1_vol
7843; GCN3-NEXT:    s_setpc_b64 s[30:31]
7844  %gep = getelementptr i32, ptr %out, i32 4
7845  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
7846  ret void
7847}
7848
; amdgpu_gfx/`inreg` returning udec_wrap. The checks expect s4-s6 to be copied
; into v0-v2 and then `flat_atomic_dec ... glc` with the result written to v0.
7849define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
7850; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
7851; GCN1:       ; %bb.0:
7852; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7853; GCN1-NEXT:    v_mov_b32_e32 v0, s4
7854; GCN1-NEXT:    v_mov_b32_e32 v1, s5
7855; GCN1-NEXT:    v_mov_b32_e32 v2, s6
7856; GCN1-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7857; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7858; GCN1-NEXT:    buffer_wbinvl1_vol
7859; GCN1-NEXT:    s_setpc_b64 s[30:31]
7860;
7861; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
7862; GCN2:       ; %bb.0:
7863; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7864; GCN2-NEXT:    v_mov_b32_e32 v0, s4
7865; GCN2-NEXT:    v_mov_b32_e32 v1, s5
7866; GCN2-NEXT:    v_mov_b32_e32 v2, s6
7867; GCN2-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7868; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7869; GCN2-NEXT:    buffer_wbinvl1_vol
7870; GCN2-NEXT:    s_setpc_b64 s[30:31]
7871;
7872; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
7873; GCN3:       ; %bb.0:
7874; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7875; GCN3-NEXT:    v_mov_b32_e32 v0, s4
7876; GCN3-NEXT:    v_mov_b32_e32 v1, s5
7877; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7878; GCN3-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7879; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7880; GCN3-NEXT:    buffer_wbinvl1_vol
7881; GCN3-NEXT:    s_setpc_b64 s[30:31]
7882  %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
7883  ret i32 %result
7884}
7885
; amdgpu_gfx/`inreg` returning udec_wrap at a 16-byte GEP offset. bonaire and
; tonga expect a scalar 64-bit add into s34/s35 before the copy to VGPRs;
; gfx900 expects `offset:16` on the instruction. All use the `glc` form.
7886define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
7887; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
7888; GCN1:       ; %bb.0:
7889; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7890; GCN1-NEXT:    s_add_u32 s34, s4, 16
7891; GCN1-NEXT:    s_addc_u32 s35, s5, 0
7892; GCN1-NEXT:    v_mov_b32_e32 v0, s34
7893; GCN1-NEXT:    v_mov_b32_e32 v1, s35
7894; GCN1-NEXT:    v_mov_b32_e32 v2, s6
7895; GCN1-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7896; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7897; GCN1-NEXT:    buffer_wbinvl1_vol
7898; GCN1-NEXT:    s_setpc_b64 s[30:31]
7899;
7900; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
7901; GCN2:       ; %bb.0:
7902; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7903; GCN2-NEXT:    s_add_u32 s34, s4, 16
7904; GCN2-NEXT:    s_addc_u32 s35, s5, 0
7905; GCN2-NEXT:    v_mov_b32_e32 v0, s34
7906; GCN2-NEXT:    v_mov_b32_e32 v1, s35
7907; GCN2-NEXT:    v_mov_b32_e32 v2, s6
7908; GCN2-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7909; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7910; GCN2-NEXT:    buffer_wbinvl1_vol
7911; GCN2-NEXT:    s_setpc_b64 s[30:31]
7912;
7913; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
7914; GCN3:       ; %bb.0:
7915; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7916; GCN3-NEXT:    v_mov_b32_e32 v0, s4
7917; GCN3-NEXT:    v_mov_b32_e32 v1, s5
7918; GCN3-NEXT:    v_mov_b32_e32 v2, s6
7919; GCN3-NEXT:    flat_atomic_dec v0, v[0:1], v2 offset:16 glc
7920; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7921; GCN3-NEXT:    buffer_wbinvl1_vol
7922; GCN3-NEXT:    s_setpc_b64 s[30:31]
7923  %gep = getelementptr i32, ptr %out, i32 4
7924  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
7925  ret i32 %result
7926}
7927
; No-return udec_wrap at a 16-byte offset with `!amdgpu.no.remote.memory`
; attached to the atomicrmw. The expected output on every target is still a
; single `flat_atomic_dec` (no compare-and-swap loop appears in the checks).
7928define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7929; GCN1-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7930; GCN1:       ; %bb.0:
7931; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7932; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
7933; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7934; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
7935; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7936; GCN1-NEXT:    buffer_wbinvl1_vol
7937; GCN1-NEXT:    s_setpc_b64 s[30:31]
7938;
7939; GCN2-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7940; GCN2:       ; %bb.0:
7941; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7942; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7943; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7944; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
7945; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7946; GCN2-NEXT:    buffer_wbinvl1_vol
7947; GCN2-NEXT:    s_setpc_b64 s[30:31]
7948;
7949; GCN3-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7950; GCN3:       ; %bb.0:
7951; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7952; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:16
7953; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7954; GCN3-NEXT:    buffer_wbinvl1_vol
7955; GCN3-NEXT:    s_setpc_b64 s[30:31]
7956  %gep = getelementptr i32, ptr %out, i64 4
7957  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7958  ret void
7959}
7960
; Returning udec_wrap at a 16-byte offset with `!amdgpu.no.remote.memory`
; attached. Every target is expected to emit the `glc` form of
; `flat_atomic_dec` directly, with the result in v0.
7961define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7962; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7963; GCN1:       ; %bb.0:
7964; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7965; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
7966; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7967; GCN1-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7968; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7969; GCN1-NEXT:    buffer_wbinvl1_vol
7970; GCN1-NEXT:    s_setpc_b64 s[30:31]
7971;
7972; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7973; GCN2:       ; %bb.0:
7974; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7975; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7976; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7977; GCN2-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
7978; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7979; GCN2-NEXT:    buffer_wbinvl1_vol
7980; GCN2-NEXT:    s_setpc_b64 s[30:31]
7981;
7982; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7983; GCN3:       ; %bb.0:
7984; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7985; GCN3-NEXT:    flat_atomic_dec v0, v[0:1], v2 offset:16 glc
7986; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7987; GCN3-NEXT:    buffer_wbinvl1_vol
7988; GCN3-NEXT:    s_setpc_b64 s[30:31]
7989  %gep = getelementptr i32, ptr %out, i64 4
7990  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7991  ret i32 %result
7992}
7993
; Empty metadata node; it is the operand of the `!amdgpu.no.remote.memory`
; attachments on the atomicrmw instructions in this file.
7994!0 = !{}
7995