xref: /llvm-project/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (revision eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5
6; ---------------------------------------------------------------------
7; atomicrmw xchg
8; ---------------------------------------------------------------------
9
; Non-returning seq_cst atomicrmw xchg of an i32 through a global
; (addrspace(1)) pointer with no offset; %tmp0 is never used. Per the checks
; below, the pointer arrives in v[0:1] and the value in v2, and every target
; follows the swap with s_waitcnt vmcnt(0) and a buffer_wbinvl1 or
; buffer_wbinvl1_vol.
10define void @global_atomic_xchg_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
11; SI-LABEL: global_atomic_xchg_i32_noret:
12; SI:       ; %bb.0:
13; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14; SI-NEXT:    s_mov_b32 s6, 0
15; SI-NEXT:    s_mov_b32 s7, 0xf000
16; SI-NEXT:    s_mov_b32 s4, s6
17; SI-NEXT:    s_mov_b32 s5, s6
18; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64
19; SI-NEXT:    s_waitcnt vmcnt(0)
20; SI-NEXT:    buffer_wbinvl1
21; SI-NEXT:    s_waitcnt expcnt(0)
22; SI-NEXT:    s_setpc_b64 s[30:31]
23;
24; VI-LABEL: global_atomic_xchg_i32_noret:
25; VI:       ; %bb.0:
26; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; VI-NEXT:    flat_atomic_swap v[0:1], v2
28; VI-NEXT:    s_waitcnt vmcnt(0)
29; VI-NEXT:    buffer_wbinvl1_vol
30; VI-NEXT:    s_setpc_b64 s[30:31]
31;
32; GFX9-LABEL: global_atomic_xchg_i32_noret:
33; GFX9:       ; %bb.0:
34; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off
36; GFX9-NEXT:    s_waitcnt vmcnt(0)
37; GFX9-NEXT:    buffer_wbinvl1_vol
38; GFX9-NEXT:    s_setpc_b64 s[30:31]
39  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
40  ret void
41}
42
; Non-returning seq_cst i32 xchg through a pointer advanced by 4 i32
; elements. The checks expect the resulting 16 to appear as offset 16 on the
; swap instruction for SI and GFX9, while VI adds 16 to the 64-bit address
; with v_add_u32 / v_addc_u32 before flat_atomic_swap.
43define void @global_atomic_xchg_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
44; SI-LABEL: global_atomic_xchg_i32_noret_offset:
45; SI:       ; %bb.0:
46; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47; SI-NEXT:    s_mov_b32 s6, 0
48; SI-NEXT:    s_mov_b32 s7, 0xf000
49; SI-NEXT:    s_mov_b32 s4, s6
50; SI-NEXT:    s_mov_b32 s5, s6
51; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
52; SI-NEXT:    s_waitcnt vmcnt(0)
53; SI-NEXT:    buffer_wbinvl1
54; SI-NEXT:    s_waitcnt expcnt(0)
55; SI-NEXT:    s_setpc_b64 s[30:31]
56;
57; VI-LABEL: global_atomic_xchg_i32_noret_offset:
58; VI:       ; %bb.0:
59; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
61; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
62; VI-NEXT:    flat_atomic_swap v[0:1], v2
63; VI-NEXT:    s_waitcnt vmcnt(0)
64; VI-NEXT:    buffer_wbinvl1_vol
65; VI-NEXT:    s_setpc_b64 s[30:31]
66;
67; GFX9-LABEL: global_atomic_xchg_i32_noret_offset:
68; GFX9:       ; %bb.0:
69; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
71; GFX9-NEXT:    s_waitcnt vmcnt(0)
72; GFX9-NEXT:    buffer_wbinvl1_vol
73; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 x i32 elements = 16 bytes past %out.
74  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
75  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
76  ret void
77}
78
; Returning seq_cst i32 xchg with no offset; the atomicrmw result is the
; function's return value. The expected swap instructions carry the glc
; modifier. On SI the result lands in v2 and is copied to v0, whereas VI and
; GFX9 write v0 directly.
79define i32 @global_atomic_xchg_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
80; SI-LABEL: global_atomic_xchg_i32_ret:
81; SI:       ; %bb.0:
82; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; SI-NEXT:    s_mov_b32 s6, 0
84; SI-NEXT:    s_mov_b32 s7, 0xf000
85; SI-NEXT:    s_mov_b32 s4, s6
86; SI-NEXT:    s_mov_b32 s5, s6
87; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
88; SI-NEXT:    s_waitcnt vmcnt(0)
89; SI-NEXT:    buffer_wbinvl1
90; SI-NEXT:    v_mov_b32_e32 v0, v2
91; SI-NEXT:    s_waitcnt expcnt(0)
92; SI-NEXT:    s_setpc_b64 s[30:31]
93;
94; VI-LABEL: global_atomic_xchg_i32_ret:
95; VI:       ; %bb.0:
96; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
98; VI-NEXT:    s_waitcnt vmcnt(0)
99; VI-NEXT:    buffer_wbinvl1_vol
100; VI-NEXT:    s_setpc_b64 s[30:31]
101;
102; GFX9-LABEL: global_atomic_xchg_i32_ret:
103; GFX9:       ; %bb.0:
104; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off glc
106; GFX9-NEXT:    s_waitcnt vmcnt(0)
107; GFX9-NEXT:    buffer_wbinvl1_vol
108; GFX9-NEXT:    s_setpc_b64 s[30:31]
109  %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
110  ret i32 %result
111}
112
; Returning seq_cst i32 xchg through a pointer advanced by 4 i32 elements;
; the atomicrmw result is returned. The checks expect offset 16 together
; with glc on the SI and GFX9 swap instructions, and an explicit 64-bit add
; of 16 ahead of flat_atomic_swap on VI.
113define i32 @global_atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
114; SI-LABEL: global_atomic_xchg_i32_ret_offset:
115; SI:       ; %bb.0:
116; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117; SI-NEXT:    s_mov_b32 s6, 0
118; SI-NEXT:    s_mov_b32 s7, 0xf000
119; SI-NEXT:    s_mov_b32 s4, s6
120; SI-NEXT:    s_mov_b32 s5, s6
121; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
122; SI-NEXT:    s_waitcnt vmcnt(0)
123; SI-NEXT:    buffer_wbinvl1
124; SI-NEXT:    v_mov_b32_e32 v0, v2
125; SI-NEXT:    s_waitcnt expcnt(0)
126; SI-NEXT:    s_setpc_b64 s[30:31]
127;
128; VI-LABEL: global_atomic_xchg_i32_ret_offset:
129; VI:       ; %bb.0:
130; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
131; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
132; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
133; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
134; VI-NEXT:    s_waitcnt vmcnt(0)
135; VI-NEXT:    buffer_wbinvl1_vol
136; VI-NEXT:    s_setpc_b64 s[30:31]
137;
138; GFX9-LABEL: global_atomic_xchg_i32_ret_offset:
139; GFX9:       ; %bb.0:
140; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
142; GFX9-NEXT:    s_waitcnt vmcnt(0)
143; GFX9-NEXT:    buffer_wbinvl1_vol
144; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 x i32 elements = 16 bytes past %out.
145  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
146  %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
147  ret i32 %result
148}
149
; Non-returning seq_cst i32 xchg in an amdgpu_gfx function whose pointer and
; value are both inreg; the checks read them from s[4:5] and s6.
; Expected SI code overwrites s6 and s7 to fill s[4:7], the register quad
; handed to buffer_atomic_swap. It therefore first parks the incoming s6/s7
; in lanes 0 and 1 of v1 (v1 itself is stored to the stack inside an
; s_xor_saveexec_b64 / s_mov_b64 exec bracket) and restores them before
; returning.
; Expected VI code copies s4, s5 and s6 into v0, v1 and v2 for
; flat_atomic_swap. Expected GFX9 code keeps s[4:5] as the base of
; global_atomic_swap, with a zero in v0 and the value in v1.
150define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
151; SI-LABEL: global_atomic_xchg_i32_noret_scalar:
152; SI:       ; %bb.0:
153; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
155; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
156; SI-NEXT:    s_mov_b64 exec, s[34:35]
157; SI-NEXT:    s_waitcnt expcnt(0)
158; SI-NEXT:    v_writelane_b32 v1, s6, 0
159; SI-NEXT:    v_writelane_b32 v1, s7, 1
160; SI-NEXT:    s_mov_b32 s34, s6
161; SI-NEXT:    s_mov_b32 s7, 0xf000
162; SI-NEXT:    s_mov_b32 s6, -1
163; SI-NEXT:    v_mov_b32_e32 v0, s34
164; SI-NEXT:    s_waitcnt vmcnt(0)
165; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
166; SI-NEXT:    s_waitcnt vmcnt(0)
167; SI-NEXT:    buffer_wbinvl1
168; SI-NEXT:    v_readlane_b32 s7, v1, 1
169; SI-NEXT:    v_readlane_b32 s6, v1, 0
170; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
171; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
172; SI-NEXT:    s_mov_b64 exec, s[34:35]
173; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
174; SI-NEXT:    s_setpc_b64 s[30:31]
175;
176; VI-LABEL: global_atomic_xchg_i32_noret_scalar:
177; VI:       ; %bb.0:
178; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179; VI-NEXT:    v_mov_b32_e32 v0, s4
180; VI-NEXT:    v_mov_b32_e32 v1, s5
181; VI-NEXT:    v_mov_b32_e32 v2, s6
182; VI-NEXT:    flat_atomic_swap v[0:1], v2
183; VI-NEXT:    s_waitcnt vmcnt(0)
184; VI-NEXT:    buffer_wbinvl1_vol
185; VI-NEXT:    s_setpc_b64 s[30:31]
186;
187; GFX9-LABEL: global_atomic_xchg_i32_noret_scalar:
188; GFX9:       ; %bb.0:
189; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; GFX9-NEXT:    v_mov_b32_e32 v0, 0
191; GFX9-NEXT:    v_mov_b32_e32 v1, s6
192; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5]
193; GFX9-NEXT:    s_waitcnt vmcnt(0)
194; GFX9-NEXT:    buffer_wbinvl1_vol
195; GFX9-NEXT:    s_setpc_b64 s[30:31]
196  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
197  ret void
198}
199
; Non-returning seq_cst i32 xchg in an amdgpu_gfx function with inreg
; pointer and value (read from s[4:5] and s6 in the checks), through a
; pointer advanced by 4 i32 elements.
; The checks expect offset 16 on the SI and GFX9 swap instructions. VI
; instead forms the address with s_add_u32 / s_addc_u32 into s[34:35] and
; copies it to v[0:1].
; Expected SI code also saves the incoming s6/s7 in lanes of v1 and restores
; them, because it rewrites both registers to fill s[4:7] for
; buffer_atomic_swap.
200define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
201; SI-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
202; SI:       ; %bb.0:
203; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
205; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
206; SI-NEXT:    s_mov_b64 exec, s[34:35]
207; SI-NEXT:    s_waitcnt expcnt(0)
208; SI-NEXT:    v_writelane_b32 v1, s6, 0
209; SI-NEXT:    v_writelane_b32 v1, s7, 1
210; SI-NEXT:    s_mov_b32 s34, s6
211; SI-NEXT:    s_mov_b32 s7, 0xf000
212; SI-NEXT:    s_mov_b32 s6, -1
213; SI-NEXT:    v_mov_b32_e32 v0, s34
214; SI-NEXT:    s_waitcnt vmcnt(0)
215; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:16
216; SI-NEXT:    s_waitcnt vmcnt(0)
217; SI-NEXT:    buffer_wbinvl1
218; SI-NEXT:    v_readlane_b32 s7, v1, 1
219; SI-NEXT:    v_readlane_b32 s6, v1, 0
220; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
221; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
222; SI-NEXT:    s_mov_b64 exec, s[34:35]
223; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
224; SI-NEXT:    s_setpc_b64 s[30:31]
225;
226; VI-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
227; VI:       ; %bb.0:
228; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229; VI-NEXT:    s_add_u32 s34, s4, 16
230; VI-NEXT:    s_addc_u32 s35, s5, 0
231; VI-NEXT:    v_mov_b32_e32 v0, s34
232; VI-NEXT:    v_mov_b32_e32 v1, s35
233; VI-NEXT:    v_mov_b32_e32 v2, s6
234; VI-NEXT:    flat_atomic_swap v[0:1], v2
235; VI-NEXT:    s_waitcnt vmcnt(0)
236; VI-NEXT:    buffer_wbinvl1_vol
237; VI-NEXT:    s_setpc_b64 s[30:31]
238;
239; GFX9-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
240; GFX9:       ; %bb.0:
241; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242; GFX9-NEXT:    v_mov_b32_e32 v0, 0
243; GFX9-NEXT:    v_mov_b32_e32 v1, s6
244; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5] offset:16
245; GFX9-NEXT:    s_waitcnt vmcnt(0)
246; GFX9-NEXT:    buffer_wbinvl1_vol
247; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 x i32 elements = 16 bytes past %out.
248  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
249  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
250  ret void
251}
252
; Returning seq_cst i32 xchg in an amdgpu_gfx function with inreg pointer
; and value (read from s[4:5] and s6 in the checks); the atomicrmw result is
; returned. Every expected swap carries glc and leaves its result in v0.
; Expected SI code saves the incoming s6/s7 in lanes of v1 and restores them,
; because it rewrites both registers to fill s[4:7] for buffer_atomic_swap.
253define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
254; SI-LABEL: global_atomic_xchg_i32_ret_scalar:
255; SI:       ; %bb.0:
256; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
257; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
258; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
259; SI-NEXT:    s_mov_b64 exec, s[34:35]
260; SI-NEXT:    s_waitcnt expcnt(0)
261; SI-NEXT:    v_writelane_b32 v1, s6, 0
262; SI-NEXT:    v_writelane_b32 v1, s7, 1
263; SI-NEXT:    s_mov_b32 s34, s6
264; SI-NEXT:    s_mov_b32 s7, 0xf000
265; SI-NEXT:    s_mov_b32 s6, -1
266; SI-NEXT:    v_mov_b32_e32 v0, s34
267; SI-NEXT:    s_waitcnt vmcnt(0)
268; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
269; SI-NEXT:    s_waitcnt vmcnt(0)
270; SI-NEXT:    buffer_wbinvl1
271; SI-NEXT:    v_readlane_b32 s7, v1, 1
272; SI-NEXT:    v_readlane_b32 s6, v1, 0
273; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
274; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
275; SI-NEXT:    s_mov_b64 exec, s[34:35]
276; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
277; SI-NEXT:    s_setpc_b64 s[30:31]
278;
279; VI-LABEL: global_atomic_xchg_i32_ret_scalar:
280; VI:       ; %bb.0:
281; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
282; VI-NEXT:    v_mov_b32_e32 v0, s4
283; VI-NEXT:    v_mov_b32_e32 v1, s5
284; VI-NEXT:    v_mov_b32_e32 v2, s6
285; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
286; VI-NEXT:    s_waitcnt vmcnt(0)
287; VI-NEXT:    buffer_wbinvl1_vol
288; VI-NEXT:    s_setpc_b64 s[30:31]
289;
290; GFX9-LABEL: global_atomic_xchg_i32_ret_scalar:
291; GFX9:       ; %bb.0:
292; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293; GFX9-NEXT:    v_mov_b32_e32 v0, 0
294; GFX9-NEXT:    v_mov_b32_e32 v1, s6
295; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] glc
296; GFX9-NEXT:    s_waitcnt vmcnt(0)
297; GFX9-NEXT:    buffer_wbinvl1_vol
298; GFX9-NEXT:    s_setpc_b64 s[30:31]
299  %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
300  ret i32 %result
301}
302
; Returning seq_cst i32 xchg in an amdgpu_gfx function with inreg pointer
; and value (read from s[4:5] and s6 in the checks), through a pointer
; advanced by 4 i32 elements; the atomicrmw result is returned.
; The checks expect offset 16 plus glc on the SI and GFX9 swap instructions.
; VI instead forms the address with s_add_u32 / s_addc_u32 into s[34:35] and
; copies it to v[0:1].
; Expected SI code also saves the incoming s6/s7 in lanes of v1 and restores
; them, because it rewrites both registers to fill s[4:7] for
; buffer_atomic_swap.
303define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
304; SI-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
305; SI:       ; %bb.0:
306; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
308; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
309; SI-NEXT:    s_mov_b64 exec, s[34:35]
310; SI-NEXT:    s_waitcnt expcnt(0)
311; SI-NEXT:    v_writelane_b32 v1, s6, 0
312; SI-NEXT:    v_writelane_b32 v1, s7, 1
313; SI-NEXT:    s_mov_b32 s34, s6
314; SI-NEXT:    s_mov_b32 s7, 0xf000
315; SI-NEXT:    s_mov_b32 s6, -1
316; SI-NEXT:    v_mov_b32_e32 v0, s34
317; SI-NEXT:    s_waitcnt vmcnt(0)
318; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc
319; SI-NEXT:    s_waitcnt vmcnt(0)
320; SI-NEXT:    buffer_wbinvl1
321; SI-NEXT:    v_readlane_b32 s7, v1, 1
322; SI-NEXT:    v_readlane_b32 s6, v1, 0
323; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
324; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
325; SI-NEXT:    s_mov_b64 exec, s[34:35]
326; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
327; SI-NEXT:    s_setpc_b64 s[30:31]
328;
329; VI-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
330; VI:       ; %bb.0:
331; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332; VI-NEXT:    s_add_u32 s34, s4, 16
333; VI-NEXT:    s_addc_u32 s35, s5, 0
334; VI-NEXT:    v_mov_b32_e32 v0, s34
335; VI-NEXT:    v_mov_b32_e32 v1, s35
336; VI-NEXT:    v_mov_b32_e32 v2, s6
337; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
338; VI-NEXT:    s_waitcnt vmcnt(0)
339; VI-NEXT:    buffer_wbinvl1_vol
340; VI-NEXT:    s_setpc_b64 s[30:31]
341;
342; GFX9-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
343; GFX9:       ; %bb.0:
344; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
345; GFX9-NEXT:    v_mov_b32_e32 v0, 0
346; GFX9-NEXT:    v_mov_b32_e32 v1, s6
347; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc
348; GFX9-NEXT:    s_waitcnt vmcnt(0)
349; GFX9-NEXT:    buffer_wbinvl1_vol
350; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 x i32 elements = 16 bytes past %out.
351  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
352  %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
353  ret i32 %result
354}
355
; Non-returning seq_cst i32 xchg at a 16-byte offset, with the atomicrmw
; tagged !amdgpu.no.remote.memory. The checks still expect a single native
; swap instruction per target (offset 16 on SI and GFX9, explicit add of 16
; on VI).
; The !0 metadata node is defined outside this function.
; This variant indexes the gep with i64 rather than i32.
356define void @global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
357; SI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
358; SI:       ; %bb.0:
359; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
360; SI-NEXT:    s_mov_b32 s6, 0
361; SI-NEXT:    s_mov_b32 s7, 0xf000
362; SI-NEXT:    s_mov_b32 s4, s6
363; SI-NEXT:    s_mov_b32 s5, s6
364; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
365; SI-NEXT:    s_waitcnt vmcnt(0)
366; SI-NEXT:    buffer_wbinvl1
367; SI-NEXT:    s_waitcnt expcnt(0)
368; SI-NEXT:    s_setpc_b64 s[30:31]
369;
370; VI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
371; VI:       ; %bb.0:
372; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
373; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
374; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
375; VI-NEXT:    flat_atomic_swap v[0:1], v2
376; VI-NEXT:    s_waitcnt vmcnt(0)
377; VI-NEXT:    buffer_wbinvl1_vol
378; VI-NEXT:    s_setpc_b64 s[30:31]
379;
380; GFX9-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
381; GFX9:       ; %bb.0:
382; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
384; GFX9-NEXT:    s_waitcnt vmcnt(0)
385; GFX9-NEXT:    buffer_wbinvl1_vol
386; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 x i32 elements = 16 bytes past %out.
387  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
388  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
389  ret void
390}
391
; Returning seq_cst i32 xchg at a 16-byte offset, with the atomicrmw tagged
; !amdgpu.no.remote.memory; the atomicrmw result is returned. The checks
; still expect a single native swap instruction per target, carrying glc
; (offset 16 on SI and GFX9, explicit add of 16 on VI).
; The !0 metadata node is defined outside this function.
; This variant indexes the gep with i64 rather than i32.
392define i32 @global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
393; SI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
394; SI:       ; %bb.0:
395; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
396; SI-NEXT:    s_mov_b32 s6, 0
397; SI-NEXT:    s_mov_b32 s7, 0xf000
398; SI-NEXT:    s_mov_b32 s4, s6
399; SI-NEXT:    s_mov_b32 s5, s6
400; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
401; SI-NEXT:    s_waitcnt vmcnt(0)
402; SI-NEXT:    buffer_wbinvl1
403; SI-NEXT:    v_mov_b32_e32 v0, v2
404; SI-NEXT:    s_waitcnt expcnt(0)
405; SI-NEXT:    s_setpc_b64 s[30:31]
406;
407; VI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
408; VI:       ; %bb.0:
409; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
411; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
412; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
413; VI-NEXT:    s_waitcnt vmcnt(0)
414; VI-NEXT:    buffer_wbinvl1_vol
415; VI-NEXT:    s_setpc_b64 s[30:31]
416;
417; GFX9-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
418; GFX9:       ; %bb.0:
419; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
420; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
421; GFX9-NEXT:    s_waitcnt vmcnt(0)
422; GFX9-NEXT:    buffer_wbinvl1_vol
423; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 x i32 elements = 16 bytes past %out.
424  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
425  %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
426  ret i32 %result
427}
428
429; ---------------------------------------------------------------------
430; atomicrmw xchg f32
431; ---------------------------------------------------------------------
432
; Non-returning seq_cst atomicrmw xchg of a float through a global
; (addrspace(1)) pointer with no offset; %tmp0 is never used. The checks
; expect the same swap opcodes a 32-bit integer exchange would use
; (buffer_atomic_swap, flat_atomic_swap, global_atomic_swap), with the
; pointer in v[0:1] and the value in v2.
433define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) {
434; SI-LABEL: global_atomic_xchg_f32_noret:
435; SI:       ; %bb.0:
436; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
437; SI-NEXT:    s_mov_b32 s6, 0
438; SI-NEXT:    s_mov_b32 s7, 0xf000
439; SI-NEXT:    s_mov_b32 s4, s6
440; SI-NEXT:    s_mov_b32 s5, s6
441; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64
442; SI-NEXT:    s_waitcnt vmcnt(0)
443; SI-NEXT:    buffer_wbinvl1
444; SI-NEXT:    s_waitcnt expcnt(0)
445; SI-NEXT:    s_setpc_b64 s[30:31]
446;
447; VI-LABEL: global_atomic_xchg_f32_noret:
448; VI:       ; %bb.0:
449; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450; VI-NEXT:    flat_atomic_swap v[0:1], v2
451; VI-NEXT:    s_waitcnt vmcnt(0)
452; VI-NEXT:    buffer_wbinvl1_vol
453; VI-NEXT:    s_setpc_b64 s[30:31]
454;
455; GFX9-LABEL: global_atomic_xchg_f32_noret:
456; GFX9:       ; %bb.0:
457; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off
459; GFX9-NEXT:    s_waitcnt vmcnt(0)
460; GFX9-NEXT:    buffer_wbinvl1_vol
461; GFX9-NEXT:    s_setpc_b64 s[30:31]
462  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
463  ret void
464}
465
; Non-returning seq_cst float xchg through a pointer advanced by 4 float
; elements. The checks expect the resulting 16 to appear as offset 16 on the
; swap instruction for SI and GFX9, while VI adds 16 to the 64-bit address
; with v_add_u32 / v_addc_u32 before flat_atomic_swap.
466define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %in) {
467; SI-LABEL: global_atomic_xchg_f32_noret_offset:
468; SI:       ; %bb.0:
469; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470; SI-NEXT:    s_mov_b32 s6, 0
471; SI-NEXT:    s_mov_b32 s7, 0xf000
472; SI-NEXT:    s_mov_b32 s4, s6
473; SI-NEXT:    s_mov_b32 s5, s6
474; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
475; SI-NEXT:    s_waitcnt vmcnt(0)
476; SI-NEXT:    buffer_wbinvl1
477; SI-NEXT:    s_waitcnt expcnt(0)
478; SI-NEXT:    s_setpc_b64 s[30:31]
479;
480; VI-LABEL: global_atomic_xchg_f32_noret_offset:
481; VI:       ; %bb.0:
482; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
483; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
484; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
485; VI-NEXT:    flat_atomic_swap v[0:1], v2
486; VI-NEXT:    s_waitcnt vmcnt(0)
487; VI-NEXT:    buffer_wbinvl1_vol
488; VI-NEXT:    s_setpc_b64 s[30:31]
489;
490; GFX9-LABEL: global_atomic_xchg_f32_noret_offset:
491; GFX9:       ; %bb.0:
492; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
493; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
494; GFX9-NEXT:    s_waitcnt vmcnt(0)
495; GFX9-NEXT:    buffer_wbinvl1_vol
496; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 x float elements = 16 bytes past %out.
497  %gep = getelementptr float, ptr addrspace(1) %out, i32 4
498  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
499  ret void
500}
501
; Returning seq_cst float xchg with no offset; the atomicrmw result is the
; function's return value. The expected swap instructions carry the glc
; modifier. On SI the result lands in v2 and is copied to v0, whereas VI and
; GFX9 write v0 directly.
502define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) {
503; SI-LABEL: global_atomic_xchg_f32_ret:
504; SI:       ; %bb.0:
505; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
506; SI-NEXT:    s_mov_b32 s6, 0
507; SI-NEXT:    s_mov_b32 s7, 0xf000
508; SI-NEXT:    s_mov_b32 s4, s6
509; SI-NEXT:    s_mov_b32 s5, s6
510; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
511; SI-NEXT:    s_waitcnt vmcnt(0)
512; SI-NEXT:    buffer_wbinvl1
513; SI-NEXT:    v_mov_b32_e32 v0, v2
514; SI-NEXT:    s_waitcnt expcnt(0)
515; SI-NEXT:    s_setpc_b64 s[30:31]
516;
517; VI-LABEL: global_atomic_xchg_f32_ret:
518; VI:       ; %bb.0:
519; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
521; VI-NEXT:    s_waitcnt vmcnt(0)
522; VI-NEXT:    buffer_wbinvl1_vol
523; VI-NEXT:    s_setpc_b64 s[30:31]
524;
525; GFX9-LABEL: global_atomic_xchg_f32_ret:
526; GFX9:       ; %bb.0:
527; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
528; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off glc
529; GFX9-NEXT:    s_waitcnt vmcnt(0)
530; GFX9-NEXT:    buffer_wbinvl1_vol
531; GFX9-NEXT:    s_setpc_b64 s[30:31]
532  %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
533  ret float %result
534}
535
; Returning seq_cst float xchg through a pointer advanced by 4 float
; elements; the atomicrmw result is returned. The checks expect offset 16
; together with glc on the SI and GFX9 swap instructions, and an explicit
; 64-bit add of 16 ahead of flat_atomic_swap on VI.
536define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in) {
537; SI-LABEL: global_atomic_xchg_f32_ret_offset:
538; SI:       ; %bb.0:
539; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
540; SI-NEXT:    s_mov_b32 s6, 0
541; SI-NEXT:    s_mov_b32 s7, 0xf000
542; SI-NEXT:    s_mov_b32 s4, s6
543; SI-NEXT:    s_mov_b32 s5, s6
544; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
545; SI-NEXT:    s_waitcnt vmcnt(0)
546; SI-NEXT:    buffer_wbinvl1
547; SI-NEXT:    v_mov_b32_e32 v0, v2
548; SI-NEXT:    s_waitcnt expcnt(0)
549; SI-NEXT:    s_setpc_b64 s[30:31]
550;
551; VI-LABEL: global_atomic_xchg_f32_ret_offset:
552; VI:       ; %bb.0:
553; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
554; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
555; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
556; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
557; VI-NEXT:    s_waitcnt vmcnt(0)
558; VI-NEXT:    buffer_wbinvl1_vol
559; VI-NEXT:    s_setpc_b64 s[30:31]
560;
561; GFX9-LABEL: global_atomic_xchg_f32_ret_offset:
562; GFX9:       ; %bb.0:
563; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
564; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
565; GFX9-NEXT:    s_waitcnt vmcnt(0)
566; GFX9-NEXT:    buffer_wbinvl1_vol
567; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 x float elements = 16 bytes past %out.
568  %gep = getelementptr float, ptr addrspace(1) %out, i32 4
569  %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
570  ret float %result
571}
572
; Non-returning seq_cst float xchg in an amdgpu_gfx function whose pointer
; and value are both inreg; the checks read them from s[4:5] and s6.
; Expected SI code overwrites s6 and s7 to fill s[4:7], the register quad
; handed to buffer_atomic_swap. It therefore first parks the incoming s6/s7
; in lanes 0 and 1 of v1 (v1 itself is stored to the stack inside an
; s_xor_saveexec_b64 / s_mov_b64 exec bracket) and restores them before
; returning.
; Expected VI code copies s4, s5 and s6 into v0, v1 and v2 for
; flat_atomic_swap. Expected GFX9 code keeps s[4:5] as the base of
; global_atomic_swap, with a zero in v0 and the value in v1.
573define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
574; SI-LABEL: global_atomic_xchg_f32_noret_scalar:
575; SI:       ; %bb.0:
576; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
577; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
578; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
579; SI-NEXT:    s_mov_b64 exec, s[34:35]
580; SI-NEXT:    s_waitcnt expcnt(0)
581; SI-NEXT:    v_writelane_b32 v1, s6, 0
582; SI-NEXT:    v_writelane_b32 v1, s7, 1
583; SI-NEXT:    s_mov_b32 s34, s6
584; SI-NEXT:    s_mov_b32 s7, 0xf000
585; SI-NEXT:    s_mov_b32 s6, -1
586; SI-NEXT:    v_mov_b32_e32 v0, s34
587; SI-NEXT:    s_waitcnt vmcnt(0)
588; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
589; SI-NEXT:    s_waitcnt vmcnt(0)
590; SI-NEXT:    buffer_wbinvl1
591; SI-NEXT:    v_readlane_b32 s7, v1, 1
592; SI-NEXT:    v_readlane_b32 s6, v1, 0
593; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
594; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
595; SI-NEXT:    s_mov_b64 exec, s[34:35]
596; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
597; SI-NEXT:    s_setpc_b64 s[30:31]
598;
599; VI-LABEL: global_atomic_xchg_f32_noret_scalar:
600; VI:       ; %bb.0:
601; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
602; VI-NEXT:    v_mov_b32_e32 v0, s4
603; VI-NEXT:    v_mov_b32_e32 v1, s5
604; VI-NEXT:    v_mov_b32_e32 v2, s6
605; VI-NEXT:    flat_atomic_swap v[0:1], v2
606; VI-NEXT:    s_waitcnt vmcnt(0)
607; VI-NEXT:    buffer_wbinvl1_vol
608; VI-NEXT:    s_setpc_b64 s[30:31]
609;
610; GFX9-LABEL: global_atomic_xchg_f32_noret_scalar:
611; GFX9:       ; %bb.0:
612; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
613; GFX9-NEXT:    v_mov_b32_e32 v0, 0
614; GFX9-NEXT:    v_mov_b32_e32 v1, s6
615; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5]
616; GFX9-NEXT:    s_waitcnt vmcnt(0)
617; GFX9-NEXT:    buffer_wbinvl1_vol
618; GFX9-NEXT:    s_setpc_b64 s[30:31]
619  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
620  ret void
621}
622
; Non-returning seq_cst float xchg in an amdgpu_gfx function with inreg
; pointer and value (read from s[4:5] and s6 in the checks), through a
; pointer advanced by 4 float elements.
; The checks expect offset 16 on the SI and GFX9 swap instructions. VI
; instead forms the address with s_add_u32 / s_addc_u32 into s[34:35] and
; copies it to v[0:1].
; Expected SI code also saves the incoming s6/s7 in lanes of v1 and restores
; them, because it rewrites both registers to fill s[4:7] for
; buffer_atomic_swap.
623define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
624; SI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
625; SI:       ; %bb.0:
626; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
627; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
628; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
629; SI-NEXT:    s_mov_b64 exec, s[34:35]
630; SI-NEXT:    s_waitcnt expcnt(0)
631; SI-NEXT:    v_writelane_b32 v1, s6, 0
632; SI-NEXT:    v_writelane_b32 v1, s7, 1
633; SI-NEXT:    s_mov_b32 s34, s6
634; SI-NEXT:    s_mov_b32 s7, 0xf000
635; SI-NEXT:    s_mov_b32 s6, -1
636; SI-NEXT:    v_mov_b32_e32 v0, s34
637; SI-NEXT:    s_waitcnt vmcnt(0)
638; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:16
639; SI-NEXT:    s_waitcnt vmcnt(0)
640; SI-NEXT:    buffer_wbinvl1
641; SI-NEXT:    v_readlane_b32 s7, v1, 1
642; SI-NEXT:    v_readlane_b32 s6, v1, 0
643; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
644; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
645; SI-NEXT:    s_mov_b64 exec, s[34:35]
646; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
647; SI-NEXT:    s_setpc_b64 s[30:31]
648;
649; VI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
650; VI:       ; %bb.0:
651; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652; VI-NEXT:    s_add_u32 s34, s4, 16
653; VI-NEXT:    s_addc_u32 s35, s5, 0
654; VI-NEXT:    v_mov_b32_e32 v0, s34
655; VI-NEXT:    v_mov_b32_e32 v1, s35
656; VI-NEXT:    v_mov_b32_e32 v2, s6
657; VI-NEXT:    flat_atomic_swap v[0:1], v2
658; VI-NEXT:    s_waitcnt vmcnt(0)
659; VI-NEXT:    buffer_wbinvl1_vol
660; VI-NEXT:    s_setpc_b64 s[30:31]
661;
662; GFX9-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
663; GFX9:       ; %bb.0:
664; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
665; GFX9-NEXT:    v_mov_b32_e32 v0, 0
666; GFX9-NEXT:    v_mov_b32_e32 v1, s6
667; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5] offset:16
668; GFX9-NEXT:    s_waitcnt vmcnt(0)
669; GFX9-NEXT:    buffer_wbinvl1_vol
670; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 x float elements = 16 bytes past %out.
671  %gep = getelementptr float, ptr addrspace(1) %out, i32 4
672  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
673  ret void
674}
675
; Returning seq_cst float xchg in an amdgpu_gfx function with inreg pointer
; and value (read from s[4:5] and s6 in the checks); the atomicrmw result is
; returned. Every expected swap carries glc and leaves its result in v0.
; Expected SI code saves the incoming s6/s7 in lanes of v1 and restores them,
; because it rewrites both registers to fill s[4:7] for buffer_atomic_swap.
676define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
677; SI-LABEL: global_atomic_xchg_f32_ret_scalar:
678; SI:       ; %bb.0:
679; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
680; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
681; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
682; SI-NEXT:    s_mov_b64 exec, s[34:35]
683; SI-NEXT:    s_waitcnt expcnt(0)
684; SI-NEXT:    v_writelane_b32 v1, s6, 0
685; SI-NEXT:    v_writelane_b32 v1, s7, 1
686; SI-NEXT:    s_mov_b32 s34, s6
687; SI-NEXT:    s_mov_b32 s7, 0xf000
688; SI-NEXT:    s_mov_b32 s6, -1
689; SI-NEXT:    v_mov_b32_e32 v0, s34
690; SI-NEXT:    s_waitcnt vmcnt(0)
691; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
692; SI-NEXT:    s_waitcnt vmcnt(0)
693; SI-NEXT:    buffer_wbinvl1
694; SI-NEXT:    v_readlane_b32 s7, v1, 1
695; SI-NEXT:    v_readlane_b32 s6, v1, 0
696; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
697; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
698; SI-NEXT:    s_mov_b64 exec, s[34:35]
699; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
700; SI-NEXT:    s_setpc_b64 s[30:31]
701;
702; VI-LABEL: global_atomic_xchg_f32_ret_scalar:
703; VI:       ; %bb.0:
704; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
705; VI-NEXT:    v_mov_b32_e32 v0, s4
706; VI-NEXT:    v_mov_b32_e32 v1, s5
707; VI-NEXT:    v_mov_b32_e32 v2, s6
708; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
709; VI-NEXT:    s_waitcnt vmcnt(0)
710; VI-NEXT:    buffer_wbinvl1_vol
711; VI-NEXT:    s_setpc_b64 s[30:31]
712;
713; GFX9-LABEL: global_atomic_xchg_f32_ret_scalar:
714; GFX9:       ; %bb.0:
715; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
716; GFX9-NEXT:    v_mov_b32_e32 v0, 0
717; GFX9-NEXT:    v_mov_b32_e32 v1, s6
718; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] glc
719; GFX9-NEXT:    s_waitcnt vmcnt(0)
720; GFX9-NEXT:    buffer_wbinvl1_vol
721; GFX9-NEXT:    s_setpc_b64 s[30:31]
722  %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
723  ret float %result
724}
725
; Returning seq_cst float xchg in an amdgpu_gfx function with inreg pointer
; and value (read from s[4:5] and s6 in the checks), through a pointer
; advanced by 4 float elements; the atomicrmw result is returned.
; The checks expect offset 16 plus glc on the SI and GFX9 swap instructions.
; VI instead forms the address with s_add_u32 / s_addc_u32 into s[34:35] and
; copies it to v[0:1].
; Expected SI code also saves the incoming s6/s7 in lanes of v1 and restores
; them, because it rewrites both registers to fill s[4:7] for
; buffer_atomic_swap.
726define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
727; SI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
728; SI:       ; %bb.0:
729; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
731; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
732; SI-NEXT:    s_mov_b64 exec, s[34:35]
733; SI-NEXT:    s_waitcnt expcnt(0)
734; SI-NEXT:    v_writelane_b32 v1, s6, 0
735; SI-NEXT:    v_writelane_b32 v1, s7, 1
736; SI-NEXT:    s_mov_b32 s34, s6
737; SI-NEXT:    s_mov_b32 s7, 0xf000
738; SI-NEXT:    s_mov_b32 s6, -1
739; SI-NEXT:    v_mov_b32_e32 v0, s34
740; SI-NEXT:    s_waitcnt vmcnt(0)
741; SI-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc
742; SI-NEXT:    s_waitcnt vmcnt(0)
743; SI-NEXT:    buffer_wbinvl1
744; SI-NEXT:    v_readlane_b32 s7, v1, 1
745; SI-NEXT:    v_readlane_b32 s6, v1, 0
746; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
747; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
748; SI-NEXT:    s_mov_b64 exec, s[34:35]
749; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
750; SI-NEXT:    s_setpc_b64 s[30:31]
751;
752; VI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
753; VI:       ; %bb.0:
754; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
755; VI-NEXT:    s_add_u32 s34, s4, 16
756; VI-NEXT:    s_addc_u32 s35, s5, 0
757; VI-NEXT:    v_mov_b32_e32 v0, s34
758; VI-NEXT:    v_mov_b32_e32 v1, s35
759; VI-NEXT:    v_mov_b32_e32 v2, s6
760; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
761; VI-NEXT:    s_waitcnt vmcnt(0)
762; VI-NEXT:    buffer_wbinvl1_vol
763; VI-NEXT:    s_setpc_b64 s[30:31]
764;
765; GFX9-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
766; GFX9:       ; %bb.0:
767; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
768; GFX9-NEXT:    v_mov_b32_e32 v0, 0
769; GFX9-NEXT:    v_mov_b32_e32 v1, s6
770; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc
771; GFX9-NEXT:    s_waitcnt vmcnt(0)
772; GFX9-NEXT:    buffer_wbinvl1_vol
773; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 x float elements = 16 bytes past %out.
774  %gep = getelementptr float, ptr addrspace(1) %out, i32 4
775  %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
776  ret float %result
777}
778
; atomicrmw xchg (seq_cst) of a float at byte offset 16 from %out, with the
; atomicrmw tagged by !amdgpu.no.remote.memory metadata; the result is unused.
; NOTE(review): the GEP element type is i32 while the exchanged value is float;
; both are 32 bits wide so the byte offset is the same - confirm this is intended.
779define void @global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, float %in) {
780; SI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
781; SI:       ; %bb.0:
782; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
783; SI-NEXT:    s_mov_b32 s6, 0
784; SI-NEXT:    s_mov_b32 s7, 0xf000
785; SI-NEXT:    s_mov_b32 s4, s6
786; SI-NEXT:    s_mov_b32 s5, s6
787; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
788; SI-NEXT:    s_waitcnt vmcnt(0)
789; SI-NEXT:    buffer_wbinvl1
790; SI-NEXT:    s_waitcnt expcnt(0)
791; SI-NEXT:    s_setpc_b64 s[30:31]
792;
793; VI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
794; VI:       ; %bb.0:
795; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
796; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
797; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
798; VI-NEXT:    flat_atomic_swap v[0:1], v2
799; VI-NEXT:    s_waitcnt vmcnt(0)
800; VI-NEXT:    buffer_wbinvl1_vol
801; VI-NEXT:    s_setpc_b64 s[30:31]
802;
803; GFX9-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
804; GFX9:       ; %bb.0:
805; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
806; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
807; GFX9-NEXT:    s_waitcnt vmcnt(0)
808; GFX9-NEXT:    buffer_wbinvl1_vol
809; GFX9-NEXT:    s_setpc_b64 s[30:31]
810  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
811  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
812  ret void
813}
814
; atomicrmw xchg (seq_cst) of a float at byte offset 16 from %out, with the
; atomicrmw tagged by !amdgpu.no.remote.memory metadata; the value produced by
; the atomicrmw is returned.
; NOTE(review): the GEP element type is i32 while the exchanged value is float;
; both are 32 bits wide so the byte offset is the same - confirm this is intended.
815define float @global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, float %in) {
816; SI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
817; SI:       ; %bb.0:
818; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
819; SI-NEXT:    s_mov_b32 s6, 0
820; SI-NEXT:    s_mov_b32 s7, 0xf000
821; SI-NEXT:    s_mov_b32 s4, s6
822; SI-NEXT:    s_mov_b32 s5, s6
823; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
824; SI-NEXT:    s_waitcnt vmcnt(0)
825; SI-NEXT:    buffer_wbinvl1
826; SI-NEXT:    v_mov_b32_e32 v0, v2
827; SI-NEXT:    s_waitcnt expcnt(0)
828; SI-NEXT:    s_setpc_b64 s[30:31]
829;
830; VI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
831; VI:       ; %bb.0:
832; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
833; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
834; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
835; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
836; VI-NEXT:    s_waitcnt vmcnt(0)
837; VI-NEXT:    buffer_wbinvl1_vol
838; VI-NEXT:    s_setpc_b64 s[30:31]
839;
840; GFX9-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
841; GFX9:       ; %bb.0:
842; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
843; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
844; GFX9-NEXT:    s_waitcnt vmcnt(0)
845; GFX9-NEXT:    buffer_wbinvl1_vol
846; GFX9-NEXT:    s_setpc_b64 s[30:31]
847  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
848  %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
849  ret float %result
850}
851
852; ---------------------------------------------------------------------
853; atomicrmw add
854; ---------------------------------------------------------------------
855
; atomicrmw add (seq_cst) of an i32 directly at %ptr (no offset); the result of
; the atomicrmw is unused and the function returns void.
856define void @global_atomic_add_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
857; SI-LABEL: global_atomic_add_i32_noret:
858; SI:       ; %bb.0:
859; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860; SI-NEXT:    s_mov_b32 s6, 0
861; SI-NEXT:    s_mov_b32 s7, 0xf000
862; SI-NEXT:    s_mov_b32 s4, s6
863; SI-NEXT:    s_mov_b32 s5, s6
864; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64
865; SI-NEXT:    s_waitcnt vmcnt(0)
866; SI-NEXT:    buffer_wbinvl1
867; SI-NEXT:    s_waitcnt expcnt(0)
868; SI-NEXT:    s_setpc_b64 s[30:31]
869;
870; VI-LABEL: global_atomic_add_i32_noret:
871; VI:       ; %bb.0:
872; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
873; VI-NEXT:    flat_atomic_add v[0:1], v2
874; VI-NEXT:    s_waitcnt vmcnt(0)
875; VI-NEXT:    buffer_wbinvl1_vol
876; VI-NEXT:    s_setpc_b64 s[30:31]
877;
878; GFX9-LABEL: global_atomic_add_i32_noret:
879; GFX9:       ; %bb.0:
880; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
881; GFX9-NEXT:    global_atomic_add v[0:1], v2, off
882; GFX9-NEXT:    s_waitcnt vmcnt(0)
883; GFX9-NEXT:    buffer_wbinvl1_vol
884; GFX9-NEXT:    s_setpc_b64 s[30:31]
885  %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
886  ret void
887}
888
; atomicrmw add (seq_cst) of an i32 located 4 i32 elements (16 bytes) past
; %out; the result of the atomicrmw is unused and the function returns void.
889define void @global_atomic_add_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
890; SI-LABEL: global_atomic_add_i32_noret_offset:
891; SI:       ; %bb.0:
892; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
893; SI-NEXT:    s_mov_b32 s6, 0
894; SI-NEXT:    s_mov_b32 s7, 0xf000
895; SI-NEXT:    s_mov_b32 s4, s6
896; SI-NEXT:    s_mov_b32 s5, s6
897; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16
898; SI-NEXT:    s_waitcnt vmcnt(0)
899; SI-NEXT:    buffer_wbinvl1
900; SI-NEXT:    s_waitcnt expcnt(0)
901; SI-NEXT:    s_setpc_b64 s[30:31]
902;
903; VI-LABEL: global_atomic_add_i32_noret_offset:
904; VI:       ; %bb.0:
905; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
906; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
907; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
908; VI-NEXT:    flat_atomic_add v[0:1], v2
909; VI-NEXT:    s_waitcnt vmcnt(0)
910; VI-NEXT:    buffer_wbinvl1_vol
911; VI-NEXT:    s_setpc_b64 s[30:31]
912;
913; GFX9-LABEL: global_atomic_add_i32_noret_offset:
914; GFX9:       ; %bb.0:
915; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
916; GFX9-NEXT:    global_atomic_add v[0:1], v2, off offset:16
917; GFX9-NEXT:    s_waitcnt vmcnt(0)
918; GFX9-NEXT:    buffer_wbinvl1_vol
919; GFX9-NEXT:    s_setpc_b64 s[30:31]
920  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
921  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
922  ret void
923}
924
; atomicrmw add (seq_cst) of an i32 directly at %ptr (no offset); the value
; produced by the atomicrmw is returned.
925define i32 @global_atomic_add_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
926; SI-LABEL: global_atomic_add_i32_ret:
927; SI:       ; %bb.0:
928; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
929; SI-NEXT:    s_mov_b32 s6, 0
930; SI-NEXT:    s_mov_b32 s7, 0xf000
931; SI-NEXT:    s_mov_b32 s4, s6
932; SI-NEXT:    s_mov_b32 s5, s6
933; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
934; SI-NEXT:    s_waitcnt vmcnt(0)
935; SI-NEXT:    buffer_wbinvl1
936; SI-NEXT:    v_mov_b32_e32 v0, v2
937; SI-NEXT:    s_waitcnt expcnt(0)
938; SI-NEXT:    s_setpc_b64 s[30:31]
939;
940; VI-LABEL: global_atomic_add_i32_ret:
941; VI:       ; %bb.0:
942; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
944; VI-NEXT:    s_waitcnt vmcnt(0)
945; VI-NEXT:    buffer_wbinvl1_vol
946; VI-NEXT:    s_setpc_b64 s[30:31]
947;
948; GFX9-LABEL: global_atomic_add_i32_ret:
949; GFX9:       ; %bb.0:
950; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
951; GFX9-NEXT:    global_atomic_add v0, v[0:1], v2, off glc
952; GFX9-NEXT:    s_waitcnt vmcnt(0)
953; GFX9-NEXT:    buffer_wbinvl1_vol
954; GFX9-NEXT:    s_setpc_b64 s[30:31]
955  %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
956  ret i32 %result
957}
958
; atomicrmw add (seq_cst) of an i32 located 4 i32 elements (16 bytes) past
; %out; the value produced by the atomicrmw is returned.
959define i32 @global_atomic_add_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
960; SI-LABEL: global_atomic_add_i32_ret_offset:
961; SI:       ; %bb.0:
962; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
963; SI-NEXT:    s_mov_b32 s6, 0
964; SI-NEXT:    s_mov_b32 s7, 0xf000
965; SI-NEXT:    s_mov_b32 s4, s6
966; SI-NEXT:    s_mov_b32 s5, s6
967; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
968; SI-NEXT:    s_waitcnt vmcnt(0)
969; SI-NEXT:    buffer_wbinvl1
970; SI-NEXT:    v_mov_b32_e32 v0, v2
971; SI-NEXT:    s_waitcnt expcnt(0)
972; SI-NEXT:    s_setpc_b64 s[30:31]
973;
974; VI-LABEL: global_atomic_add_i32_ret_offset:
975; VI:       ; %bb.0:
976; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
977; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
978; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
979; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
980; VI-NEXT:    s_waitcnt vmcnt(0)
981; VI-NEXT:    buffer_wbinvl1_vol
982; VI-NEXT:    s_setpc_b64 s[30:31]
983;
984; GFX9-LABEL: global_atomic_add_i32_ret_offset:
985; GFX9:       ; %bb.0:
986; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
987; GFX9-NEXT:    global_atomic_add v0, v[0:1], v2, off offset:16 glc
988; GFX9-NEXT:    s_waitcnt vmcnt(0)
989; GFX9-NEXT:    buffer_wbinvl1_vol
990; GFX9-NEXT:    s_setpc_b64 s[30:31]
991  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
992  %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
993  ret i32 %result
994}
995
; atomicrmw add (seq_cst) of an i32 directly at %ptr (no offset). Uses the
; amdgpu_gfx calling convention with both the pointer and the operand marked
; inreg; the result of the atomicrmw is unused and the function returns void.
996define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
997; SI-LABEL: global_atomic_add_i32_noret_scalar:
998; SI:       ; %bb.0:
999; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1000; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1001; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1002; SI-NEXT:    s_mov_b64 exec, s[34:35]
1003; SI-NEXT:    s_waitcnt expcnt(0)
1004; SI-NEXT:    v_writelane_b32 v1, s6, 0
1005; SI-NEXT:    v_writelane_b32 v1, s7, 1
1006; SI-NEXT:    s_mov_b32 s34, s6
1007; SI-NEXT:    s_mov_b32 s7, 0xf000
1008; SI-NEXT:    s_mov_b32 s6, -1
1009; SI-NEXT:    v_mov_b32_e32 v0, s34
1010; SI-NEXT:    s_waitcnt vmcnt(0)
1011; SI-NEXT:    buffer_atomic_add v0, off, s[4:7], 0
1012; SI-NEXT:    s_waitcnt vmcnt(0)
1013; SI-NEXT:    buffer_wbinvl1
1014; SI-NEXT:    v_readlane_b32 s7, v1, 1
1015; SI-NEXT:    v_readlane_b32 s6, v1, 0
1016; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1017; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1018; SI-NEXT:    s_mov_b64 exec, s[34:35]
1019; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1020; SI-NEXT:    s_setpc_b64 s[30:31]
1021;
1022; VI-LABEL: global_atomic_add_i32_noret_scalar:
1023; VI:       ; %bb.0:
1024; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1025; VI-NEXT:    v_mov_b32_e32 v0, s4
1026; VI-NEXT:    v_mov_b32_e32 v1, s5
1027; VI-NEXT:    v_mov_b32_e32 v2, s6
1028; VI-NEXT:    flat_atomic_add v[0:1], v2
1029; VI-NEXT:    s_waitcnt vmcnt(0)
1030; VI-NEXT:    buffer_wbinvl1_vol
1031; VI-NEXT:    s_setpc_b64 s[30:31]
1032;
1033; GFX9-LABEL: global_atomic_add_i32_noret_scalar:
1034; GFX9:       ; %bb.0:
1035; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1036; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1037; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1038; GFX9-NEXT:    global_atomic_add v0, v1, s[4:5]
1039; GFX9-NEXT:    s_waitcnt vmcnt(0)
1040; GFX9-NEXT:    buffer_wbinvl1_vol
1041; GFX9-NEXT:    s_setpc_b64 s[30:31]
1042  %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
1043  ret void
1044}
1045
; atomicrmw add (seq_cst) of an i32 located 4 i32 elements (16 bytes) past
; %out. Uses the amdgpu_gfx calling convention with both the pointer and the
; operand marked inreg; the result is unused and the function returns void.
1046define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1047; SI-LABEL: global_atomic_add_i32_noret_offset_scalar:
1048; SI:       ; %bb.0:
1049; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1050; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1051; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1052; SI-NEXT:    s_mov_b64 exec, s[34:35]
1053; SI-NEXT:    s_waitcnt expcnt(0)
1054; SI-NEXT:    v_writelane_b32 v1, s6, 0
1055; SI-NEXT:    v_writelane_b32 v1, s7, 1
1056; SI-NEXT:    s_mov_b32 s34, s6
1057; SI-NEXT:    s_mov_b32 s7, 0xf000
1058; SI-NEXT:    s_mov_b32 s6, -1
1059; SI-NEXT:    v_mov_b32_e32 v0, s34
1060; SI-NEXT:    s_waitcnt vmcnt(0)
1061; SI-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 offset:16
1062; SI-NEXT:    s_waitcnt vmcnt(0)
1063; SI-NEXT:    buffer_wbinvl1
1064; SI-NEXT:    v_readlane_b32 s7, v1, 1
1065; SI-NEXT:    v_readlane_b32 s6, v1, 0
1066; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1067; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1068; SI-NEXT:    s_mov_b64 exec, s[34:35]
1069; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1070; SI-NEXT:    s_setpc_b64 s[30:31]
1071;
1072; VI-LABEL: global_atomic_add_i32_noret_offset_scalar:
1073; VI:       ; %bb.0:
1074; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1075; VI-NEXT:    s_add_u32 s34, s4, 16
1076; VI-NEXT:    s_addc_u32 s35, s5, 0
1077; VI-NEXT:    v_mov_b32_e32 v0, s34
1078; VI-NEXT:    v_mov_b32_e32 v1, s35
1079; VI-NEXT:    v_mov_b32_e32 v2, s6
1080; VI-NEXT:    flat_atomic_add v[0:1], v2
1081; VI-NEXT:    s_waitcnt vmcnt(0)
1082; VI-NEXT:    buffer_wbinvl1_vol
1083; VI-NEXT:    s_setpc_b64 s[30:31]
1084;
1085; GFX9-LABEL: global_atomic_add_i32_noret_offset_scalar:
1086; GFX9:       ; %bb.0:
1087; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1088; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1089; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1090; GFX9-NEXT:    global_atomic_add v0, v1, s[4:5] offset:16
1091; GFX9-NEXT:    s_waitcnt vmcnt(0)
1092; GFX9-NEXT:    buffer_wbinvl1_vol
1093; GFX9-NEXT:    s_setpc_b64 s[30:31]
1094  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1095  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
1096  ret void
1097}
1098
; atomicrmw add (seq_cst) of an i32 directly at %ptr (no offset). Uses the
; amdgpu_gfx calling convention with both the pointer and the operand marked
; inreg; the value produced by the atomicrmw is returned.
1099define amdgpu_gfx i32 @global_atomic_add_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1100; SI-LABEL: global_atomic_add_i32_ret_scalar:
1101; SI:       ; %bb.0:
1102; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1103; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1104; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1105; SI-NEXT:    s_mov_b64 exec, s[34:35]
1106; SI-NEXT:    s_waitcnt expcnt(0)
1107; SI-NEXT:    v_writelane_b32 v1, s6, 0
1108; SI-NEXT:    v_writelane_b32 v1, s7, 1
1109; SI-NEXT:    s_mov_b32 s34, s6
1110; SI-NEXT:    s_mov_b32 s7, 0xf000
1111; SI-NEXT:    s_mov_b32 s6, -1
1112; SI-NEXT:    v_mov_b32_e32 v0, s34
1113; SI-NEXT:    s_waitcnt vmcnt(0)
1114; SI-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
1115; SI-NEXT:    s_waitcnt vmcnt(0)
1116; SI-NEXT:    buffer_wbinvl1
1117; SI-NEXT:    v_readlane_b32 s7, v1, 1
1118; SI-NEXT:    v_readlane_b32 s6, v1, 0
1119; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1120; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1121; SI-NEXT:    s_mov_b64 exec, s[34:35]
1122; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1123; SI-NEXT:    s_setpc_b64 s[30:31]
1124;
1125; VI-LABEL: global_atomic_add_i32_ret_scalar:
1126; VI:       ; %bb.0:
1127; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1128; VI-NEXT:    v_mov_b32_e32 v0, s4
1129; VI-NEXT:    v_mov_b32_e32 v1, s5
1130; VI-NEXT:    v_mov_b32_e32 v2, s6
1131; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
1132; VI-NEXT:    s_waitcnt vmcnt(0)
1133; VI-NEXT:    buffer_wbinvl1_vol
1134; VI-NEXT:    s_setpc_b64 s[30:31]
1135;
1136; GFX9-LABEL: global_atomic_add_i32_ret_scalar:
1137; GFX9:       ; %bb.0:
1138; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1139; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1140; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1141; GFX9-NEXT:    global_atomic_add v0, v0, v1, s[4:5] glc
1142; GFX9-NEXT:    s_waitcnt vmcnt(0)
1143; GFX9-NEXT:    buffer_wbinvl1_vol
1144; GFX9-NEXT:    s_setpc_b64 s[30:31]
1145  %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
1146  ret i32 %result
1147}
1148
; atomicrmw add (seq_cst) of an i32 located 4 i32 elements (16 bytes) past
; %out. Uses the amdgpu_gfx calling convention with both the pointer and the
; operand marked inreg; the value produced by the atomicrmw is returned.
1149define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1150; SI-LABEL: global_atomic_add_i32_ret_offset_scalar:
1151; SI:       ; %bb.0:
1152; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1153; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1154; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1155; SI-NEXT:    s_mov_b64 exec, s[34:35]
1156; SI-NEXT:    s_waitcnt expcnt(0)
1157; SI-NEXT:    v_writelane_b32 v1, s6, 0
1158; SI-NEXT:    v_writelane_b32 v1, s7, 1
1159; SI-NEXT:    s_mov_b32 s34, s6
1160; SI-NEXT:    s_mov_b32 s7, 0xf000
1161; SI-NEXT:    s_mov_b32 s6, -1
1162; SI-NEXT:    v_mov_b32_e32 v0, s34
1163; SI-NEXT:    s_waitcnt vmcnt(0)
1164; SI-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc
1165; SI-NEXT:    s_waitcnt vmcnt(0)
1166; SI-NEXT:    buffer_wbinvl1
1167; SI-NEXT:    v_readlane_b32 s7, v1, 1
1168; SI-NEXT:    v_readlane_b32 s6, v1, 0
1169; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1170; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1171; SI-NEXT:    s_mov_b64 exec, s[34:35]
1172; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1173; SI-NEXT:    s_setpc_b64 s[30:31]
1174;
1175; VI-LABEL: global_atomic_add_i32_ret_offset_scalar:
1176; VI:       ; %bb.0:
1177; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1178; VI-NEXT:    s_add_u32 s34, s4, 16
1179; VI-NEXT:    s_addc_u32 s35, s5, 0
1180; VI-NEXT:    v_mov_b32_e32 v0, s34
1181; VI-NEXT:    v_mov_b32_e32 v1, s35
1182; VI-NEXT:    v_mov_b32_e32 v2, s6
1183; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
1184; VI-NEXT:    s_waitcnt vmcnt(0)
1185; VI-NEXT:    buffer_wbinvl1_vol
1186; VI-NEXT:    s_setpc_b64 s[30:31]
1187;
1188; GFX9-LABEL: global_atomic_add_i32_ret_offset_scalar:
1189; GFX9:       ; %bb.0:
1190; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1191; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1192; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1193; GFX9-NEXT:    global_atomic_add v0, v0, v1, s[4:5] offset:16 glc
1194; GFX9-NEXT:    s_waitcnt vmcnt(0)
1195; GFX9-NEXT:    buffer_wbinvl1_vol
1196; GFX9-NEXT:    s_setpc_b64 s[30:31]
1197  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1198  %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
1199  ret i32 %result
1200}
1201
; atomicrmw add (seq_cst) of an i32 located 4 i32 elements (16 bytes) past
; %out, with the atomicrmw tagged by !amdgpu.no.remote.memory metadata; the
; result is unused and the function returns void.
1202define void @global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
1203; SI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
1204; SI:       ; %bb.0:
1205; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1206; SI-NEXT:    s_mov_b32 s6, 0
1207; SI-NEXT:    s_mov_b32 s7, 0xf000
1208; SI-NEXT:    s_mov_b32 s4, s6
1209; SI-NEXT:    s_mov_b32 s5, s6
1210; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16
1211; SI-NEXT:    s_waitcnt vmcnt(0)
1212; SI-NEXT:    buffer_wbinvl1
1213; SI-NEXT:    s_waitcnt expcnt(0)
1214; SI-NEXT:    s_setpc_b64 s[30:31]
1215;
1216; VI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
1217; VI:       ; %bb.0:
1218; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1219; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1220; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1221; VI-NEXT:    flat_atomic_add v[0:1], v2
1222; VI-NEXT:    s_waitcnt vmcnt(0)
1223; VI-NEXT:    buffer_wbinvl1_vol
1224; VI-NEXT:    s_setpc_b64 s[30:31]
1225;
1226; GFX9-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
1227; GFX9:       ; %bb.0:
1228; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1229; GFX9-NEXT:    global_atomic_add v[0:1], v2, off offset:16
1230; GFX9-NEXT:    s_waitcnt vmcnt(0)
1231; GFX9-NEXT:    buffer_wbinvl1_vol
1232; GFX9-NEXT:    s_setpc_b64 s[30:31]
1233  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1234  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1235  ret void
1236}
1237
; atomicrmw add (seq_cst) of an i32 located 4 i32 elements (16 bytes) past
; %out, with the atomicrmw tagged by !amdgpu.no.remote.memory metadata; the
; value produced by the atomicrmw is returned.
1238define i32 @global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
1239; SI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1240; SI:       ; %bb.0:
1241; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1242; SI-NEXT:    s_mov_b32 s6, 0
1243; SI-NEXT:    s_mov_b32 s7, 0xf000
1244; SI-NEXT:    s_mov_b32 s4, s6
1245; SI-NEXT:    s_mov_b32 s5, s6
1246; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1247; SI-NEXT:    s_waitcnt vmcnt(0)
1248; SI-NEXT:    buffer_wbinvl1
1249; SI-NEXT:    v_mov_b32_e32 v0, v2
1250; SI-NEXT:    s_waitcnt expcnt(0)
1251; SI-NEXT:    s_setpc_b64 s[30:31]
1252;
1253; VI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1254; VI:       ; %bb.0:
1255; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1256; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1257; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1258; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
1259; VI-NEXT:    s_waitcnt vmcnt(0)
1260; VI-NEXT:    buffer_wbinvl1_vol
1261; VI-NEXT:    s_setpc_b64 s[30:31]
1262;
1263; GFX9-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1264; GFX9:       ; %bb.0:
1265; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1266; GFX9-NEXT:    global_atomic_add v0, v[0:1], v2, off offset:16 glc
1267; GFX9-NEXT:    s_waitcnt vmcnt(0)
1268; GFX9-NEXT:    buffer_wbinvl1_vol
1269; GFX9-NEXT:    s_setpc_b64 s[30:31]
1270  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1271  %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1272  ret i32 %result
1273}
1274
1275; ---------------------------------------------------------------------
1276; atomicrmw sub
1277; ---------------------------------------------------------------------
1278
; atomicrmw sub (seq_cst) of an i32 directly at %ptr (no offset); the result of
; the atomicrmw is unused and the function returns void.
1279define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
1280; SI-LABEL: global_atomic_sub_i32_noret:
1281; SI:       ; %bb.0:
1282; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1283; SI-NEXT:    s_mov_b32 s6, 0
1284; SI-NEXT:    s_mov_b32 s7, 0xf000
1285; SI-NEXT:    s_mov_b32 s4, s6
1286; SI-NEXT:    s_mov_b32 s5, s6
1287; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64
1288; SI-NEXT:    s_waitcnt vmcnt(0)
1289; SI-NEXT:    buffer_wbinvl1
1290; SI-NEXT:    s_waitcnt expcnt(0)
1291; SI-NEXT:    s_setpc_b64 s[30:31]
1292;
1293; VI-LABEL: global_atomic_sub_i32_noret:
1294; VI:       ; %bb.0:
1295; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1296; VI-NEXT:    flat_atomic_sub v[0:1], v2
1297; VI-NEXT:    s_waitcnt vmcnt(0)
1298; VI-NEXT:    buffer_wbinvl1_vol
1299; VI-NEXT:    s_setpc_b64 s[30:31]
1300;
1301; GFX9-LABEL: global_atomic_sub_i32_noret:
1302; GFX9:       ; %bb.0:
1303; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1304; GFX9-NEXT:    global_atomic_sub v[0:1], v2, off
1305; GFX9-NEXT:    s_waitcnt vmcnt(0)
1306; GFX9-NEXT:    buffer_wbinvl1_vol
1307; GFX9-NEXT:    s_setpc_b64 s[30:31]
1308  %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
1309  ret void
1310}
1311
; atomicrmw sub (seq_cst) of an i32 located 4 i32 elements (16 bytes) past
; %out; the result of the atomicrmw is unused and the function returns void.
1312define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
1313; SI-LABEL: global_atomic_sub_i32_noret_offset:
1314; SI:       ; %bb.0:
1315; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1316; SI-NEXT:    s_mov_b32 s6, 0
1317; SI-NEXT:    s_mov_b32 s7, 0xf000
1318; SI-NEXT:    s_mov_b32 s4, s6
1319; SI-NEXT:    s_mov_b32 s5, s6
1320; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16
1321; SI-NEXT:    s_waitcnt vmcnt(0)
1322; SI-NEXT:    buffer_wbinvl1
1323; SI-NEXT:    s_waitcnt expcnt(0)
1324; SI-NEXT:    s_setpc_b64 s[30:31]
1325;
1326; VI-LABEL: global_atomic_sub_i32_noret_offset:
1327; VI:       ; %bb.0:
1328; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1329; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1330; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1331; VI-NEXT:    flat_atomic_sub v[0:1], v2
1332; VI-NEXT:    s_waitcnt vmcnt(0)
1333; VI-NEXT:    buffer_wbinvl1_vol
1334; VI-NEXT:    s_setpc_b64 s[30:31]
1335;
1336; GFX9-LABEL: global_atomic_sub_i32_noret_offset:
1337; GFX9:       ; %bb.0:
1338; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1339; GFX9-NEXT:    global_atomic_sub v[0:1], v2, off offset:16
1340; GFX9-NEXT:    s_waitcnt vmcnt(0)
1341; GFX9-NEXT:    buffer_wbinvl1_vol
1342; GFX9-NEXT:    s_setpc_b64 s[30:31]
1343  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1344  %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
1345  ret void
1346}
1347
; atomicrmw sub (seq_cst) of an i32 directly at %ptr (no offset); the value
; produced by the atomicrmw is returned.
1348define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
1349; SI-LABEL: global_atomic_sub_i32_ret:
1350; SI:       ; %bb.0:
1351; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1352; SI-NEXT:    s_mov_b32 s6, 0
1353; SI-NEXT:    s_mov_b32 s7, 0xf000
1354; SI-NEXT:    s_mov_b32 s4, s6
1355; SI-NEXT:    s_mov_b32 s5, s6
1356; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc
1357; SI-NEXT:    s_waitcnt vmcnt(0)
1358; SI-NEXT:    buffer_wbinvl1
1359; SI-NEXT:    v_mov_b32_e32 v0, v2
1360; SI-NEXT:    s_waitcnt expcnt(0)
1361; SI-NEXT:    s_setpc_b64 s[30:31]
1362;
1363; VI-LABEL: global_atomic_sub_i32_ret:
1364; VI:       ; %bb.0:
1365; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1366; VI-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1367; VI-NEXT:    s_waitcnt vmcnt(0)
1368; VI-NEXT:    buffer_wbinvl1_vol
1369; VI-NEXT:    s_setpc_b64 s[30:31]
1370;
1371; GFX9-LABEL: global_atomic_sub_i32_ret:
1372; GFX9:       ; %bb.0:
1373; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1374; GFX9-NEXT:    global_atomic_sub v0, v[0:1], v2, off glc
1375; GFX9-NEXT:    s_waitcnt vmcnt(0)
1376; GFX9-NEXT:    buffer_wbinvl1_vol
1377; GFX9-NEXT:    s_setpc_b64 s[30:31]
1378  %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
1379  ret i32 %result
1380}
1381
; atomicrmw sub (seq_cst) of an i32 located 4 i32 elements (16 bytes) past
; %out; the value produced by the atomicrmw is returned.
1382define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
1383; SI-LABEL: global_atomic_sub_i32_ret_offset:
1384; SI:       ; %bb.0:
1385; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1386; SI-NEXT:    s_mov_b32 s6, 0
1387; SI-NEXT:    s_mov_b32 s7, 0xf000
1388; SI-NEXT:    s_mov_b32 s4, s6
1389; SI-NEXT:    s_mov_b32 s5, s6
1390; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1391; SI-NEXT:    s_waitcnt vmcnt(0)
1392; SI-NEXT:    buffer_wbinvl1
1393; SI-NEXT:    v_mov_b32_e32 v0, v2
1394; SI-NEXT:    s_waitcnt expcnt(0)
1395; SI-NEXT:    s_setpc_b64 s[30:31]
1396;
1397; VI-LABEL: global_atomic_sub_i32_ret_offset:
1398; VI:       ; %bb.0:
1399; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1400; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1401; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1402; VI-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1403; VI-NEXT:    s_waitcnt vmcnt(0)
1404; VI-NEXT:    buffer_wbinvl1_vol
1405; VI-NEXT:    s_setpc_b64 s[30:31]
1406;
1407; GFX9-LABEL: global_atomic_sub_i32_ret_offset:
1408; GFX9:       ; %bb.0:
1409; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1410; GFX9-NEXT:    global_atomic_sub v0, v[0:1], v2, off offset:16 glc
1411; GFX9-NEXT:    s_waitcnt vmcnt(0)
1412; GFX9-NEXT:    buffer_wbinvl1_vol
1413; GFX9-NEXT:    s_setpc_b64 s[30:31]
1414  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1415  %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
1416  ret i32 %result
1417}
1418
; atomicrmw sub (seq_cst) of an i32 directly at %ptr (no offset). Uses the
; amdgpu_gfx calling convention with both the pointer and the operand marked
; inreg; the result of the atomicrmw is unused and the function returns void.
1419define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1420; SI-LABEL: global_atomic_sub_i32_noret_scalar:
1421; SI:       ; %bb.0:
1422; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1423; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1424; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1425; SI-NEXT:    s_mov_b64 exec, s[34:35]
1426; SI-NEXT:    s_waitcnt expcnt(0)
1427; SI-NEXT:    v_writelane_b32 v1, s6, 0
1428; SI-NEXT:    v_writelane_b32 v1, s7, 1
1429; SI-NEXT:    s_mov_b32 s34, s6
1430; SI-NEXT:    s_mov_b32 s7, 0xf000
1431; SI-NEXT:    s_mov_b32 s6, -1
1432; SI-NEXT:    v_mov_b32_e32 v0, s34
1433; SI-NEXT:    s_waitcnt vmcnt(0)
1434; SI-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0
1435; SI-NEXT:    s_waitcnt vmcnt(0)
1436; SI-NEXT:    buffer_wbinvl1
1437; SI-NEXT:    v_readlane_b32 s7, v1, 1
1438; SI-NEXT:    v_readlane_b32 s6, v1, 0
1439; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1440; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1441; SI-NEXT:    s_mov_b64 exec, s[34:35]
1442; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1443; SI-NEXT:    s_setpc_b64 s[30:31]
1444;
1445; VI-LABEL: global_atomic_sub_i32_noret_scalar:
1446; VI:       ; %bb.0:
1447; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1448; VI-NEXT:    v_mov_b32_e32 v0, s4
1449; VI-NEXT:    v_mov_b32_e32 v1, s5
1450; VI-NEXT:    v_mov_b32_e32 v2, s6
1451; VI-NEXT:    flat_atomic_sub v[0:1], v2
1452; VI-NEXT:    s_waitcnt vmcnt(0)
1453; VI-NEXT:    buffer_wbinvl1_vol
1454; VI-NEXT:    s_setpc_b64 s[30:31]
1455;
1456; GFX9-LABEL: global_atomic_sub_i32_noret_scalar:
1457; GFX9:       ; %bb.0:
1458; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1459; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1460; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1461; GFX9-NEXT:    global_atomic_sub v0, v1, s[4:5]
1462; GFX9-NEXT:    s_waitcnt vmcnt(0)
1463; GFX9-NEXT:    buffer_wbinvl1_vol
1464; GFX9-NEXT:    s_setpc_b64 s[30:31]
1465  %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
1466  ret void
1467}
1468
; atomicrmw sub (seq_cst) of an i32 located 4 i32 elements (16 bytes) past
; %out. Uses the amdgpu_gfx calling convention with both the pointer and the
; operand marked inreg; the result is unused and the function returns void.
1469define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1470; SI-LABEL: global_atomic_sub_i32_noret_offset_scalar:
1471; SI:       ; %bb.0:
1472; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1473; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1474; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1475; SI-NEXT:    s_mov_b64 exec, s[34:35]
1476; SI-NEXT:    s_waitcnt expcnt(0)
1477; SI-NEXT:    v_writelane_b32 v1, s6, 0
1478; SI-NEXT:    v_writelane_b32 v1, s7, 1
1479; SI-NEXT:    s_mov_b32 s34, s6
1480; SI-NEXT:    s_mov_b32 s7, 0xf000
1481; SI-NEXT:    s_mov_b32 s6, -1
1482; SI-NEXT:    v_mov_b32_e32 v0, s34
1483; SI-NEXT:    s_waitcnt vmcnt(0)
1484; SI-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 offset:16
1485; SI-NEXT:    s_waitcnt vmcnt(0)
1486; SI-NEXT:    buffer_wbinvl1
1487; SI-NEXT:    v_readlane_b32 s7, v1, 1
1488; SI-NEXT:    v_readlane_b32 s6, v1, 0
1489; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1490; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1491; SI-NEXT:    s_mov_b64 exec, s[34:35]
1492; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1493; SI-NEXT:    s_setpc_b64 s[30:31]
1494;
1495; VI-LABEL: global_atomic_sub_i32_noret_offset_scalar:
1496; VI:       ; %bb.0:
1497; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1498; VI-NEXT:    s_add_u32 s34, s4, 16
1499; VI-NEXT:    s_addc_u32 s35, s5, 0
1500; VI-NEXT:    v_mov_b32_e32 v0, s34
1501; VI-NEXT:    v_mov_b32_e32 v1, s35
1502; VI-NEXT:    v_mov_b32_e32 v2, s6
1503; VI-NEXT:    flat_atomic_sub v[0:1], v2
1504; VI-NEXT:    s_waitcnt vmcnt(0)
1505; VI-NEXT:    buffer_wbinvl1_vol
1506; VI-NEXT:    s_setpc_b64 s[30:31]
1507;
1508; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar:
1509; GFX9:       ; %bb.0:
1510; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1511; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1512; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1513; GFX9-NEXT:    global_atomic_sub v0, v1, s[4:5] offset:16
1514; GFX9-NEXT:    s_waitcnt vmcnt(0)
1515; GFX9-NEXT:    buffer_wbinvl1_vol
1516; GFX9-NEXT:    s_setpc_b64 s[30:31]
1517  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1518  %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
1519  ret void
1520}
1521
; atomicrmw sub (seq_cst) of an i32 directly at %ptr (no offset). Uses the
; amdgpu_gfx calling convention with both the pointer and the operand marked
; inreg; the value produced by the atomicrmw is returned.
1522define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1523; SI-LABEL: global_atomic_sub_i32_ret_scalar:
1524; SI:       ; %bb.0:
1525; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1526; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1527; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1528; SI-NEXT:    s_mov_b64 exec, s[34:35]
1529; SI-NEXT:    s_waitcnt expcnt(0)
1530; SI-NEXT:    v_writelane_b32 v1, s6, 0
1531; SI-NEXT:    v_writelane_b32 v1, s7, 1
1532; SI-NEXT:    s_mov_b32 s34, s6
1533; SI-NEXT:    s_mov_b32 s7, 0xf000
1534; SI-NEXT:    s_mov_b32 s6, -1
1535; SI-NEXT:    v_mov_b32_e32 v0, s34
1536; SI-NEXT:    s_waitcnt vmcnt(0)
1537; SI-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
1538; SI-NEXT:    s_waitcnt vmcnt(0)
1539; SI-NEXT:    buffer_wbinvl1
1540; SI-NEXT:    v_readlane_b32 s7, v1, 1
1541; SI-NEXT:    v_readlane_b32 s6, v1, 0
1542; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1543; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1544; SI-NEXT:    s_mov_b64 exec, s[34:35]
1545; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1546; SI-NEXT:    s_setpc_b64 s[30:31]
1547;
1548; VI-LABEL: global_atomic_sub_i32_ret_scalar:
1549; VI:       ; %bb.0:
1550; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1551; VI-NEXT:    v_mov_b32_e32 v0, s4
1552; VI-NEXT:    v_mov_b32_e32 v1, s5
1553; VI-NEXT:    v_mov_b32_e32 v2, s6
1554; VI-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1555; VI-NEXT:    s_waitcnt vmcnt(0)
1556; VI-NEXT:    buffer_wbinvl1_vol
1557; VI-NEXT:    s_setpc_b64 s[30:31]
1558;
1559; GFX9-LABEL: global_atomic_sub_i32_ret_scalar:
1560; GFX9:       ; %bb.0:
1561; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1562; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1563; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1564; GFX9-NEXT:    global_atomic_sub v0, v0, v1, s[4:5] glc
1565; GFX9-NEXT:    s_waitcnt vmcnt(0)
1566; GFX9-NEXT:    buffer_wbinvl1_vol
1567; GFX9-NEXT:    s_setpc_b64 s[30:31]
1568  %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
1569  ret i32 %result
1570}
1571
; Returning seq_cst `atomicrmw sub` through a GEP of 4 i32 elements from an
; `inreg` global (addrspace(1)) pointer in an amdgpu_gfx function.
; The SI and GFX9 checks expect the displacement as an `offset:16` immediate
; on the atomic instruction; the VI checks instead expect the address to be
; formed with s_add_u32/s_addc_u32 before flat_atomic_sub. All three carry
; `glc` and write v0.
1572define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1573; SI-LABEL: global_atomic_sub_i32_ret_offset_scalar:
1574; SI:       ; %bb.0:
1575; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1576; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1577; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1578; SI-NEXT:    s_mov_b64 exec, s[34:35]
1579; SI-NEXT:    s_waitcnt expcnt(0)
1580; SI-NEXT:    v_writelane_b32 v1, s6, 0
1581; SI-NEXT:    v_writelane_b32 v1, s7, 1
1582; SI-NEXT:    s_mov_b32 s34, s6
1583; SI-NEXT:    s_mov_b32 s7, 0xf000
1584; SI-NEXT:    s_mov_b32 s6, -1
1585; SI-NEXT:    v_mov_b32_e32 v0, s34
1586; SI-NEXT:    s_waitcnt vmcnt(0)
1587; SI-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc
1588; SI-NEXT:    s_waitcnt vmcnt(0)
1589; SI-NEXT:    buffer_wbinvl1
1590; SI-NEXT:    v_readlane_b32 s7, v1, 1
1591; SI-NEXT:    v_readlane_b32 s6, v1, 0
1592; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1593; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1594; SI-NEXT:    s_mov_b64 exec, s[34:35]
1595; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1596; SI-NEXT:    s_setpc_b64 s[30:31]
1597;
1598; VI-LABEL: global_atomic_sub_i32_ret_offset_scalar:
1599; VI:       ; %bb.0:
1600; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1601; VI-NEXT:    s_add_u32 s34, s4, 16
1602; VI-NEXT:    s_addc_u32 s35, s5, 0
1603; VI-NEXT:    v_mov_b32_e32 v0, s34
1604; VI-NEXT:    v_mov_b32_e32 v1, s35
1605; VI-NEXT:    v_mov_b32_e32 v2, s6
1606; VI-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1607; VI-NEXT:    s_waitcnt vmcnt(0)
1608; VI-NEXT:    buffer_wbinvl1_vol
1609; VI-NEXT:    s_setpc_b64 s[30:31]
1610;
1611; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar:
1612; GFX9:       ; %bb.0:
1613; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1614; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1615; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1616; GFX9-NEXT:    global_atomic_sub v0, v0, v1, s[4:5] offset:16 glc
1617; GFX9-NEXT:    s_waitcnt vmcnt(0)
1618; GFX9-NEXT:    buffer_wbinvl1_vol
1619; GFX9-NEXT:    s_setpc_b64 s[30:31]
1620  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1621  %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
1622  ret i32 %result
1623}
1624
; Returning seq_cst `atomicrmw sub` with a constant operand of 0 on a global
; (addrspace(1)) pointer.
; None of the check prefixes expects a sub instruction here: each expects v2
; to be set to 0 and an atomic *add* (buffer_atomic_add / flat_atomic_add /
; global_atomic_add) with `glc` to be issued instead.
1625define i32 @global_atomic_sub_0_i32_ret(ptr addrspace(1) %ptr) {
1626; SI-LABEL: global_atomic_sub_0_i32_ret:
1627; SI:       ; %bb.0:
1628; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1629; SI-NEXT:    s_mov_b32 s7, 0xf000
1630; SI-NEXT:    s_mov_b32 s6, 0
1631; SI-NEXT:    v_mov_b32_e32 v2, 0
1632; SI-NEXT:    s_mov_b32 s4, s6
1633; SI-NEXT:    s_mov_b32 s5, s6
1634; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
1635; SI-NEXT:    s_waitcnt vmcnt(0)
1636; SI-NEXT:    buffer_wbinvl1
1637; SI-NEXT:    v_mov_b32_e32 v0, v2
1638; SI-NEXT:    s_waitcnt expcnt(0)
1639; SI-NEXT:    s_setpc_b64 s[30:31]
1640;
1641; VI-LABEL: global_atomic_sub_0_i32_ret:
1642; VI:       ; %bb.0:
1643; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1644; VI-NEXT:    v_mov_b32_e32 v2, 0
1645; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
1646; VI-NEXT:    s_waitcnt vmcnt(0)
1647; VI-NEXT:    buffer_wbinvl1_vol
1648; VI-NEXT:    s_setpc_b64 s[30:31]
1649;
1650; GFX9-LABEL: global_atomic_sub_0_i32_ret:
1651; GFX9:       ; %bb.0:
1652; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1653; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1654; GFX9-NEXT:    global_atomic_add v0, v[0:1], v2, off glc
1655; GFX9-NEXT:    s_waitcnt vmcnt(0)
1656; GFX9-NEXT:    buffer_wbinvl1_vol
1657; GFX9-NEXT:    s_setpc_b64 s[30:31]
1658  %result = atomicrmw sub ptr addrspace(1) %ptr, i32 0 seq_cst
1659  ret i32 %result
1660}
1661
; Non-returning seq_cst `atomicrmw sub` tagged with `!amdgpu.no.remote.memory`
; metadata, applied through a GEP of 4 i32 elements (i64 index) from a global
; (addrspace(1)) pointer.
; The checks expect a single hardware atomic sub without `glc` for every
; prefix: `offset:16` is folded into the instruction for SI and GFX9, while
; VI adds 16 to the address with v_add_u32/v_addc_u32 first.
1662define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
1663; SI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1664; SI:       ; %bb.0:
1665; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1666; SI-NEXT:    s_mov_b32 s6, 0
1667; SI-NEXT:    s_mov_b32 s7, 0xf000
1668; SI-NEXT:    s_mov_b32 s4, s6
1669; SI-NEXT:    s_mov_b32 s5, s6
1670; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16
1671; SI-NEXT:    s_waitcnt vmcnt(0)
1672; SI-NEXT:    buffer_wbinvl1
1673; SI-NEXT:    s_waitcnt expcnt(0)
1674; SI-NEXT:    s_setpc_b64 s[30:31]
1675;
1676; VI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1677; VI:       ; %bb.0:
1678; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1679; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1680; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1681; VI-NEXT:    flat_atomic_sub v[0:1], v2
1682; VI-NEXT:    s_waitcnt vmcnt(0)
1683; VI-NEXT:    buffer_wbinvl1_vol
1684; VI-NEXT:    s_setpc_b64 s[30:31]
1685;
1686; GFX9-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1687; GFX9:       ; %bb.0:
1688; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1689; GFX9-NEXT:    global_atomic_sub v[0:1], v2, off offset:16
1690; GFX9-NEXT:    s_waitcnt vmcnt(0)
1691; GFX9-NEXT:    buffer_wbinvl1_vol
1692; GFX9-NEXT:    s_setpc_b64 s[30:31]
1693  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1694  %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1695  ret void
1696}
1697
; Returning seq_cst `atomicrmw sub` tagged with `!amdgpu.no.remote.memory`
; metadata, applied through a GEP of 4 i32 elements (i64 index) from a global
; (addrspace(1)) pointer.
; The checks expect a single hardware atomic sub with `glc` for every prefix:
; `offset:16` is folded into the instruction for SI and GFX9, while VI adds 16
; to the address first. On SI the result lands in v2 and is copied to v0.
1698define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
1699; SI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1700; SI:       ; %bb.0:
1701; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1702; SI-NEXT:    s_mov_b32 s6, 0
1703; SI-NEXT:    s_mov_b32 s7, 0xf000
1704; SI-NEXT:    s_mov_b32 s4, s6
1705; SI-NEXT:    s_mov_b32 s5, s6
1706; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1707; SI-NEXT:    s_waitcnt vmcnt(0)
1708; SI-NEXT:    buffer_wbinvl1
1709; SI-NEXT:    v_mov_b32_e32 v0, v2
1710; SI-NEXT:    s_waitcnt expcnt(0)
1711; SI-NEXT:    s_setpc_b64 s[30:31]
1712;
1713; VI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1714; VI:       ; %bb.0:
1715; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1716; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1717; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1718; VI-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
1719; VI-NEXT:    s_waitcnt vmcnt(0)
1720; VI-NEXT:    buffer_wbinvl1_vol
1721; VI-NEXT:    s_setpc_b64 s[30:31]
1722;
1723; GFX9-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1724; GFX9:       ; %bb.0:
1725; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1726; GFX9-NEXT:    global_atomic_sub v0, v[0:1], v2, off offset:16 glc
1727; GFX9-NEXT:    s_waitcnt vmcnt(0)
1728; GFX9-NEXT:    buffer_wbinvl1_vol
1729; GFX9-NEXT:    s_setpc_b64 s[30:31]
1730  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1731  %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1732  ret i32 %result
1733}
1734
1735; ---------------------------------------------------------------------
1736; atomicrmw and
1737; ---------------------------------------------------------------------
1738
; Non-returning seq_cst `atomicrmw and` on a global (addrspace(1)) pointer
; with no address offset; the result (%tmp0) is unused.
; The SI, VI and GFX9 checks expect buffer_atomic_and (addr64),
; flat_atomic_and and global_atomic_and respectively, all without `glc`,
; followed by a vmcnt(0) wait and a buffer_wbinvl1 / buffer_wbinvl1_vol.
1739define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
1740; SI-LABEL: global_atomic_and_i32_noret:
1741; SI:       ; %bb.0:
1742; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1743; SI-NEXT:    s_mov_b32 s6, 0
1744; SI-NEXT:    s_mov_b32 s7, 0xf000
1745; SI-NEXT:    s_mov_b32 s4, s6
1746; SI-NEXT:    s_mov_b32 s5, s6
1747; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64
1748; SI-NEXT:    s_waitcnt vmcnt(0)
1749; SI-NEXT:    buffer_wbinvl1
1750; SI-NEXT:    s_waitcnt expcnt(0)
1751; SI-NEXT:    s_setpc_b64 s[30:31]
1752;
1753; VI-LABEL: global_atomic_and_i32_noret:
1754; VI:       ; %bb.0:
1755; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1756; VI-NEXT:    flat_atomic_and v[0:1], v2
1757; VI-NEXT:    s_waitcnt vmcnt(0)
1758; VI-NEXT:    buffer_wbinvl1_vol
1759; VI-NEXT:    s_setpc_b64 s[30:31]
1760;
1761; GFX9-LABEL: global_atomic_and_i32_noret:
1762; GFX9:       ; %bb.0:
1763; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1764; GFX9-NEXT:    global_atomic_and v[0:1], v2, off
1765; GFX9-NEXT:    s_waitcnt vmcnt(0)
1766; GFX9-NEXT:    buffer_wbinvl1_vol
1767; GFX9-NEXT:    s_setpc_b64 s[30:31]
1768  %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
1769  ret void
1770}
1771
; Non-returning seq_cst `atomicrmw and` through a GEP of 4 i32 elements from
; a global (addrspace(1)) pointer.
; The SI and GFX9 checks expect the displacement folded into the atomic as
; `offset:16`; the VI checks expect 16 to be added to the 64-bit address with
; v_add_u32/v_addc_u32 before flat_atomic_and. No `glc` is expected.
1772define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
1773; SI-LABEL: global_atomic_and_i32_noret_offset:
1774; SI:       ; %bb.0:
1775; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1776; SI-NEXT:    s_mov_b32 s6, 0
1777; SI-NEXT:    s_mov_b32 s7, 0xf000
1778; SI-NEXT:    s_mov_b32 s4, s6
1779; SI-NEXT:    s_mov_b32 s5, s6
1780; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16
1781; SI-NEXT:    s_waitcnt vmcnt(0)
1782; SI-NEXT:    buffer_wbinvl1
1783; SI-NEXT:    s_waitcnt expcnt(0)
1784; SI-NEXT:    s_setpc_b64 s[30:31]
1785;
1786; VI-LABEL: global_atomic_and_i32_noret_offset:
1787; VI:       ; %bb.0:
1788; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1789; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1790; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1791; VI-NEXT:    flat_atomic_and v[0:1], v2
1792; VI-NEXT:    s_waitcnt vmcnt(0)
1793; VI-NEXT:    buffer_wbinvl1_vol
1794; VI-NEXT:    s_setpc_b64 s[30:31]
1795;
1796; GFX9-LABEL: global_atomic_and_i32_noret_offset:
1797; GFX9:       ; %bb.0:
1798; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1799; GFX9-NEXT:    global_atomic_and v[0:1], v2, off offset:16
1800; GFX9-NEXT:    s_waitcnt vmcnt(0)
1801; GFX9-NEXT:    buffer_wbinvl1_vol
1802; GFX9-NEXT:    s_setpc_b64 s[30:31]
1803  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1804  %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
1805  ret void
1806}
1807
; Returning seq_cst `atomicrmw and` on a global (addrspace(1)) pointer with
; no address offset.
; Every check prefix expects the atomic instruction to carry `glc`. On SI
; the result is produced in v2 and copied to v0; on VI and GFX9 the atomic
; writes v0 directly.
1808define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
1809; SI-LABEL: global_atomic_and_i32_ret:
1810; SI:       ; %bb.0:
1811; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1812; SI-NEXT:    s_mov_b32 s6, 0
1813; SI-NEXT:    s_mov_b32 s7, 0xf000
1814; SI-NEXT:    s_mov_b32 s4, s6
1815; SI-NEXT:    s_mov_b32 s5, s6
1816; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc
1817; SI-NEXT:    s_waitcnt vmcnt(0)
1818; SI-NEXT:    buffer_wbinvl1
1819; SI-NEXT:    v_mov_b32_e32 v0, v2
1820; SI-NEXT:    s_waitcnt expcnt(0)
1821; SI-NEXT:    s_setpc_b64 s[30:31]
1822;
1823; VI-LABEL: global_atomic_and_i32_ret:
1824; VI:       ; %bb.0:
1825; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1826; VI-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1827; VI-NEXT:    s_waitcnt vmcnt(0)
1828; VI-NEXT:    buffer_wbinvl1_vol
1829; VI-NEXT:    s_setpc_b64 s[30:31]
1830;
1831; GFX9-LABEL: global_atomic_and_i32_ret:
1832; GFX9:       ; %bb.0:
1833; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1834; GFX9-NEXT:    global_atomic_and v0, v[0:1], v2, off glc
1835; GFX9-NEXT:    s_waitcnt vmcnt(0)
1836; GFX9-NEXT:    buffer_wbinvl1_vol
1837; GFX9-NEXT:    s_setpc_b64 s[30:31]
1838  %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
1839  ret i32 %result
1840}
1841
; Returning seq_cst `atomicrmw and` through a GEP of 4 i32 elements from a
; global (addrspace(1)) pointer.
; The SI and GFX9 checks expect `offset:16` on the atomic instruction; the VI
; checks expect the address to be advanced by 16 with v_add_u32/v_addc_u32
; first. All prefixes expect `glc`; on SI the result in v2 is copied to v0.
1842define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
1843; SI-LABEL: global_atomic_and_i32_ret_offset:
1844; SI:       ; %bb.0:
1845; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1846; SI-NEXT:    s_mov_b32 s6, 0
1847; SI-NEXT:    s_mov_b32 s7, 0xf000
1848; SI-NEXT:    s_mov_b32 s4, s6
1849; SI-NEXT:    s_mov_b32 s5, s6
1850; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1851; SI-NEXT:    s_waitcnt vmcnt(0)
1852; SI-NEXT:    buffer_wbinvl1
1853; SI-NEXT:    v_mov_b32_e32 v0, v2
1854; SI-NEXT:    s_waitcnt expcnt(0)
1855; SI-NEXT:    s_setpc_b64 s[30:31]
1856;
1857; VI-LABEL: global_atomic_and_i32_ret_offset:
1858; VI:       ; %bb.0:
1859; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1860; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
1861; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1862; VI-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
1863; VI-NEXT:    s_waitcnt vmcnt(0)
1864; VI-NEXT:    buffer_wbinvl1_vol
1865; VI-NEXT:    s_setpc_b64 s[30:31]
1866;
1867; GFX9-LABEL: global_atomic_and_i32_ret_offset:
1868; GFX9:       ; %bb.0:
1869; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1870; GFX9-NEXT:    global_atomic_and v0, v[0:1], v2, off offset:16 glc
1871; GFX9-NEXT:    s_waitcnt vmcnt(0)
1872; GFX9-NEXT:    buffer_wbinvl1_vol
1873; GFX9-NEXT:    s_setpc_b64 s[30:31]
1874  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1875  %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
1876  ret i32 %result
1877}
1878
; Non-returning seq_cst `atomicrmw and` in an amdgpu_gfx function whose
; global (addrspace(1)) pointer and i32 operand are both passed `inreg`.
; The SI checks expect s6/s7 to be saved into lanes of v1 (v1 itself spilled
; to the stack), then overwritten with -1 / 0xf000 so that s[4:7] can be used
; by buffer_atomic_and, and restored afterwards. The VI checks copy s4/s5/s6
; into VGPRs for flat_atomic_and; the GFX9 checks use s[4:5] directly as the
; base of global_atomic_and with a zero VGPR offset. No `glc` is expected.
1879define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1880; SI-LABEL: global_atomic_and_i32_noret_scalar:
1881; SI:       ; %bb.0:
1882; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1883; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1884; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1885; SI-NEXT:    s_mov_b64 exec, s[34:35]
1886; SI-NEXT:    s_waitcnt expcnt(0)
1887; SI-NEXT:    v_writelane_b32 v1, s6, 0
1888; SI-NEXT:    v_writelane_b32 v1, s7, 1
1889; SI-NEXT:    s_mov_b32 s34, s6
1890; SI-NEXT:    s_mov_b32 s7, 0xf000
1891; SI-NEXT:    s_mov_b32 s6, -1
1892; SI-NEXT:    v_mov_b32_e32 v0, s34
1893; SI-NEXT:    s_waitcnt vmcnt(0)
1894; SI-NEXT:    buffer_atomic_and v0, off, s[4:7], 0
1895; SI-NEXT:    s_waitcnt vmcnt(0)
1896; SI-NEXT:    buffer_wbinvl1
1897; SI-NEXT:    v_readlane_b32 s7, v1, 1
1898; SI-NEXT:    v_readlane_b32 s6, v1, 0
1899; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1900; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1901; SI-NEXT:    s_mov_b64 exec, s[34:35]
1902; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1903; SI-NEXT:    s_setpc_b64 s[30:31]
1904;
1905; VI-LABEL: global_atomic_and_i32_noret_scalar:
1906; VI:       ; %bb.0:
1907; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1908; VI-NEXT:    v_mov_b32_e32 v0, s4
1909; VI-NEXT:    v_mov_b32_e32 v1, s5
1910; VI-NEXT:    v_mov_b32_e32 v2, s6
1911; VI-NEXT:    flat_atomic_and v[0:1], v2
1912; VI-NEXT:    s_waitcnt vmcnt(0)
1913; VI-NEXT:    buffer_wbinvl1_vol
1914; VI-NEXT:    s_setpc_b64 s[30:31]
1915;
1916; GFX9-LABEL: global_atomic_and_i32_noret_scalar:
1917; GFX9:       ; %bb.0:
1918; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1919; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1920; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1921; GFX9-NEXT:    global_atomic_and v0, v1, s[4:5]
1922; GFX9-NEXT:    s_waitcnt vmcnt(0)
1923; GFX9-NEXT:    buffer_wbinvl1_vol
1924; GFX9-NEXT:    s_setpc_b64 s[30:31]
1925  %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
1926  ret void
1927}
1928
; Non-returning seq_cst `atomicrmw and` through a GEP of 4 i32 elements from
; an `inreg` global (addrspace(1)) pointer in an amdgpu_gfx function.
; The SI and GFX9 checks expect the displacement as `offset:16` on the atomic
; instruction; the VI checks expect the address to be formed in s[34:35] with
; s_add_u32/s_addc_u32 and then copied to VGPRs for flat_atomic_and.
; The SI sequence also saves and restores s6/s7 via lanes of v1.
1929define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1930; SI-LABEL: global_atomic_and_i32_noret_offset_scalar:
1931; SI:       ; %bb.0:
1932; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1933; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1934; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1935; SI-NEXT:    s_mov_b64 exec, s[34:35]
1936; SI-NEXT:    s_waitcnt expcnt(0)
1937; SI-NEXT:    v_writelane_b32 v1, s6, 0
1938; SI-NEXT:    v_writelane_b32 v1, s7, 1
1939; SI-NEXT:    s_mov_b32 s34, s6
1940; SI-NEXT:    s_mov_b32 s7, 0xf000
1941; SI-NEXT:    s_mov_b32 s6, -1
1942; SI-NEXT:    v_mov_b32_e32 v0, s34
1943; SI-NEXT:    s_waitcnt vmcnt(0)
1944; SI-NEXT:    buffer_atomic_and v0, off, s[4:7], 0 offset:16
1945; SI-NEXT:    s_waitcnt vmcnt(0)
1946; SI-NEXT:    buffer_wbinvl1
1947; SI-NEXT:    v_readlane_b32 s7, v1, 1
1948; SI-NEXT:    v_readlane_b32 s6, v1, 0
1949; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1950; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1951; SI-NEXT:    s_mov_b64 exec, s[34:35]
1952; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1953; SI-NEXT:    s_setpc_b64 s[30:31]
1954;
1955; VI-LABEL: global_atomic_and_i32_noret_offset_scalar:
1956; VI:       ; %bb.0:
1957; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1958; VI-NEXT:    s_add_u32 s34, s4, 16
1959; VI-NEXT:    s_addc_u32 s35, s5, 0
1960; VI-NEXT:    v_mov_b32_e32 v0, s34
1961; VI-NEXT:    v_mov_b32_e32 v1, s35
1962; VI-NEXT:    v_mov_b32_e32 v2, s6
1963; VI-NEXT:    flat_atomic_and v[0:1], v2
1964; VI-NEXT:    s_waitcnt vmcnt(0)
1965; VI-NEXT:    buffer_wbinvl1_vol
1966; VI-NEXT:    s_setpc_b64 s[30:31]
1967;
1968; GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar:
1969; GFX9:       ; %bb.0:
1970; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1971; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1972; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1973; GFX9-NEXT:    global_atomic_and v0, v1, s[4:5] offset:16
1974; GFX9-NEXT:    s_waitcnt vmcnt(0)
1975; GFX9-NEXT:    buffer_wbinvl1_vol
1976; GFX9-NEXT:    s_setpc_b64 s[30:31]
1977  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1978  %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
1979  ret void
1980}
1981
; Returning seq_cst `atomicrmw and` in an amdgpu_gfx function whose global
; (addrspace(1)) pointer and i32 operand are both passed `inreg`.
; The SI, VI and GFX9 checks expect buffer_atomic_and, flat_atomic_and and
; global_atomic_and respectively, each carrying `glc` and writing v0.
; The SI sequence also saves s6/s7 into lanes of v1 (v1 itself is spilled to
; the stack) before overwriting them with -1 / 0xf000, and restores them
; afterwards.
1982define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1983; SI-LABEL: global_atomic_and_i32_ret_scalar:
1984; SI:       ; %bb.0:
1985; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1986; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1987; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1988; SI-NEXT:    s_mov_b64 exec, s[34:35]
1989; SI-NEXT:    s_waitcnt expcnt(0)
1990; SI-NEXT:    v_writelane_b32 v1, s6, 0
1991; SI-NEXT:    v_writelane_b32 v1, s7, 1
1992; SI-NEXT:    s_mov_b32 s34, s6
1993; SI-NEXT:    s_mov_b32 s7, 0xf000
1994; SI-NEXT:    s_mov_b32 s6, -1
1995; SI-NEXT:    v_mov_b32_e32 v0, s34
1996; SI-NEXT:    s_waitcnt vmcnt(0)
1997; SI-NEXT:    buffer_atomic_and v0, off, s[4:7], 0 glc
1998; SI-NEXT:    s_waitcnt vmcnt(0)
1999; SI-NEXT:    buffer_wbinvl1
2000; SI-NEXT:    v_readlane_b32 s7, v1, 1
2001; SI-NEXT:    v_readlane_b32 s6, v1, 0
2002; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2003; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
2004; SI-NEXT:    s_mov_b64 exec, s[34:35]
2005; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2006; SI-NEXT:    s_setpc_b64 s[30:31]
2007;
2008; VI-LABEL: global_atomic_and_i32_ret_scalar:
2009; VI:       ; %bb.0:
2010; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2011; VI-NEXT:    v_mov_b32_e32 v0, s4
2012; VI-NEXT:    v_mov_b32_e32 v1, s5
2013; VI-NEXT:    v_mov_b32_e32 v2, s6
2014; VI-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
2015; VI-NEXT:    s_waitcnt vmcnt(0)
2016; VI-NEXT:    buffer_wbinvl1_vol
2017; VI-NEXT:    s_setpc_b64 s[30:31]
2018;
2019; GFX9-LABEL: global_atomic_and_i32_ret_scalar:
2020; GFX9:       ; %bb.0:
2021; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2022; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2023; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2024; GFX9-NEXT:    global_atomic_and v0, v0, v1, s[4:5] glc
2025; GFX9-NEXT:    s_waitcnt vmcnt(0)
2026; GFX9-NEXT:    buffer_wbinvl1_vol
2027; GFX9-NEXT:    s_setpc_b64 s[30:31]
2028  %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
2029  ret i32 %result
2030}
2031
; Returning seq_cst `atomicrmw and` through a GEP of 4 i32 elements from an
; `inreg` global (addrspace(1)) pointer in an amdgpu_gfx function.
; The SI and GFX9 checks expect the displacement as an `offset:16` immediate
; on the atomic instruction; the VI checks instead expect the address to be
; formed with s_add_u32/s_addc_u32 before flat_atomic_and. All three carry
; `glc` and write v0.
2032define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
2033; SI-LABEL: global_atomic_and_i32_ret_offset_scalar:
2034; SI:       ; %bb.0:
2035; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2036; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2037; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
2038; SI-NEXT:    s_mov_b64 exec, s[34:35]
2039; SI-NEXT:    s_waitcnt expcnt(0)
2040; SI-NEXT:    v_writelane_b32 v1, s6, 0
2041; SI-NEXT:    v_writelane_b32 v1, s7, 1
2042; SI-NEXT:    s_mov_b32 s34, s6
2043; SI-NEXT:    s_mov_b32 s7, 0xf000
2044; SI-NEXT:    s_mov_b32 s6, -1
2045; SI-NEXT:    v_mov_b32_e32 v0, s34
2046; SI-NEXT:    s_waitcnt vmcnt(0)
2047; SI-NEXT:    buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc
2048; SI-NEXT:    s_waitcnt vmcnt(0)
2049; SI-NEXT:    buffer_wbinvl1
2050; SI-NEXT:    v_readlane_b32 s7, v1, 1
2051; SI-NEXT:    v_readlane_b32 s6, v1, 0
2052; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2053; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
2054; SI-NEXT:    s_mov_b64 exec, s[34:35]
2055; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2056; SI-NEXT:    s_setpc_b64 s[30:31]
2057;
2058; VI-LABEL: global_atomic_and_i32_ret_offset_scalar:
2059; VI:       ; %bb.0:
2060; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2061; VI-NEXT:    s_add_u32 s34, s4, 16
2062; VI-NEXT:    s_addc_u32 s35, s5, 0
2063; VI-NEXT:    v_mov_b32_e32 v0, s34
2064; VI-NEXT:    v_mov_b32_e32 v1, s35
2065; VI-NEXT:    v_mov_b32_e32 v2, s6
2066; VI-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
2067; VI-NEXT:    s_waitcnt vmcnt(0)
2068; VI-NEXT:    buffer_wbinvl1_vol
2069; VI-NEXT:    s_setpc_b64 s[30:31]
2070;
2071; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar:
2072; GFX9:       ; %bb.0:
2073; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2074; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2075; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2076; GFX9-NEXT:    global_atomic_and v0, v0, v1, s[4:5] offset:16 glc
2077; GFX9-NEXT:    s_waitcnt vmcnt(0)
2078; GFX9-NEXT:    buffer_wbinvl1_vol
2079; GFX9-NEXT:    s_setpc_b64 s[30:31]
2080  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2081  %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
2082  ret i32 %result
2083}
2084
; Non-returning seq_cst `atomicrmw and` tagged with `!amdgpu.no.remote.memory`
; metadata, applied through a GEP of 4 i32 elements (i64 index) from a global
; (addrspace(1)) pointer.
; The expected instruction sequences are the same as those checked for
; @global_atomic_and_i32_noret_offset (which has no metadata and uses an i32
; GEP index): one hardware atomic and without `glc`, with `offset:16` folded
; for SI and GFX9 and an explicit 64-bit add of 16 for VI.
2085define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
2086; SI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
2087; SI:       ; %bb.0:
2088; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2089; SI-NEXT:    s_mov_b32 s6, 0
2090; SI-NEXT:    s_mov_b32 s7, 0xf000
2091; SI-NEXT:    s_mov_b32 s4, s6
2092; SI-NEXT:    s_mov_b32 s5, s6
2093; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16
2094; SI-NEXT:    s_waitcnt vmcnt(0)
2095; SI-NEXT:    buffer_wbinvl1
2096; SI-NEXT:    s_waitcnt expcnt(0)
2097; SI-NEXT:    s_setpc_b64 s[30:31]
2098;
2099; VI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
2100; VI:       ; %bb.0:
2101; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2102; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2103; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2104; VI-NEXT:    flat_atomic_and v[0:1], v2
2105; VI-NEXT:    s_waitcnt vmcnt(0)
2106; VI-NEXT:    buffer_wbinvl1_vol
2107; VI-NEXT:    s_setpc_b64 s[30:31]
2108;
2109; GFX9-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
2110; GFX9:       ; %bb.0:
2111; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2112; GFX9-NEXT:    global_atomic_and v[0:1], v2, off offset:16
2113; GFX9-NEXT:    s_waitcnt vmcnt(0)
2114; GFX9-NEXT:    buffer_wbinvl1_vol
2115; GFX9-NEXT:    s_setpc_b64 s[30:31]
2116  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2117  %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
2118  ret void
2119}
2120
; Returning seq_cst `atomicrmw and` tagged with `!amdgpu.no.remote.memory`
; metadata, applied through a GEP of 4 i32 elements (i64 index) from a global
; (addrspace(1)) pointer.
; The expected instruction sequences are the same as those checked for
; @global_atomic_and_i32_ret_offset (which has no metadata and uses an i32
; GEP index): one hardware atomic and with `glc`, with `offset:16` folded for
; SI and GFX9 and an explicit 64-bit add of 16 for VI.
2121define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
2122; SI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
2123; SI:       ; %bb.0:
2124; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2125; SI-NEXT:    s_mov_b32 s6, 0
2126; SI-NEXT:    s_mov_b32 s7, 0xf000
2127; SI-NEXT:    s_mov_b32 s4, s6
2128; SI-NEXT:    s_mov_b32 s5, s6
2129; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
2130; SI-NEXT:    s_waitcnt vmcnt(0)
2131; SI-NEXT:    buffer_wbinvl1
2132; SI-NEXT:    v_mov_b32_e32 v0, v2
2133; SI-NEXT:    s_waitcnt expcnt(0)
2134; SI-NEXT:    s_setpc_b64 s[30:31]
2135;
2136; VI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
2137; VI:       ; %bb.0:
2138; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2139; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2140; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2141; VI-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
2142; VI-NEXT:    s_waitcnt vmcnt(0)
2143; VI-NEXT:    buffer_wbinvl1_vol
2144; VI-NEXT:    s_setpc_b64 s[30:31]
2145;
2146; GFX9-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
2147; GFX9:       ; %bb.0:
2148; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2149; GFX9-NEXT:    global_atomic_and v0, v[0:1], v2, off offset:16 glc
2150; GFX9-NEXT:    s_waitcnt vmcnt(0)
2151; GFX9-NEXT:    buffer_wbinvl1_vol
2152; GFX9-NEXT:    s_setpc_b64 s[30:31]
2153  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2154  %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
2155  ret i32 %result
2156}
2157
2158; ---------------------------------------------------------------------
2159; atomicrmw nand
2160; ---------------------------------------------------------------------
2161
; Non-returning seq_cst `atomicrmw nand` on a global (addrspace(1)) pointer
; with no address offset; the result (%tmp0) is unused.
; No check prefix expects a single nand atomic instruction. Each expects the
; current value to be loaded once, followed by a loop (.LBB51_1, labelled
; %atomicrmw.start) that computes v_and + v_not, issues an atomic cmpswap
; with `glc`, compares the returned value against the expected one, and
; branches back with s_cbranch_execnz while any lane remains in exec.
; On SI the new/expected pair is first copied into v[5:6] for the cmpswap.
2162define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
2163; SI-LABEL: global_atomic_nand_i32_noret:
2164; SI:       ; %bb.0:
2165; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2166; SI-NEXT:    s_mov_b32 s6, 0
2167; SI-NEXT:    s_mov_b32 s7, 0xf000
2168; SI-NEXT:    s_mov_b32 s4, s6
2169; SI-NEXT:    s_mov_b32 s5, s6
2170; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
2171; SI-NEXT:    s_mov_b64 s[8:9], 0
2172; SI-NEXT:  .LBB51_1: ; %atomicrmw.start
2173; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2174; SI-NEXT:    s_waitcnt vmcnt(0)
2175; SI-NEXT:    v_and_b32_e32 v3, v4, v2
2176; SI-NEXT:    v_not_b32_e32 v3, v3
2177; SI-NEXT:    s_waitcnt expcnt(0)
2178; SI-NEXT:    v_mov_b32_e32 v6, v4
2179; SI-NEXT:    v_mov_b32_e32 v5, v3
2180; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
2181; SI-NEXT:    s_waitcnt vmcnt(0)
2182; SI-NEXT:    buffer_wbinvl1
2183; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
2184; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
2185; SI-NEXT:    v_mov_b32_e32 v4, v5
2186; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
2187; SI-NEXT:    s_cbranch_execnz .LBB51_1
2188; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2189; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
2190; SI-NEXT:    s_waitcnt expcnt(0)
2191; SI-NEXT:    s_setpc_b64 s[30:31]
2192;
2193; VI-LABEL: global_atomic_nand_i32_noret:
2194; VI:       ; %bb.0:
2195; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2196; VI-NEXT:    flat_load_dword v4, v[0:1]
2197; VI-NEXT:    s_mov_b64 s[4:5], 0
2198; VI-NEXT:  .LBB51_1: ; %atomicrmw.start
2199; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2200; VI-NEXT:    s_waitcnt vmcnt(0)
2201; VI-NEXT:    v_and_b32_e32 v3, v4, v2
2202; VI-NEXT:    v_not_b32_e32 v3, v3
2203; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2204; VI-NEXT:    s_waitcnt vmcnt(0)
2205; VI-NEXT:    buffer_wbinvl1_vol
2206; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2207; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2208; VI-NEXT:    v_mov_b32_e32 v4, v3
2209; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2210; VI-NEXT:    s_cbranch_execnz .LBB51_1
2211; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2212; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
2213; VI-NEXT:    s_setpc_b64 s[30:31]
2214;
2215; GFX9-LABEL: global_atomic_nand_i32_noret:
2216; GFX9:       ; %bb.0:
2217; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2218; GFX9-NEXT:    global_load_dword v4, v[0:1], off
2219; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2220; GFX9-NEXT:  .LBB51_1: ; %atomicrmw.start
2221; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2222; GFX9-NEXT:    s_waitcnt vmcnt(0)
2223; GFX9-NEXT:    v_and_b32_e32 v3, v4, v2
2224; GFX9-NEXT:    v_not_b32_e32 v3, v3
2225; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
2226; GFX9-NEXT:    s_waitcnt vmcnt(0)
2227; GFX9-NEXT:    buffer_wbinvl1_vol
2228; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2229; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2230; GFX9-NEXT:    v_mov_b32_e32 v4, v3
2231; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2232; GFX9-NEXT:    s_cbranch_execnz .LBB51_1
2233; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2234; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2235; GFX9-NEXT:    s_setpc_b64 s[30:31]
2236  %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
2237  ret void
2238}
2239
; Non-returning seq_cst `atomicrmw nand` through a GEP of 4 i32 elements from
; a global (addrspace(1)) pointer.
; As with the zero-offset form, the checks expect an initial load followed by
; a v_and + v_not / atomic cmpswap retry loop (.LBB52_1, %atomicrmw.start)
; rather than a single atomic instruction. The SI and GFX9 checks carry
; `offset:16` on both the load and the cmpswap; the VI checks add 16 to the
; address once with v_add_u32/v_addc_u32 before the load.
2240define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
2241; SI-LABEL: global_atomic_nand_i32_noret_offset:
2242; SI:       ; %bb.0:
2243; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2244; SI-NEXT:    s_mov_b32 s6, 0
2245; SI-NEXT:    s_mov_b32 s7, 0xf000
2246; SI-NEXT:    s_mov_b32 s4, s6
2247; SI-NEXT:    s_mov_b32 s5, s6
2248; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
2249; SI-NEXT:    s_mov_b64 s[8:9], 0
2250; SI-NEXT:  .LBB52_1: ; %atomicrmw.start
2251; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2252; SI-NEXT:    s_waitcnt vmcnt(0)
2253; SI-NEXT:    v_and_b32_e32 v3, v4, v2
2254; SI-NEXT:    v_not_b32_e32 v3, v3
2255; SI-NEXT:    s_waitcnt expcnt(0)
2256; SI-NEXT:    v_mov_b32_e32 v6, v4
2257; SI-NEXT:    v_mov_b32_e32 v5, v3
2258; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
2259; SI-NEXT:    s_waitcnt vmcnt(0)
2260; SI-NEXT:    buffer_wbinvl1
2261; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
2262; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
2263; SI-NEXT:    v_mov_b32_e32 v4, v5
2264; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
2265; SI-NEXT:    s_cbranch_execnz .LBB52_1
2266; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2267; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
2268; SI-NEXT:    s_waitcnt expcnt(0)
2269; SI-NEXT:    s_setpc_b64 s[30:31]
2270;
2271; VI-LABEL: global_atomic_nand_i32_noret_offset:
2272; VI:       ; %bb.0:
2273; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2274; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2275; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2276; VI-NEXT:    flat_load_dword v4, v[0:1]
2277; VI-NEXT:    s_mov_b64 s[4:5], 0
2278; VI-NEXT:  .LBB52_1: ; %atomicrmw.start
2279; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2280; VI-NEXT:    s_waitcnt vmcnt(0)
2281; VI-NEXT:    v_and_b32_e32 v3, v4, v2
2282; VI-NEXT:    v_not_b32_e32 v3, v3
2283; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2284; VI-NEXT:    s_waitcnt vmcnt(0)
2285; VI-NEXT:    buffer_wbinvl1_vol
2286; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2287; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2288; VI-NEXT:    v_mov_b32_e32 v4, v3
2289; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2290; VI-NEXT:    s_cbranch_execnz .LBB52_1
2291; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2292; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
2293; VI-NEXT:    s_setpc_b64 s[30:31]
2294;
2295; GFX9-LABEL: global_atomic_nand_i32_noret_offset:
2296; GFX9:       ; %bb.0:
2297; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2298; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
2299; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2300; GFX9-NEXT:  .LBB52_1: ; %atomicrmw.start
2301; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2302; GFX9-NEXT:    s_waitcnt vmcnt(0)
2303; GFX9-NEXT:    v_and_b32_e32 v3, v4, v2
2304; GFX9-NEXT:    v_not_b32_e32 v3, v3
2305; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
2306; GFX9-NEXT:    s_waitcnt vmcnt(0)
2307; GFX9-NEXT:    buffer_wbinvl1_vol
2308; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2309; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2310; GFX9-NEXT:    v_mov_b32_e32 v4, v3
2311; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2312; GFX9-NEXT:    s_cbranch_execnz .LBB52_1
2313; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2314; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2315; GFX9-NEXT:    s_setpc_b64 s[30:31]
2316  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2317  %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
2318  ret void
2319}
2320
; Returning variant with no offset: seq_cst `atomicrmw nand` directly on %ptr,
; with the value produced by the atomicrmw returned as the i32 result. The
; checks expect a load + and/not + cmpswap retry loop
; (%atomicrmw.start .. %atomicrmw.end), followed by a move of the cmpswap
; result (v3) into v0 before the return on every run.
2321define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
2322; SI-LABEL: global_atomic_nand_i32_ret:
2323; SI:       ; %bb.0:
2324; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2325; SI-NEXT:    s_mov_b32 s6, 0
2326; SI-NEXT:    s_mov_b32 s7, 0xf000
2327; SI-NEXT:    s_mov_b32 s4, s6
2328; SI-NEXT:    s_mov_b32 s5, s6
2329; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
2330; SI-NEXT:    s_mov_b64 s[8:9], 0
2331; SI-NEXT:  .LBB53_1: ; %atomicrmw.start
2332; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2333; SI-NEXT:    s_waitcnt vmcnt(0)
2334; SI-NEXT:    v_mov_b32_e32 v5, v3
2335; SI-NEXT:    s_waitcnt expcnt(0)
2336; SI-NEXT:    v_and_b32_e32 v3, v5, v2
2337; SI-NEXT:    v_not_b32_e32 v4, v3
2338; SI-NEXT:    v_mov_b32_e32 v3, v4
2339; SI-NEXT:    v_mov_b32_e32 v4, v5
2340; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
2341; SI-NEXT:    s_waitcnt vmcnt(0)
2342; SI-NEXT:    buffer_wbinvl1
2343; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2344; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
2345; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
2346; SI-NEXT:    s_cbranch_execnz .LBB53_1
2347; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2348; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
2349; SI-NEXT:    v_mov_b32_e32 v0, v3
2350; SI-NEXT:    s_waitcnt expcnt(0)
2351; SI-NEXT:    s_setpc_b64 s[30:31]
2352;
2353; VI-LABEL: global_atomic_nand_i32_ret:
2354; VI:       ; %bb.0:
2355; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2356; VI-NEXT:    flat_load_dword v3, v[0:1]
2357; VI-NEXT:    s_mov_b64 s[4:5], 0
2358; VI-NEXT:  .LBB53_1: ; %atomicrmw.start
2359; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2360; VI-NEXT:    s_waitcnt vmcnt(0)
2361; VI-NEXT:    v_mov_b32_e32 v4, v3
2362; VI-NEXT:    v_and_b32_e32 v3, v4, v2
2363; VI-NEXT:    v_not_b32_e32 v3, v3
2364; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2365; VI-NEXT:    s_waitcnt vmcnt(0)
2366; VI-NEXT:    buffer_wbinvl1_vol
2367; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2368; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2369; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2370; VI-NEXT:    s_cbranch_execnz .LBB53_1
2371; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2372; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
2373; VI-NEXT:    v_mov_b32_e32 v0, v3
2374; VI-NEXT:    s_setpc_b64 s[30:31]
2375;
2376; GFX9-LABEL: global_atomic_nand_i32_ret:
2377; GFX9:       ; %bb.0:
2378; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2379; GFX9-NEXT:    global_load_dword v3, v[0:1], off
2380; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2381; GFX9-NEXT:  .LBB53_1: ; %atomicrmw.start
2382; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2383; GFX9-NEXT:    s_waitcnt vmcnt(0)
2384; GFX9-NEXT:    v_mov_b32_e32 v4, v3
2385; GFX9-NEXT:    v_and_b32_e32 v3, v4, v2
2386; GFX9-NEXT:    v_not_b32_e32 v3, v3
2387; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
2388; GFX9-NEXT:    s_waitcnt vmcnt(0)
2389; GFX9-NEXT:    buffer_wbinvl1_vol
2390; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2391; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2392; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2393; GFX9-NEXT:    s_cbranch_execnz .LBB53_1
2394; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2395; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2396; GFX9-NEXT:    v_mov_b32_e32 v0, v3
2397; GFX9-NEXT:    s_setpc_b64 s[30:31]
2398  %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
2399  ret i32 %result
2400}
2401
; Returning seq_cst `atomicrmw nand` at %out + 4 i32 elements (16 bytes). The
; checks expect a load + and/not + cmpswap retry loop on every run. On the
; tonga (VI) run the address is first computed into v[3:4] and the loop result
; is produced directly in v0, so no trailing move into v0 appears there; the
; other two runs fold the offset as `offset:16` and end with a move v3 -> v0.
2402define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
2403; SI-LABEL: global_atomic_nand_i32_ret_offset:
2404; SI:       ; %bb.0:
2405; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2406; SI-NEXT:    s_mov_b32 s6, 0
2407; SI-NEXT:    s_mov_b32 s7, 0xf000
2408; SI-NEXT:    s_mov_b32 s4, s6
2409; SI-NEXT:    s_mov_b32 s5, s6
2410; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
2411; SI-NEXT:    s_mov_b64 s[8:9], 0
2412; SI-NEXT:  .LBB54_1: ; %atomicrmw.start
2413; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2414; SI-NEXT:    s_waitcnt vmcnt(0)
2415; SI-NEXT:    v_mov_b32_e32 v5, v3
2416; SI-NEXT:    s_waitcnt expcnt(0)
2417; SI-NEXT:    v_and_b32_e32 v3, v5, v2
2418; SI-NEXT:    v_not_b32_e32 v4, v3
2419; SI-NEXT:    v_mov_b32_e32 v3, v4
2420; SI-NEXT:    v_mov_b32_e32 v4, v5
2421; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
2422; SI-NEXT:    s_waitcnt vmcnt(0)
2423; SI-NEXT:    buffer_wbinvl1
2424; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2425; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
2426; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
2427; SI-NEXT:    s_cbranch_execnz .LBB54_1
2428; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2429; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
2430; SI-NEXT:    v_mov_b32_e32 v0, v3
2431; SI-NEXT:    s_waitcnt expcnt(0)
2432; SI-NEXT:    s_setpc_b64 s[30:31]
2433;
2434; VI-LABEL: global_atomic_nand_i32_ret_offset:
2435; VI:       ; %bb.0:
2436; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2437; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
2438; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
2439; VI-NEXT:    flat_load_dword v0, v[3:4]
2440; VI-NEXT:    s_mov_b64 s[4:5], 0
2441; VI-NEXT:  .LBB54_1: ; %atomicrmw.start
2442; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2443; VI-NEXT:    s_waitcnt vmcnt(0)
2444; VI-NEXT:    v_mov_b32_e32 v1, v0
2445; VI-NEXT:    v_and_b32_e32 v0, v1, v2
2446; VI-NEXT:    v_not_b32_e32 v0, v0
2447; VI-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2448; VI-NEXT:    s_waitcnt vmcnt(0)
2449; VI-NEXT:    buffer_wbinvl1_vol
2450; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2451; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2452; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2453; VI-NEXT:    s_cbranch_execnz .LBB54_1
2454; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2455; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
2456; VI-NEXT:    s_setpc_b64 s[30:31]
2457;
2458; GFX9-LABEL: global_atomic_nand_i32_ret_offset:
2459; GFX9:       ; %bb.0:
2460; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2461; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
2462; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2463; GFX9-NEXT:  .LBB54_1: ; %atomicrmw.start
2464; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2465; GFX9-NEXT:    s_waitcnt vmcnt(0)
2466; GFX9-NEXT:    v_mov_b32_e32 v4, v3
2467; GFX9-NEXT:    v_and_b32_e32 v3, v4, v2
2468; GFX9-NEXT:    v_not_b32_e32 v3, v3
2469; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
2470; GFX9-NEXT:    s_waitcnt vmcnt(0)
2471; GFX9-NEXT:    buffer_wbinvl1_vol
2472; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2473; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2474; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2475; GFX9-NEXT:    s_cbranch_execnz .LBB54_1
2476; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2477; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2478; GFX9-NEXT:    v_mov_b32_e32 v0, v3
2479; GFX9-NEXT:    s_setpc_b64 s[30:31]
2480  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2481  %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
2482  ret i32 %result
2483}
2484
; amdgpu_gfx calling convention with both arguments marked inreg; the checks
; show the pointer in s[4:5] and the nand operand in s6. Non-returning seq_cst
; nand with no offset, still expected as a cmpswap retry loop. On the SI run,
; s6/s7 are saved into lanes of v4 (itself spilled to the stack) before being
; overwritten to form the s[4:7] operand of the buffer instructions, and are
; restored after the loop; the nand operand is kept in s34 meanwhile.
2485define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
2486; SI-LABEL: global_atomic_nand_i32_noret_scalar:
2487; SI:       ; %bb.0:
2488; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2489; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2490; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
2491; SI-NEXT:    s_mov_b64 exec, s[34:35]
2492; SI-NEXT:    s_waitcnt expcnt(0)
2493; SI-NEXT:    v_writelane_b32 v4, s6, 0
2494; SI-NEXT:    v_writelane_b32 v4, s7, 1
2495; SI-NEXT:    s_mov_b32 s34, s6
2496; SI-NEXT:    s_mov_b32 s7, 0xf000
2497; SI-NEXT:    s_mov_b32 s6, -1
2498; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
2499; SI-NEXT:    s_mov_b64 s[36:37], 0
2500; SI-NEXT:  .LBB55_1: ; %atomicrmw.start
2501; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2502; SI-NEXT:    s_waitcnt vmcnt(0)
2503; SI-NEXT:    v_and_b32_e32 v0, s34, v1
2504; SI-NEXT:    v_not_b32_e32 v0, v0
2505; SI-NEXT:    s_waitcnt expcnt(0)
2506; SI-NEXT:    v_mov_b32_e32 v3, v1
2507; SI-NEXT:    v_mov_b32_e32 v2, v0
2508; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
2509; SI-NEXT:    s_waitcnt vmcnt(0)
2510; SI-NEXT:    buffer_wbinvl1
2511; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
2512; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
2513; SI-NEXT:    v_mov_b32_e32 v1, v2
2514; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
2515; SI-NEXT:    s_cbranch_execnz .LBB55_1
2516; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2517; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
2518; SI-NEXT:    v_readlane_b32 s7, v4, 1
2519; SI-NEXT:    v_readlane_b32 s6, v4, 0
2520; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2521; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
2522; SI-NEXT:    s_mov_b64 exec, s[34:35]
2523; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2524; SI-NEXT:    s_setpc_b64 s[30:31]
2525;
2526; VI-LABEL: global_atomic_nand_i32_noret_scalar:
2527; VI:       ; %bb.0:
2528; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2529; VI-NEXT:    v_mov_b32_e32 v0, s4
2530; VI-NEXT:    v_mov_b32_e32 v1, s5
2531; VI-NEXT:    flat_load_dword v3, v[0:1]
2532; VI-NEXT:    s_mov_b64 s[34:35], 0
2533; VI-NEXT:  .LBB55_1: ; %atomicrmw.start
2534; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2535; VI-NEXT:    s_waitcnt vmcnt(0)
2536; VI-NEXT:    v_and_b32_e32 v2, s6, v3
2537; VI-NEXT:    v_not_b32_e32 v2, v2
2538; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2539; VI-NEXT:    s_waitcnt vmcnt(0)
2540; VI-NEXT:    buffer_wbinvl1_vol
2541; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2542; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2543; VI-NEXT:    v_mov_b32_e32 v3, v2
2544; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2545; VI-NEXT:    s_cbranch_execnz .LBB55_1
2546; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2547; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
2548; VI-NEXT:    s_setpc_b64 s[30:31]
2549;
2550; GFX9-LABEL: global_atomic_nand_i32_noret_scalar:
2551; GFX9:       ; %bb.0:
2552; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2553; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2554; GFX9-NEXT:    global_load_dword v1, v2, s[4:5]
2555; GFX9-NEXT:    s_mov_b64 s[34:35], 0
2556; GFX9-NEXT:  .LBB55_1: ; %atomicrmw.start
2557; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2558; GFX9-NEXT:    s_waitcnt vmcnt(0)
2559; GFX9-NEXT:    v_and_b32_e32 v0, s6, v1
2560; GFX9-NEXT:    v_not_b32_e32 v0, v0
2561; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
2562; GFX9-NEXT:    s_waitcnt vmcnt(0)
2563; GFX9-NEXT:    buffer_wbinvl1_vol
2564; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2565; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2566; GFX9-NEXT:    v_mov_b32_e32 v1, v0
2567; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2568; GFX9-NEXT:    s_cbranch_execnz .LBB55_1
2569; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2570; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
2571; GFX9-NEXT:    s_setpc_b64 s[30:31]
2572  %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
2573  ret void
2574}
2575
; inreg/amdgpu_gfx form of the non-returning seq_cst nand at %out + 4 i32
; elements (16 bytes). The offset appears as `offset:16` on the buffer_ and
; global_ instructions; the tonga (VI) checks instead add 16 to s[4:5] with
; s_add_u32/s_addc_u32 and copy the sum into v[0:1] for the flat_ accesses.
; The SI checks save s6/s7 into lanes of v4 before reusing them in s[4:7] and
; restore them after the cmpswap retry loop.
2576define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
2577; SI-LABEL: global_atomic_nand_i32_noret_offset_scalar:
2578; SI:       ; %bb.0:
2579; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2580; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2581; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
2582; SI-NEXT:    s_mov_b64 exec, s[34:35]
2583; SI-NEXT:    s_waitcnt expcnt(0)
2584; SI-NEXT:    v_writelane_b32 v4, s6, 0
2585; SI-NEXT:    v_writelane_b32 v4, s7, 1
2586; SI-NEXT:    s_mov_b32 s34, s6
2587; SI-NEXT:    s_mov_b32 s7, 0xf000
2588; SI-NEXT:    s_mov_b32 s6, -1
2589; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
2590; SI-NEXT:    s_mov_b64 s[36:37], 0
2591; SI-NEXT:  .LBB56_1: ; %atomicrmw.start
2592; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2593; SI-NEXT:    s_waitcnt vmcnt(0)
2594; SI-NEXT:    v_and_b32_e32 v0, s34, v1
2595; SI-NEXT:    v_not_b32_e32 v0, v0
2596; SI-NEXT:    s_waitcnt expcnt(0)
2597; SI-NEXT:    v_mov_b32_e32 v3, v1
2598; SI-NEXT:    v_mov_b32_e32 v2, v0
2599; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
2600; SI-NEXT:    s_waitcnt vmcnt(0)
2601; SI-NEXT:    buffer_wbinvl1
2602; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
2603; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
2604; SI-NEXT:    v_mov_b32_e32 v1, v2
2605; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
2606; SI-NEXT:    s_cbranch_execnz .LBB56_1
2607; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2608; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
2609; SI-NEXT:    v_readlane_b32 s7, v4, 1
2610; SI-NEXT:    v_readlane_b32 s6, v4, 0
2611; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2612; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
2613; SI-NEXT:    s_mov_b64 exec, s[34:35]
2614; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2615; SI-NEXT:    s_setpc_b64 s[30:31]
2616;
2617; VI-LABEL: global_atomic_nand_i32_noret_offset_scalar:
2618; VI:       ; %bb.0:
2619; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2620; VI-NEXT:    s_add_u32 s34, s4, 16
2621; VI-NEXT:    s_addc_u32 s35, s5, 0
2622; VI-NEXT:    v_mov_b32_e32 v0, s34
2623; VI-NEXT:    v_mov_b32_e32 v1, s35
2624; VI-NEXT:    flat_load_dword v3, v[0:1]
2625; VI-NEXT:    s_mov_b64 s[34:35], 0
2626; VI-NEXT:  .LBB56_1: ; %atomicrmw.start
2627; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2628; VI-NEXT:    s_waitcnt vmcnt(0)
2629; VI-NEXT:    v_and_b32_e32 v2, s6, v3
2630; VI-NEXT:    v_not_b32_e32 v2, v2
2631; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2632; VI-NEXT:    s_waitcnt vmcnt(0)
2633; VI-NEXT:    buffer_wbinvl1_vol
2634; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2635; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2636; VI-NEXT:    v_mov_b32_e32 v3, v2
2637; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2638; VI-NEXT:    s_cbranch_execnz .LBB56_1
2639; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2640; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
2641; VI-NEXT:    s_setpc_b64 s[30:31]
2642;
2643; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar:
2644; GFX9:       ; %bb.0:
2645; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2646; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2647; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:16
2648; GFX9-NEXT:    s_mov_b64 s[34:35], 0
2649; GFX9-NEXT:  .LBB56_1: ; %atomicrmw.start
2650; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2651; GFX9-NEXT:    s_waitcnt vmcnt(0)
2652; GFX9-NEXT:    v_and_b32_e32 v0, s6, v1
2653; GFX9-NEXT:    v_not_b32_e32 v0, v0
2654; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
2655; GFX9-NEXT:    s_waitcnt vmcnt(0)
2656; GFX9-NEXT:    buffer_wbinvl1_vol
2657; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2658; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2659; GFX9-NEXT:    v_mov_b32_e32 v1, v0
2660; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2661; GFX9-NEXT:    s_cbranch_execnz .LBB56_1
2662; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2663; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
2664; GFX9-NEXT:    s_setpc_b64 s[30:31]
2665  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2666  %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
2667  ret void
2668}
2669
; inreg/amdgpu_gfx form of the returning seq_cst nand with no offset; the
; atomicrmw result is the function's i32 return value. In the checks the
; cmpswap result is written to v0 on all three runs and no post-loop move into
; v0 appears. The SI checks save s6/s7 into lanes of v3 before reusing them in
; s[4:7] and restore them after the cmpswap retry loop.
2670define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
2671; SI-LABEL: global_atomic_nand_i32_ret_scalar:
2672; SI:       ; %bb.0:
2673; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2674; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2675; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
2676; SI-NEXT:    s_mov_b64 exec, s[34:35]
2677; SI-NEXT:    s_waitcnt expcnt(0)
2678; SI-NEXT:    v_writelane_b32 v3, s6, 0
2679; SI-NEXT:    v_writelane_b32 v3, s7, 1
2680; SI-NEXT:    s_mov_b32 s34, s6
2681; SI-NEXT:    s_mov_b32 s7, 0xf000
2682; SI-NEXT:    s_mov_b32 s6, -1
2683; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
2684; SI-NEXT:    s_mov_b64 s[36:37], 0
2685; SI-NEXT:  .LBB57_1: ; %atomicrmw.start
2686; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2687; SI-NEXT:    s_waitcnt vmcnt(0)
2688; SI-NEXT:    v_mov_b32_e32 v2, v0
2689; SI-NEXT:    s_waitcnt expcnt(0)
2690; SI-NEXT:    v_and_b32_e32 v0, s34, v2
2691; SI-NEXT:    v_not_b32_e32 v1, v0
2692; SI-NEXT:    v_mov_b32_e32 v0, v1
2693; SI-NEXT:    v_mov_b32_e32 v1, v2
2694; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
2695; SI-NEXT:    s_waitcnt vmcnt(0)
2696; SI-NEXT:    buffer_wbinvl1
2697; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
2698; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
2699; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
2700; SI-NEXT:    s_cbranch_execnz .LBB57_1
2701; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2702; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
2703; SI-NEXT:    v_readlane_b32 s7, v3, 1
2704; SI-NEXT:    v_readlane_b32 s6, v3, 0
2705; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2706; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
2707; SI-NEXT:    s_mov_b64 exec, s[34:35]
2708; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2709; SI-NEXT:    s_setpc_b64 s[30:31]
2710;
2711; VI-LABEL: global_atomic_nand_i32_ret_scalar:
2712; VI:       ; %bb.0:
2713; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2714; VI-NEXT:    v_mov_b32_e32 v0, s4
2715; VI-NEXT:    v_mov_b32_e32 v1, s5
2716; VI-NEXT:    flat_load_dword v0, v[0:1]
2717; VI-NEXT:    v_mov_b32_e32 v1, s4
2718; VI-NEXT:    s_mov_b64 s[34:35], 0
2719; VI-NEXT:    v_mov_b32_e32 v2, s5
2720; VI-NEXT:  .LBB57_1: ; %atomicrmw.start
2721; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2722; VI-NEXT:    s_waitcnt vmcnt(0)
2723; VI-NEXT:    v_mov_b32_e32 v4, v0
2724; VI-NEXT:    v_and_b32_e32 v0, s6, v4
2725; VI-NEXT:    v_not_b32_e32 v3, v0
2726; VI-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2727; VI-NEXT:    s_waitcnt vmcnt(0)
2728; VI-NEXT:    buffer_wbinvl1_vol
2729; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
2730; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2731; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2732; VI-NEXT:    s_cbranch_execnz .LBB57_1
2733; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2734; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
2735; VI-NEXT:    s_setpc_b64 s[30:31]
2736;
2737; GFX9-LABEL: global_atomic_nand_i32_ret_scalar:
2738; GFX9:       ; %bb.0:
2739; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2740; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2741; GFX9-NEXT:    global_load_dword v0, v1, s[4:5]
2742; GFX9-NEXT:    s_mov_b64 s[34:35], 0
2743; GFX9-NEXT:  .LBB57_1: ; %atomicrmw.start
2744; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2745; GFX9-NEXT:    s_waitcnt vmcnt(0)
2746; GFX9-NEXT:    v_mov_b32_e32 v3, v0
2747; GFX9-NEXT:    v_and_b32_e32 v0, s6, v3
2748; GFX9-NEXT:    v_not_b32_e32 v2, v0
2749; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
2750; GFX9-NEXT:    s_waitcnt vmcnt(0)
2751; GFX9-NEXT:    buffer_wbinvl1_vol
2752; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
2753; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2754; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2755; GFX9-NEXT:    s_cbranch_execnz .LBB57_1
2756; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2757; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
2758; GFX9-NEXT:    s_setpc_b64 s[30:31]
2759  %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
2760  ret i32 %result
2761}
2762
; inreg/amdgpu_gfx form of the returning seq_cst nand at %out + 4 i32 elements
; (16 bytes). The offset appears as `offset:16` on the buffer_ and global_
; instructions, while the tonga (VI) checks add 16 to s[4:5] and copy the sum
; into v[1:2] for the flat_ accesses. The cmpswap result is written to v0 on
; every run with no post-loop move; the SI checks save/restore s6/s7 through
; lanes of v3 around their reuse in s[4:7].
2763define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
2764; SI-LABEL: global_atomic_nand_i32_ret_offset_scalar:
2765; SI:       ; %bb.0:
2766; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2767; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2768; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
2769; SI-NEXT:    s_mov_b64 exec, s[34:35]
2770; SI-NEXT:    s_waitcnt expcnt(0)
2771; SI-NEXT:    v_writelane_b32 v3, s6, 0
2772; SI-NEXT:    v_writelane_b32 v3, s7, 1
2773; SI-NEXT:    s_mov_b32 s34, s6
2774; SI-NEXT:    s_mov_b32 s7, 0xf000
2775; SI-NEXT:    s_mov_b32 s6, -1
2776; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
2777; SI-NEXT:    s_mov_b64 s[36:37], 0
2778; SI-NEXT:  .LBB58_1: ; %atomicrmw.start
2779; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2780; SI-NEXT:    s_waitcnt vmcnt(0)
2781; SI-NEXT:    v_mov_b32_e32 v2, v0
2782; SI-NEXT:    s_waitcnt expcnt(0)
2783; SI-NEXT:    v_and_b32_e32 v0, s34, v2
2784; SI-NEXT:    v_not_b32_e32 v1, v0
2785; SI-NEXT:    v_mov_b32_e32 v0, v1
2786; SI-NEXT:    v_mov_b32_e32 v1, v2
2787; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2788; SI-NEXT:    s_waitcnt vmcnt(0)
2789; SI-NEXT:    buffer_wbinvl1
2790; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
2791; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
2792; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
2793; SI-NEXT:    s_cbranch_execnz .LBB58_1
2794; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2795; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
2796; SI-NEXT:    v_readlane_b32 s7, v3, 1
2797; SI-NEXT:    v_readlane_b32 s6, v3, 0
2798; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2799; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
2800; SI-NEXT:    s_mov_b64 exec, s[34:35]
2801; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2802; SI-NEXT:    s_setpc_b64 s[30:31]
2803;
2804; VI-LABEL: global_atomic_nand_i32_ret_offset_scalar:
2805; VI:       ; %bb.0:
2806; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2807; VI-NEXT:    s_add_u32 s34, s4, 16
2808; VI-NEXT:    s_addc_u32 s35, s5, 0
2809; VI-NEXT:    v_mov_b32_e32 v1, s34
2810; VI-NEXT:    v_mov_b32_e32 v2, s35
2811; VI-NEXT:    flat_load_dword v0, v[1:2]
2812; VI-NEXT:    s_mov_b64 s[34:35], 0
2813; VI-NEXT:  .LBB58_1: ; %atomicrmw.start
2814; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2815; VI-NEXT:    s_waitcnt vmcnt(0)
2816; VI-NEXT:    v_mov_b32_e32 v4, v0
2817; VI-NEXT:    v_and_b32_e32 v0, s6, v4
2818; VI-NEXT:    v_not_b32_e32 v3, v0
2819; VI-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2820; VI-NEXT:    s_waitcnt vmcnt(0)
2821; VI-NEXT:    buffer_wbinvl1_vol
2822; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
2823; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2824; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2825; VI-NEXT:    s_cbranch_execnz .LBB58_1
2826; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2827; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
2828; VI-NEXT:    s_setpc_b64 s[30:31]
2829;
2830; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar:
2831; GFX9:       ; %bb.0:
2832; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2833; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2834; GFX9-NEXT:    global_load_dword v0, v1, s[4:5] offset:16
2835; GFX9-NEXT:    s_mov_b64 s[34:35], 0
2836; GFX9-NEXT:  .LBB58_1: ; %atomicrmw.start
2837; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2838; GFX9-NEXT:    s_waitcnt vmcnt(0)
2839; GFX9-NEXT:    v_mov_b32_e32 v3, v0
2840; GFX9-NEXT:    v_and_b32_e32 v0, s6, v3
2841; GFX9-NEXT:    v_not_b32_e32 v2, v0
2842; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
2843; GFX9-NEXT:    s_waitcnt vmcnt(0)
2844; GFX9-NEXT:    buffer_wbinvl1_vol
2845; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
2846; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2847; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2848; GFX9-NEXT:    s_cbranch_execnz .LBB58_1
2849; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2850; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
2851; GFX9-NEXT:    s_setpc_b64 s[30:31]
2852  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2853  %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
2854  ret i32 %result
2855}
2856
; Non-returning seq_cst nand at %out + 4 i32 elements (16 bytes), with the GEP
; index typed i64 and the atomicrmw carrying !amdgpu.no.remote.memory metadata
; (!0 is defined elsewhere in the file). Even with the metadata attached, the
; expected output on every run is still a load + and/not + cmpswap retry loop
; (%atomicrmw.start .. %atomicrmw.end).
2857define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
2858; SI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2859; SI:       ; %bb.0:
2860; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2861; SI-NEXT:    s_mov_b32 s6, 0
2862; SI-NEXT:    s_mov_b32 s7, 0xf000
2863; SI-NEXT:    s_mov_b32 s4, s6
2864; SI-NEXT:    s_mov_b32 s5, s6
2865; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
2866; SI-NEXT:    s_mov_b64 s[8:9], 0
2867; SI-NEXT:  .LBB59_1: ; %atomicrmw.start
2868; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2869; SI-NEXT:    s_waitcnt vmcnt(0)
2870; SI-NEXT:    v_and_b32_e32 v3, v4, v2
2871; SI-NEXT:    v_not_b32_e32 v3, v3
2872; SI-NEXT:    s_waitcnt expcnt(0)
2873; SI-NEXT:    v_mov_b32_e32 v6, v4
2874; SI-NEXT:    v_mov_b32_e32 v5, v3
2875; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
2876; SI-NEXT:    s_waitcnt vmcnt(0)
2877; SI-NEXT:    buffer_wbinvl1
2878; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
2879; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
2880; SI-NEXT:    v_mov_b32_e32 v4, v5
2881; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
2882; SI-NEXT:    s_cbranch_execnz .LBB59_1
2883; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2884; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
2885; SI-NEXT:    s_waitcnt expcnt(0)
2886; SI-NEXT:    s_setpc_b64 s[30:31]
2887;
2888; VI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2889; VI:       ; %bb.0:
2890; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2891; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
2892; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2893; VI-NEXT:    flat_load_dword v4, v[0:1]
2894; VI-NEXT:    s_mov_b64 s[4:5], 0
2895; VI-NEXT:  .LBB59_1: ; %atomicrmw.start
2896; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2897; VI-NEXT:    s_waitcnt vmcnt(0)
2898; VI-NEXT:    v_and_b32_e32 v3, v4, v2
2899; VI-NEXT:    v_not_b32_e32 v3, v3
2900; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2901; VI-NEXT:    s_waitcnt vmcnt(0)
2902; VI-NEXT:    buffer_wbinvl1_vol
2903; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2904; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2905; VI-NEXT:    v_mov_b32_e32 v4, v3
2906; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2907; VI-NEXT:    s_cbranch_execnz .LBB59_1
2908; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2909; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
2910; VI-NEXT:    s_setpc_b64 s[30:31]
2911;
2912; GFX9-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2913; GFX9:       ; %bb.0:
2914; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2915; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
2916; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2917; GFX9-NEXT:  .LBB59_1: ; %atomicrmw.start
2918; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2919; GFX9-NEXT:    s_waitcnt vmcnt(0)
2920; GFX9-NEXT:    v_and_b32_e32 v3, v4, v2
2921; GFX9-NEXT:    v_not_b32_e32 v3, v3
2922; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
2923; GFX9-NEXT:    s_waitcnt vmcnt(0)
2924; GFX9-NEXT:    buffer_wbinvl1_vol
2925; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2926; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2927; GFX9-NEXT:    v_mov_b32_e32 v4, v3
2928; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2929; GFX9-NEXT:    s_cbranch_execnz .LBB59_1
2930; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2931; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2932; GFX9-NEXT:    s_setpc_b64 s[30:31]
2933  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2934  %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
2935  ret void
2936}
2937
; Returning seq_cst nand at %out + 4 i32 elements (16 bytes), with the GEP
; index typed i64 and the atomicrmw carrying !amdgpu.no.remote.memory metadata
; (!0 is defined elsewhere in the file). The expected output on every run is
; still a load + and/not + cmpswap retry loop. The tonga (VI) checks produce
; the result directly in v0; the other two runs end with a move v3 -> v0.
2938define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
2939; SI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2940; SI:       ; %bb.0:
2941; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2942; SI-NEXT:    s_mov_b32 s6, 0
2943; SI-NEXT:    s_mov_b32 s7, 0xf000
2944; SI-NEXT:    s_mov_b32 s4, s6
2945; SI-NEXT:    s_mov_b32 s5, s6
2946; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
2947; SI-NEXT:    s_mov_b64 s[8:9], 0
2948; SI-NEXT:  .LBB60_1: ; %atomicrmw.start
2949; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2950; SI-NEXT:    s_waitcnt vmcnt(0)
2951; SI-NEXT:    v_mov_b32_e32 v5, v3
2952; SI-NEXT:    s_waitcnt expcnt(0)
2953; SI-NEXT:    v_and_b32_e32 v3, v5, v2
2954; SI-NEXT:    v_not_b32_e32 v4, v3
2955; SI-NEXT:    v_mov_b32_e32 v3, v4
2956; SI-NEXT:    v_mov_b32_e32 v4, v5
2957; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
2958; SI-NEXT:    s_waitcnt vmcnt(0)
2959; SI-NEXT:    buffer_wbinvl1
2960; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2961; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
2962; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
2963; SI-NEXT:    s_cbranch_execnz .LBB60_1
2964; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2965; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
2966; SI-NEXT:    v_mov_b32_e32 v0, v3
2967; SI-NEXT:    s_waitcnt expcnt(0)
2968; SI-NEXT:    s_setpc_b64 s[30:31]
2969;
2970; VI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2971; VI:       ; %bb.0:
2972; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2973; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
2974; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
2975; VI-NEXT:    flat_load_dword v0, v[3:4]
2976; VI-NEXT:    s_mov_b64 s[4:5], 0
2977; VI-NEXT:  .LBB60_1: ; %atomicrmw.start
2978; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2979; VI-NEXT:    s_waitcnt vmcnt(0)
2980; VI-NEXT:    v_mov_b32_e32 v1, v0
2981; VI-NEXT:    v_and_b32_e32 v0, v1, v2
2982; VI-NEXT:    v_not_b32_e32 v0, v0
2983; VI-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2984; VI-NEXT:    s_waitcnt vmcnt(0)
2985; VI-NEXT:    buffer_wbinvl1_vol
2986; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2987; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2988; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2989; VI-NEXT:    s_cbranch_execnz .LBB60_1
2990; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2991; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
2992; VI-NEXT:    s_setpc_b64 s[30:31]
2993;
2994; GFX9-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2995; GFX9:       ; %bb.0:
2996; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2997; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
2998; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2999; GFX9-NEXT:  .LBB60_1: ; %atomicrmw.start
3000; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
3001; GFX9-NEXT:    s_waitcnt vmcnt(0)
3002; GFX9-NEXT:    v_mov_b32_e32 v4, v3
3003; GFX9-NEXT:    v_and_b32_e32 v3, v4, v2
3004; GFX9-NEXT:    v_not_b32_e32 v3, v3
3005; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
3006; GFX9-NEXT:    s_waitcnt vmcnt(0)
3007; GFX9-NEXT:    buffer_wbinvl1_vol
3008; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3009; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3010; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3011; GFX9-NEXT:    s_cbranch_execnz .LBB60_1
3012; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
3013; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3014; GFX9-NEXT:    v_mov_b32_e32 v0, v3
3015; GFX9-NEXT:    s_setpc_b64 s[30:31]
3016  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3017  %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3018  ret i32 %result
3019}
3020
3021; ---------------------------------------------------------------------
3022; atomicrmw or
3023; ---------------------------------------------------------------------
3024
; Non-returning `atomicrmw or` (seq_cst, no syncscope) on a global pointer.
; The old value is unused, so the expected atomic carries no glc bit:
; buffer_atomic_or with addr64 for the run without -mcpu (SI),
; flat_atomic_or for tonga (VI) and global_atomic_or for gfx900 (GFX9).
; Every variant is followed by s_waitcnt vmcnt(0) and a buffer_wbinvl1[_vol].
3025define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
3026; SI-LABEL: global_atomic_or_i32_noret:
3027; SI:       ; %bb.0:
3028; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3029; SI-NEXT:    s_mov_b32 s6, 0
3030; SI-NEXT:    s_mov_b32 s7, 0xf000
3031; SI-NEXT:    s_mov_b32 s4, s6
3032; SI-NEXT:    s_mov_b32 s5, s6
3033; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64
3034; SI-NEXT:    s_waitcnt vmcnt(0)
3035; SI-NEXT:    buffer_wbinvl1
3036; SI-NEXT:    s_waitcnt expcnt(0)
3037; SI-NEXT:    s_setpc_b64 s[30:31]
3038;
3039; VI-LABEL: global_atomic_or_i32_noret:
3040; VI:       ; %bb.0:
3041; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3042; VI-NEXT:    flat_atomic_or v[0:1], v2
3043; VI-NEXT:    s_waitcnt vmcnt(0)
3044; VI-NEXT:    buffer_wbinvl1_vol
3045; VI-NEXT:    s_setpc_b64 s[30:31]
3046;
3047; GFX9-LABEL: global_atomic_or_i32_noret:
3048; GFX9:       ; %bb.0:
3049; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3050; GFX9-NEXT:    global_atomic_or v[0:1], v2, off
3051; GFX9-NEXT:    s_waitcnt vmcnt(0)
3052; GFX9-NEXT:    buffer_wbinvl1_vol
3053; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; %tmp0 is intentionally left unused; this is the no-return form.
3054  %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
3055  ret void
3056}
3057
; Non-returning `atomicrmw or` at a constant offset from the incoming pointer.
; The 16-byte offset is expected to fold into the instruction (offset:16) for
; the buffer (SI) and global (GFX9) forms, while the tonga (VI) flat form
; expects an explicit 64-bit add (v_add_u32 / v_addc_u32) on the address.
3058define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
3059; SI-LABEL: global_atomic_or_i32_noret_offset:
3060; SI:       ; %bb.0:
3061; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3062; SI-NEXT:    s_mov_b32 s6, 0
3063; SI-NEXT:    s_mov_b32 s7, 0xf000
3064; SI-NEXT:    s_mov_b32 s4, s6
3065; SI-NEXT:    s_mov_b32 s5, s6
3066; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16
3067; SI-NEXT:    s_waitcnt vmcnt(0)
3068; SI-NEXT:    buffer_wbinvl1
3069; SI-NEXT:    s_waitcnt expcnt(0)
3070; SI-NEXT:    s_setpc_b64 s[30:31]
3071;
3072; VI-LABEL: global_atomic_or_i32_noret_offset:
3073; VI:       ; %bb.0:
3074; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3075; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3076; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3077; VI-NEXT:    flat_atomic_or v[0:1], v2
3078; VI-NEXT:    s_waitcnt vmcnt(0)
3079; VI-NEXT:    buffer_wbinvl1_vol
3080; VI-NEXT:    s_setpc_b64 s[30:31]
3081;
3082; GFX9-LABEL: global_atomic_or_i32_noret_offset:
3083; GFX9:       ; %bb.0:
3084; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3085; GFX9-NEXT:    global_atomic_or v[0:1], v2, off offset:16
3086; GFX9-NEXT:    s_waitcnt vmcnt(0)
3087; GFX9-NEXT:    buffer_wbinvl1_vol
3088; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; i32 element index 4 is a byte offset of 16, matching offset:16 above.
3089  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3090  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
3091  ret void
3092}
3093
; Returning `atomicrmw or` (seq_cst, no syncscope) on a global pointer.
; Because the old value is returned, each expected atomic carries glc.
; The buffer form (SI) gets the result in v2 and copies it to v0; the flat
; (VI) and global (GFX9) forms write the result directly into v0.
3094define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
3095; SI-LABEL: global_atomic_or_i32_ret:
3096; SI:       ; %bb.0:
3097; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3098; SI-NEXT:    s_mov_b32 s6, 0
3099; SI-NEXT:    s_mov_b32 s7, 0xf000
3100; SI-NEXT:    s_mov_b32 s4, s6
3101; SI-NEXT:    s_mov_b32 s5, s6
3102; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc
3103; SI-NEXT:    s_waitcnt vmcnt(0)
3104; SI-NEXT:    buffer_wbinvl1
3105; SI-NEXT:    v_mov_b32_e32 v0, v2
3106; SI-NEXT:    s_waitcnt expcnt(0)
3107; SI-NEXT:    s_setpc_b64 s[30:31]
3108;
3109; VI-LABEL: global_atomic_or_i32_ret:
3110; VI:       ; %bb.0:
3111; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3112; VI-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
3113; VI-NEXT:    s_waitcnt vmcnt(0)
3114; VI-NEXT:    buffer_wbinvl1_vol
3115; VI-NEXT:    s_setpc_b64 s[30:31]
3116;
3117; GFX9-LABEL: global_atomic_or_i32_ret:
3118; GFX9:       ; %bb.0:
3119; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3120; GFX9-NEXT:    global_atomic_or v0, v[0:1], v2, off glc
3121; GFX9-NEXT:    s_waitcnt vmcnt(0)
3122; GFX9-NEXT:    buffer_wbinvl1_vol
3123; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The value produced by the atomicrmw is the function's return value.
3124  %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
3125  ret i32 %result
3126}
3127
; Returning `atomicrmw or` at a constant 16-byte offset from the pointer.
; Expected atomics carry glc and, for the buffer (SI) and global (GFX9) forms,
; the folded offset:16; the tonga (VI) flat form expects the offset to be
; added to the address registers first.
3128define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
3129; SI-LABEL: global_atomic_or_i32_ret_offset:
3130; SI:       ; %bb.0:
3131; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3132; SI-NEXT:    s_mov_b32 s6, 0
3133; SI-NEXT:    s_mov_b32 s7, 0xf000
3134; SI-NEXT:    s_mov_b32 s4, s6
3135; SI-NEXT:    s_mov_b32 s5, s6
3136; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3137; SI-NEXT:    s_waitcnt vmcnt(0)
3138; SI-NEXT:    buffer_wbinvl1
3139; SI-NEXT:    v_mov_b32_e32 v0, v2
3140; SI-NEXT:    s_waitcnt expcnt(0)
3141; SI-NEXT:    s_setpc_b64 s[30:31]
3142;
3143; VI-LABEL: global_atomic_or_i32_ret_offset:
3144; VI:       ; %bb.0:
3145; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3146; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3147; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3148; VI-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
3149; VI-NEXT:    s_waitcnt vmcnt(0)
3150; VI-NEXT:    buffer_wbinvl1_vol
3151; VI-NEXT:    s_setpc_b64 s[30:31]
3152;
3153; GFX9-LABEL: global_atomic_or_i32_ret_offset:
3154; GFX9:       ; %bb.0:
3155; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3156; GFX9-NEXT:    global_atomic_or v0, v[0:1], v2, off offset:16 glc
3157; GFX9-NEXT:    s_waitcnt vmcnt(0)
3158; GFX9-NEXT:    buffer_wbinvl1_vol
3159; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; i32 element index 4 is a byte offset of 16, matching offset:16 above.
3160  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3161  %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
3162  ret i32 %result
3163}
3164
; Non-returning `atomicrmw or` with both arguments marked inreg under the
; amdgpu_gfx calling convention; the checks show the pointer arriving in
; s[4:5] and the operand in s6.
; In the run without -mcpu (SI), s6 and s7 are saved into lanes of v1 with
; v_writelane_b32 before being overwritten to complete the s[4:7] operand of
; buffer_atomic_or, then restored with v_readlane_b32; v1 itself is spilled
; and reloaded around the body. The tonga (VI) run copies the SGPRs into VGPRs
; for flat_atomic_or, and the gfx900 (GFX9) run uses s[4:5] as the base of
; global_atomic_or with a zero VGPR offset.
3165define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3166; SI-LABEL: global_atomic_or_i32_noret_scalar:
3167; SI:       ; %bb.0:
3168; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3169; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3170; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3171; SI-NEXT:    s_mov_b64 exec, s[34:35]
3172; SI-NEXT:    s_waitcnt expcnt(0)
3173; SI-NEXT:    v_writelane_b32 v1, s6, 0
3174; SI-NEXT:    v_writelane_b32 v1, s7, 1
3175; SI-NEXT:    s_mov_b32 s34, s6
3176; SI-NEXT:    s_mov_b32 s7, 0xf000
3177; SI-NEXT:    s_mov_b32 s6, -1
3178; SI-NEXT:    v_mov_b32_e32 v0, s34
3179; SI-NEXT:    s_waitcnt vmcnt(0)
3180; SI-NEXT:    buffer_atomic_or v0, off, s[4:7], 0
3181; SI-NEXT:    s_waitcnt vmcnt(0)
3182; SI-NEXT:    buffer_wbinvl1
3183; SI-NEXT:    v_readlane_b32 s7, v1, 1
3184; SI-NEXT:    v_readlane_b32 s6, v1, 0
3185; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3186; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3187; SI-NEXT:    s_mov_b64 exec, s[34:35]
3188; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3189; SI-NEXT:    s_setpc_b64 s[30:31]
3190;
3191; VI-LABEL: global_atomic_or_i32_noret_scalar:
3192; VI:       ; %bb.0:
3193; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3194; VI-NEXT:    v_mov_b32_e32 v0, s4
3195; VI-NEXT:    v_mov_b32_e32 v1, s5
3196; VI-NEXT:    v_mov_b32_e32 v2, s6
3197; VI-NEXT:    flat_atomic_or v[0:1], v2
3198; VI-NEXT:    s_waitcnt vmcnt(0)
3199; VI-NEXT:    buffer_wbinvl1_vol
3200; VI-NEXT:    s_setpc_b64 s[30:31]
3201;
3202; GFX9-LABEL: global_atomic_or_i32_noret_scalar:
3203; GFX9:       ; %bb.0:
3204; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3205; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3206; GFX9-NEXT:    v_mov_b32_e32 v1, s6
3207; GFX9-NEXT:    global_atomic_or v0, v1, s[4:5]
3208; GFX9-NEXT:    s_waitcnt vmcnt(0)
3209; GFX9-NEXT:    buffer_wbinvl1_vol
3210; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; %tmp0 is intentionally left unused; this is the no-return form.
3211  %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
3212  ret void
3213}
3214
; Non-returning `atomicrmw or` at a 16-byte offset, with inreg (SGPR)
; arguments under amdgpu_gfx. The offset is expected as offset:16 on the
; buffer (SI) and global (GFX9) atomics; the tonga (VI) run expects a scalar
; s_add_u32 / s_addc_u32 into s[34:35] before the address is copied to VGPRs.
; The run without -mcpu additionally saves and restores s6/s7 through lanes
; of v1.
3215define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3216; SI-LABEL: global_atomic_or_i32_noret_offset_scalar:
3217; SI:       ; %bb.0:
3218; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3219; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3220; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3221; SI-NEXT:    s_mov_b64 exec, s[34:35]
3222; SI-NEXT:    s_waitcnt expcnt(0)
3223; SI-NEXT:    v_writelane_b32 v1, s6, 0
3224; SI-NEXT:    v_writelane_b32 v1, s7, 1
3225; SI-NEXT:    s_mov_b32 s34, s6
3226; SI-NEXT:    s_mov_b32 s7, 0xf000
3227; SI-NEXT:    s_mov_b32 s6, -1
3228; SI-NEXT:    v_mov_b32_e32 v0, s34
3229; SI-NEXT:    s_waitcnt vmcnt(0)
3230; SI-NEXT:    buffer_atomic_or v0, off, s[4:7], 0 offset:16
3231; SI-NEXT:    s_waitcnt vmcnt(0)
3232; SI-NEXT:    buffer_wbinvl1
3233; SI-NEXT:    v_readlane_b32 s7, v1, 1
3234; SI-NEXT:    v_readlane_b32 s6, v1, 0
3235; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3236; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3237; SI-NEXT:    s_mov_b64 exec, s[34:35]
3238; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3239; SI-NEXT:    s_setpc_b64 s[30:31]
3240;
3241; VI-LABEL: global_atomic_or_i32_noret_offset_scalar:
3242; VI:       ; %bb.0:
3243; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3244; VI-NEXT:    s_add_u32 s34, s4, 16
3245; VI-NEXT:    s_addc_u32 s35, s5, 0
3246; VI-NEXT:    v_mov_b32_e32 v0, s34
3247; VI-NEXT:    v_mov_b32_e32 v1, s35
3248; VI-NEXT:    v_mov_b32_e32 v2, s6
3249; VI-NEXT:    flat_atomic_or v[0:1], v2
3250; VI-NEXT:    s_waitcnt vmcnt(0)
3251; VI-NEXT:    buffer_wbinvl1_vol
3252; VI-NEXT:    s_setpc_b64 s[30:31]
3253;
3254; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar:
3255; GFX9:       ; %bb.0:
3256; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3257; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3258; GFX9-NEXT:    v_mov_b32_e32 v1, s6
3259; GFX9-NEXT:    global_atomic_or v0, v1, s[4:5] offset:16
3260; GFX9-NEXT:    s_waitcnt vmcnt(0)
3261; GFX9-NEXT:    buffer_wbinvl1_vol
3262; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; i32 element index 4 is a byte offset of 16, matching the 16 used above.
3263  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3264  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
3265  ret void
3266}
3267
; Returning `atomicrmw or` with inreg (SGPR) arguments under amdgpu_gfx.
; All three expected atomics carry glc and leave the result in v0, so no
; extra copy of the return value appears in any of the check blocks.
; The run without -mcpu (SI) saves and restores s6/s7 through lanes of v1
; around the buffer atomic.
3268define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3269; SI-LABEL: global_atomic_or_i32_ret_scalar:
3270; SI:       ; %bb.0:
3271; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3272; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3273; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3274; SI-NEXT:    s_mov_b64 exec, s[34:35]
3275; SI-NEXT:    s_waitcnt expcnt(0)
3276; SI-NEXT:    v_writelane_b32 v1, s6, 0
3277; SI-NEXT:    v_writelane_b32 v1, s7, 1
3278; SI-NEXT:    s_mov_b32 s34, s6
3279; SI-NEXT:    s_mov_b32 s7, 0xf000
3280; SI-NEXT:    s_mov_b32 s6, -1
3281; SI-NEXT:    v_mov_b32_e32 v0, s34
3282; SI-NEXT:    s_waitcnt vmcnt(0)
3283; SI-NEXT:    buffer_atomic_or v0, off, s[4:7], 0 glc
3284; SI-NEXT:    s_waitcnt vmcnt(0)
3285; SI-NEXT:    buffer_wbinvl1
3286; SI-NEXT:    v_readlane_b32 s7, v1, 1
3287; SI-NEXT:    v_readlane_b32 s6, v1, 0
3288; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3289; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3290; SI-NEXT:    s_mov_b64 exec, s[34:35]
3291; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3292; SI-NEXT:    s_setpc_b64 s[30:31]
3293;
3294; VI-LABEL: global_atomic_or_i32_ret_scalar:
3295; VI:       ; %bb.0:
3296; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3297; VI-NEXT:    v_mov_b32_e32 v0, s4
3298; VI-NEXT:    v_mov_b32_e32 v1, s5
3299; VI-NEXT:    v_mov_b32_e32 v2, s6
3300; VI-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
3301; VI-NEXT:    s_waitcnt vmcnt(0)
3302; VI-NEXT:    buffer_wbinvl1_vol
3303; VI-NEXT:    s_setpc_b64 s[30:31]
3304;
3305; GFX9-LABEL: global_atomic_or_i32_ret_scalar:
3306; GFX9:       ; %bb.0:
3307; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3308; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3309; GFX9-NEXT:    v_mov_b32_e32 v1, s6
3310; GFX9-NEXT:    global_atomic_or v0, v0, v1, s[4:5] glc
3311; GFX9-NEXT:    s_waitcnt vmcnt(0)
3312; GFX9-NEXT:    buffer_wbinvl1_vol
3313; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The value produced by the atomicrmw is the function's return value.
3314  %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
3315  ret i32 %result
3316}
3317
; Returning `atomicrmw or` at a 16-byte offset with inreg (SGPR) arguments
; under amdgpu_gfx. Expected atomics carry glc plus offset:16 for the buffer
; (SI) and global (GFX9) forms; the tonga (VI) run expects the offset to be
; added with s_add_u32 / s_addc_u32 before the flat atomic.
3318define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3319; SI-LABEL: global_atomic_or_i32_ret_offset_scalar:
3320; SI:       ; %bb.0:
3321; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3322; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3323; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3324; SI-NEXT:    s_mov_b64 exec, s[34:35]
3325; SI-NEXT:    s_waitcnt expcnt(0)
3326; SI-NEXT:    v_writelane_b32 v1, s6, 0
3327; SI-NEXT:    v_writelane_b32 v1, s7, 1
3328; SI-NEXT:    s_mov_b32 s34, s6
3329; SI-NEXT:    s_mov_b32 s7, 0xf000
3330; SI-NEXT:    s_mov_b32 s6, -1
3331; SI-NEXT:    v_mov_b32_e32 v0, s34
3332; SI-NEXT:    s_waitcnt vmcnt(0)
3333; SI-NEXT:    buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc
3334; SI-NEXT:    s_waitcnt vmcnt(0)
3335; SI-NEXT:    buffer_wbinvl1
3336; SI-NEXT:    v_readlane_b32 s7, v1, 1
3337; SI-NEXT:    v_readlane_b32 s6, v1, 0
3338; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3339; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3340; SI-NEXT:    s_mov_b64 exec, s[34:35]
3341; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3342; SI-NEXT:    s_setpc_b64 s[30:31]
3343;
3344; VI-LABEL: global_atomic_or_i32_ret_offset_scalar:
3345; VI:       ; %bb.0:
3346; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3347; VI-NEXT:    s_add_u32 s34, s4, 16
3348; VI-NEXT:    s_addc_u32 s35, s5, 0
3349; VI-NEXT:    v_mov_b32_e32 v0, s34
3350; VI-NEXT:    v_mov_b32_e32 v1, s35
3351; VI-NEXT:    v_mov_b32_e32 v2, s6
3352; VI-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
3353; VI-NEXT:    s_waitcnt vmcnt(0)
3354; VI-NEXT:    buffer_wbinvl1_vol
3355; VI-NEXT:    s_setpc_b64 s[30:31]
3356;
3357; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar:
3358; GFX9:       ; %bb.0:
3359; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3360; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3361; GFX9-NEXT:    v_mov_b32_e32 v1, s6
3362; GFX9-NEXT:    global_atomic_or v0, v0, v1, s[4:5] offset:16 glc
3363; GFX9-NEXT:    s_waitcnt vmcnt(0)
3364; GFX9-NEXT:    buffer_wbinvl1_vol
3365; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; i32 element index 4 is a byte offset of 16, matching the 16 used above.
3366  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3367  %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
3368  ret i32 %result
3369}
3370
; Returning `atomicrmw or` whose operand is the constant 0.
; The checks expect this to be emitted as an atomic add of a zero VGPR
; (buffer_atomic_add / flat_atomic_add / global_atomic_add, each with glc)
; instead of an atomic or.
3371define i32 @global_atomic_or_0_i32_ret(ptr addrspace(1) %ptr) {
3372; SI-LABEL: global_atomic_or_0_i32_ret:
3373; SI:       ; %bb.0:
3374; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3375; SI-NEXT:    s_mov_b32 s7, 0xf000
3376; SI-NEXT:    s_mov_b32 s6, 0
3377; SI-NEXT:    v_mov_b32_e32 v2, 0
3378; SI-NEXT:    s_mov_b32 s4, s6
3379; SI-NEXT:    s_mov_b32 s5, s6
3380; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
3381; SI-NEXT:    s_waitcnt vmcnt(0)
3382; SI-NEXT:    buffer_wbinvl1
3383; SI-NEXT:    v_mov_b32_e32 v0, v2
3384; SI-NEXT:    s_waitcnt expcnt(0)
3385; SI-NEXT:    s_setpc_b64 s[30:31]
3386;
3387; VI-LABEL: global_atomic_or_0_i32_ret:
3388; VI:       ; %bb.0:
3389; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3390; VI-NEXT:    v_mov_b32_e32 v2, 0
3391; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
3392; VI-NEXT:    s_waitcnt vmcnt(0)
3393; VI-NEXT:    buffer_wbinvl1_vol
3394; VI-NEXT:    s_setpc_b64 s[30:31]
3395;
3396; GFX9-LABEL: global_atomic_or_0_i32_ret:
3397; GFX9:       ; %bb.0:
3398; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3399; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3400; GFX9-NEXT:    global_atomic_add v0, v[0:1], v2, off glc
3401; GFX9-NEXT:    s_waitcnt vmcnt(0)
3402; GFX9-NEXT:    buffer_wbinvl1_vol
3403; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The operand is the literal 0; the loaded value is still returned.
3404  %result = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst
3405  ret i32 %result
3406}
3407
; Non-returning `atomicrmw or` at a 16-byte offset, tagged with
; !amdgpu.no.remote.memory metadata. The expected instructions are the same
; as for global_atomic_or_i32_noret_offset, i.e. the checks show no change in
; the emitted code for `or` when the metadata is present.
3408define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
3409; SI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
3410; SI:       ; %bb.0:
3411; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3412; SI-NEXT:    s_mov_b32 s6, 0
3413; SI-NEXT:    s_mov_b32 s7, 0xf000
3414; SI-NEXT:    s_mov_b32 s4, s6
3415; SI-NEXT:    s_mov_b32 s5, s6
3416; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16
3417; SI-NEXT:    s_waitcnt vmcnt(0)
3418; SI-NEXT:    buffer_wbinvl1
3419; SI-NEXT:    s_waitcnt expcnt(0)
3420; SI-NEXT:    s_setpc_b64 s[30:31]
3421;
3422; VI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
3423; VI:       ; %bb.0:
3424; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3425; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3426; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3427; VI-NEXT:    flat_atomic_or v[0:1], v2
3428; VI-NEXT:    s_waitcnt vmcnt(0)
3429; VI-NEXT:    buffer_wbinvl1_vol
3430; VI-NEXT:    s_setpc_b64 s[30:31]
3431;
3432; GFX9-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
3433; GFX9:       ; %bb.0:
3434; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3435; GFX9-NEXT:    global_atomic_or v[0:1], v2, off offset:16
3436; GFX9-NEXT:    s_waitcnt vmcnt(0)
3437; GFX9-NEXT:    buffer_wbinvl1_vol
3438; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Note the i64 index here; element index 4 of i32 is still byte offset 16.
3439  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3440  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3441  ret void
3442}
3443
; Returning `atomicrmw or` at a 16-byte offset, tagged with
; !amdgpu.no.remote.memory metadata. The expected instructions are the same
; as for global_atomic_or_i32_ret_offset: a single hardware atomic or with
; glc on every target.
3444define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
3445; SI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
3446; SI:       ; %bb.0:
3447; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3448; SI-NEXT:    s_mov_b32 s6, 0
3449; SI-NEXT:    s_mov_b32 s7, 0xf000
3450; SI-NEXT:    s_mov_b32 s4, s6
3451; SI-NEXT:    s_mov_b32 s5, s6
3452; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3453; SI-NEXT:    s_waitcnt vmcnt(0)
3454; SI-NEXT:    buffer_wbinvl1
3455; SI-NEXT:    v_mov_b32_e32 v0, v2
3456; SI-NEXT:    s_waitcnt expcnt(0)
3457; SI-NEXT:    s_setpc_b64 s[30:31]
3458;
3459; VI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
3460; VI:       ; %bb.0:
3461; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3462; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3463; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3464; VI-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
3465; VI-NEXT:    s_waitcnt vmcnt(0)
3466; VI-NEXT:    buffer_wbinvl1_vol
3467; VI-NEXT:    s_setpc_b64 s[30:31]
3468;
3469; GFX9-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
3470; GFX9:       ; %bb.0:
3471; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3472; GFX9-NEXT:    global_atomic_or v0, v[0:1], v2, off offset:16 glc
3473; GFX9-NEXT:    s_waitcnt vmcnt(0)
3474; GFX9-NEXT:    buffer_wbinvl1_vol
3475; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Note the i64 index here; element index 4 of i32 is still byte offset 16.
3476  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3477  %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3478  ret i32 %result
3479}
3480
3481; ---------------------------------------------------------------------
3482; atomicrmw xor
3483; ---------------------------------------------------------------------
3484
; Non-returning `atomicrmw xor` (seq_cst, no syncscope) on a global pointer.
; The old value is unused, so the expected atomic carries no glc bit:
; buffer_atomic_xor with addr64 for the run without -mcpu (SI),
; flat_atomic_xor for tonga (VI) and global_atomic_xor for gfx900 (GFX9).
; Every variant is followed by s_waitcnt vmcnt(0) and a buffer_wbinvl1[_vol].
3485define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
3486; SI-LABEL: global_atomic_xor_i32_noret:
3487; SI:       ; %bb.0:
3488; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3489; SI-NEXT:    s_mov_b32 s6, 0
3490; SI-NEXT:    s_mov_b32 s7, 0xf000
3491; SI-NEXT:    s_mov_b32 s4, s6
3492; SI-NEXT:    s_mov_b32 s5, s6
3493; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64
3494; SI-NEXT:    s_waitcnt vmcnt(0)
3495; SI-NEXT:    buffer_wbinvl1
3496; SI-NEXT:    s_waitcnt expcnt(0)
3497; SI-NEXT:    s_setpc_b64 s[30:31]
3498;
3499; VI-LABEL: global_atomic_xor_i32_noret:
3500; VI:       ; %bb.0:
3501; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3502; VI-NEXT:    flat_atomic_xor v[0:1], v2
3503; VI-NEXT:    s_waitcnt vmcnt(0)
3504; VI-NEXT:    buffer_wbinvl1_vol
3505; VI-NEXT:    s_setpc_b64 s[30:31]
3506;
3507; GFX9-LABEL: global_atomic_xor_i32_noret:
3508; GFX9:       ; %bb.0:
3509; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3510; GFX9-NEXT:    global_atomic_xor v[0:1], v2, off
3511; GFX9-NEXT:    s_waitcnt vmcnt(0)
3512; GFX9-NEXT:    buffer_wbinvl1_vol
3513; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; %tmp0 is intentionally left unused; this is the no-return form.
3514  %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
3515  ret void
3516}
3517
; Non-returning `atomicrmw xor` at a constant offset from the incoming
; pointer. The 16-byte offset is expected to fold into the instruction
; (offset:16) for the buffer (SI) and global (GFX9) forms, while the tonga
; (VI) flat form expects an explicit 64-bit add on the address registers.
3518define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
3519; SI-LABEL: global_atomic_xor_i32_noret_offset:
3520; SI:       ; %bb.0:
3521; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3522; SI-NEXT:    s_mov_b32 s6, 0
3523; SI-NEXT:    s_mov_b32 s7, 0xf000
3524; SI-NEXT:    s_mov_b32 s4, s6
3525; SI-NEXT:    s_mov_b32 s5, s6
3526; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16
3527; SI-NEXT:    s_waitcnt vmcnt(0)
3528; SI-NEXT:    buffer_wbinvl1
3529; SI-NEXT:    s_waitcnt expcnt(0)
3530; SI-NEXT:    s_setpc_b64 s[30:31]
3531;
3532; VI-LABEL: global_atomic_xor_i32_noret_offset:
3533; VI:       ; %bb.0:
3534; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3535; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3536; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3537; VI-NEXT:    flat_atomic_xor v[0:1], v2
3538; VI-NEXT:    s_waitcnt vmcnt(0)
3539; VI-NEXT:    buffer_wbinvl1_vol
3540; VI-NEXT:    s_setpc_b64 s[30:31]
3541;
3542; GFX9-LABEL: global_atomic_xor_i32_noret_offset:
3543; GFX9:       ; %bb.0:
3544; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3545; GFX9-NEXT:    global_atomic_xor v[0:1], v2, off offset:16
3546; GFX9-NEXT:    s_waitcnt vmcnt(0)
3547; GFX9-NEXT:    buffer_wbinvl1_vol
3548; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; i32 element index 4 is a byte offset of 16, matching offset:16 above.
3549  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3550  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
3551  ret void
3552}
3553
; Returning `atomicrmw xor` (seq_cst, no syncscope) on a global pointer.
; Because the old value is returned, each expected atomic carries glc.
; The buffer form (SI) gets the result in v2 and copies it to v0; the flat
; (VI) and global (GFX9) forms write the result directly into v0.
3554define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
3555; SI-LABEL: global_atomic_xor_i32_ret:
3556; SI:       ; %bb.0:
3557; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3558; SI-NEXT:    s_mov_b32 s6, 0
3559; SI-NEXT:    s_mov_b32 s7, 0xf000
3560; SI-NEXT:    s_mov_b32 s4, s6
3561; SI-NEXT:    s_mov_b32 s5, s6
3562; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc
3563; SI-NEXT:    s_waitcnt vmcnt(0)
3564; SI-NEXT:    buffer_wbinvl1
3565; SI-NEXT:    v_mov_b32_e32 v0, v2
3566; SI-NEXT:    s_waitcnt expcnt(0)
3567; SI-NEXT:    s_setpc_b64 s[30:31]
3568;
3569; VI-LABEL: global_atomic_xor_i32_ret:
3570; VI:       ; %bb.0:
3571; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3572; VI-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3573; VI-NEXT:    s_waitcnt vmcnt(0)
3574; VI-NEXT:    buffer_wbinvl1_vol
3575; VI-NEXT:    s_setpc_b64 s[30:31]
3576;
3577; GFX9-LABEL: global_atomic_xor_i32_ret:
3578; GFX9:       ; %bb.0:
3579; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3580; GFX9-NEXT:    global_atomic_xor v0, v[0:1], v2, off glc
3581; GFX9-NEXT:    s_waitcnt vmcnt(0)
3582; GFX9-NEXT:    buffer_wbinvl1_vol
3583; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The value produced by the atomicrmw is the function's return value.
3584  %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
3585  ret i32 %result
3586}
3587
; Returning `atomicrmw xor` at a constant 16-byte offset from the pointer.
; Expected atomics carry glc and, for the buffer (SI) and global (GFX9) forms,
; the folded offset:16; the tonga (VI) flat form expects the offset to be
; added to the address registers first.
3588define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
3589; SI-LABEL: global_atomic_xor_i32_ret_offset:
3590; SI:       ; %bb.0:
3591; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3592; SI-NEXT:    s_mov_b32 s6, 0
3593; SI-NEXT:    s_mov_b32 s7, 0xf000
3594; SI-NEXT:    s_mov_b32 s4, s6
3595; SI-NEXT:    s_mov_b32 s5, s6
3596; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3597; SI-NEXT:    s_waitcnt vmcnt(0)
3598; SI-NEXT:    buffer_wbinvl1
3599; SI-NEXT:    v_mov_b32_e32 v0, v2
3600; SI-NEXT:    s_waitcnt expcnt(0)
3601; SI-NEXT:    s_setpc_b64 s[30:31]
3602;
3603; VI-LABEL: global_atomic_xor_i32_ret_offset:
3604; VI:       ; %bb.0:
3605; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3606; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3607; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3608; VI-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3609; VI-NEXT:    s_waitcnt vmcnt(0)
3610; VI-NEXT:    buffer_wbinvl1_vol
3611; VI-NEXT:    s_setpc_b64 s[30:31]
3612;
3613; GFX9-LABEL: global_atomic_xor_i32_ret_offset:
3614; GFX9:       ; %bb.0:
3615; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3616; GFX9-NEXT:    global_atomic_xor v0, v[0:1], v2, off offset:16 glc
3617; GFX9-NEXT:    s_waitcnt vmcnt(0)
3618; GFX9-NEXT:    buffer_wbinvl1_vol
3619; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; i32 element index 4 is a byte offset of 16, matching offset:16 above.
3620  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3621  %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
3622  ret i32 %result
3623}
3624
; Non-returning `atomicrmw xor` with both arguments marked inreg under the
; amdgpu_gfx calling convention; the checks show the pointer arriving in
; s[4:5] and the operand in s6.
; In the run without -mcpu (SI), s6 and s7 are saved into lanes of v1 with
; v_writelane_b32 before being overwritten to complete the s[4:7] operand of
; buffer_atomic_xor, then restored with v_readlane_b32; v1 itself is spilled
; and reloaded around the body. The tonga (VI) run copies the SGPRs into VGPRs
; for flat_atomic_xor, and the gfx900 (GFX9) run uses s[4:5] as the base of
; global_atomic_xor with a zero VGPR offset.
3625define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3626; SI-LABEL: global_atomic_xor_i32_noret_scalar:
3627; SI:       ; %bb.0:
3628; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3629; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3630; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3631; SI-NEXT:    s_mov_b64 exec, s[34:35]
3632; SI-NEXT:    s_waitcnt expcnt(0)
3633; SI-NEXT:    v_writelane_b32 v1, s6, 0
3634; SI-NEXT:    v_writelane_b32 v1, s7, 1
3635; SI-NEXT:    s_mov_b32 s34, s6
3636; SI-NEXT:    s_mov_b32 s7, 0xf000
3637; SI-NEXT:    s_mov_b32 s6, -1
3638; SI-NEXT:    v_mov_b32_e32 v0, s34
3639; SI-NEXT:    s_waitcnt vmcnt(0)
3640; SI-NEXT:    buffer_atomic_xor v0, off, s[4:7], 0
3641; SI-NEXT:    s_waitcnt vmcnt(0)
3642; SI-NEXT:    buffer_wbinvl1
3643; SI-NEXT:    v_readlane_b32 s7, v1, 1
3644; SI-NEXT:    v_readlane_b32 s6, v1, 0
3645; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3646; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3647; SI-NEXT:    s_mov_b64 exec, s[34:35]
3648; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3649; SI-NEXT:    s_setpc_b64 s[30:31]
3650;
3651; VI-LABEL: global_atomic_xor_i32_noret_scalar:
3652; VI:       ; %bb.0:
3653; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3654; VI-NEXT:    v_mov_b32_e32 v0, s4
3655; VI-NEXT:    v_mov_b32_e32 v1, s5
3656; VI-NEXT:    v_mov_b32_e32 v2, s6
3657; VI-NEXT:    flat_atomic_xor v[0:1], v2
3658; VI-NEXT:    s_waitcnt vmcnt(0)
3659; VI-NEXT:    buffer_wbinvl1_vol
3660; VI-NEXT:    s_setpc_b64 s[30:31]
3661;
3662; GFX9-LABEL: global_atomic_xor_i32_noret_scalar:
3663; GFX9:       ; %bb.0:
3664; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3665; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3666; GFX9-NEXT:    v_mov_b32_e32 v1, s6
3667; GFX9-NEXT:    global_atomic_xor v0, v1, s[4:5]
3668; GFX9-NEXT:    s_waitcnt vmcnt(0)
3669; GFX9-NEXT:    buffer_wbinvl1_vol
3670; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; %tmp0 is intentionally left unused; this is the no-return form.
3671  %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
3672  ret void
3673}
3674
; Non-returning `atomicrmw xor` at a 16-byte offset, with inreg (SGPR)
; arguments under amdgpu_gfx. The offset is expected as offset:16 on the
; buffer (SI) and global (GFX9) atomics; the tonga (VI) run expects a scalar
; s_add_u32 / s_addc_u32 into s[34:35] before the address is copied to VGPRs.
; The run without -mcpu additionally saves and restores s6/s7 through lanes
; of v1.
3675define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3676; SI-LABEL: global_atomic_xor_i32_noret_offset_scalar:
3677; SI:       ; %bb.0:
3678; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3679; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3680; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3681; SI-NEXT:    s_mov_b64 exec, s[34:35]
3682; SI-NEXT:    s_waitcnt expcnt(0)
3683; SI-NEXT:    v_writelane_b32 v1, s6, 0
3684; SI-NEXT:    v_writelane_b32 v1, s7, 1
3685; SI-NEXT:    s_mov_b32 s34, s6
3686; SI-NEXT:    s_mov_b32 s7, 0xf000
3687; SI-NEXT:    s_mov_b32 s6, -1
3688; SI-NEXT:    v_mov_b32_e32 v0, s34
3689; SI-NEXT:    s_waitcnt vmcnt(0)
3690; SI-NEXT:    buffer_atomic_xor v0, off, s[4:7], 0 offset:16
3691; SI-NEXT:    s_waitcnt vmcnt(0)
3692; SI-NEXT:    buffer_wbinvl1
3693; SI-NEXT:    v_readlane_b32 s7, v1, 1
3694; SI-NEXT:    v_readlane_b32 s6, v1, 0
3695; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3696; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3697; SI-NEXT:    s_mov_b64 exec, s[34:35]
3698; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3699; SI-NEXT:    s_setpc_b64 s[30:31]
3700;
3701; VI-LABEL: global_atomic_xor_i32_noret_offset_scalar:
3702; VI:       ; %bb.0:
3703; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3704; VI-NEXT:    s_add_u32 s34, s4, 16
3705; VI-NEXT:    s_addc_u32 s35, s5, 0
3706; VI-NEXT:    v_mov_b32_e32 v0, s34
3707; VI-NEXT:    v_mov_b32_e32 v1, s35
3708; VI-NEXT:    v_mov_b32_e32 v2, s6
3709; VI-NEXT:    flat_atomic_xor v[0:1], v2
3710; VI-NEXT:    s_waitcnt vmcnt(0)
3711; VI-NEXT:    buffer_wbinvl1_vol
3712; VI-NEXT:    s_setpc_b64 s[30:31]
3713;
3714; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar:
3715; GFX9:       ; %bb.0:
3716; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3717; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3718; GFX9-NEXT:    v_mov_b32_e32 v1, s6
3719; GFX9-NEXT:    global_atomic_xor v0, v1, s[4:5] offset:16
3720; GFX9-NEXT:    s_waitcnt vmcnt(0)
3721; GFX9-NEXT:    buffer_wbinvl1_vol
3722; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; i32 element index 4 is a byte offset of 16, matching the 16 used above.
3723  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3724  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
3725  ret void
3726}
3727
; Returning `atomicrmw xor` with inreg (SGPR) arguments under amdgpu_gfx.
; All three expected atomics carry glc and leave the result in v0, so no
; extra copy of the return value appears in any of the check blocks.
; The run without -mcpu (SI) saves and restores s6/s7 through lanes of v1
; around the buffer atomic.
3728define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3729; SI-LABEL: global_atomic_xor_i32_ret_scalar:
3730; SI:       ; %bb.0:
3731; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3732; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3733; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3734; SI-NEXT:    s_mov_b64 exec, s[34:35]
3735; SI-NEXT:    s_waitcnt expcnt(0)
3736; SI-NEXT:    v_writelane_b32 v1, s6, 0
3737; SI-NEXT:    v_writelane_b32 v1, s7, 1
3738; SI-NEXT:    s_mov_b32 s34, s6
3739; SI-NEXT:    s_mov_b32 s7, 0xf000
3740; SI-NEXT:    s_mov_b32 s6, -1
3741; SI-NEXT:    v_mov_b32_e32 v0, s34
3742; SI-NEXT:    s_waitcnt vmcnt(0)
3743; SI-NEXT:    buffer_atomic_xor v0, off, s[4:7], 0 glc
3744; SI-NEXT:    s_waitcnt vmcnt(0)
3745; SI-NEXT:    buffer_wbinvl1
3746; SI-NEXT:    v_readlane_b32 s7, v1, 1
3747; SI-NEXT:    v_readlane_b32 s6, v1, 0
3748; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3749; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3750; SI-NEXT:    s_mov_b64 exec, s[34:35]
3751; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3752; SI-NEXT:    s_setpc_b64 s[30:31]
3753;
3754; VI-LABEL: global_atomic_xor_i32_ret_scalar:
3755; VI:       ; %bb.0:
3756; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3757; VI-NEXT:    v_mov_b32_e32 v0, s4
3758; VI-NEXT:    v_mov_b32_e32 v1, s5
3759; VI-NEXT:    v_mov_b32_e32 v2, s6
3760; VI-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3761; VI-NEXT:    s_waitcnt vmcnt(0)
3762; VI-NEXT:    buffer_wbinvl1_vol
3763; VI-NEXT:    s_setpc_b64 s[30:31]
3764;
3765; GFX9-LABEL: global_atomic_xor_i32_ret_scalar:
3766; GFX9:       ; %bb.0:
3767; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3768; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3769; GFX9-NEXT:    v_mov_b32_e32 v1, s6
3770; GFX9-NEXT:    global_atomic_xor v0, v0, v1, s[4:5] glc
3771; GFX9-NEXT:    s_waitcnt vmcnt(0)
3772; GFX9-NEXT:    buffer_wbinvl1_vol
3773; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The value produced by the atomicrmw is the function's return value.
3774  %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
3775  ret i32 %result
3776}
3777
; Value-returning seq_cst atomicrmw xor through a GEP of 4 x i32 (16 bytes)
; off an inreg global pointer, amdgpu_gfx calling convention.
; The default-CPU and gfx900 runs expect the 16 bytes folded into the
; instruction as offset:16. The tonga run expects the address to be formed
; with s_add_u32/s_addc_u32 first and flat_atomic_xor issued without an
; immediate offset.
3778define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3779; SI-LABEL: global_atomic_xor_i32_ret_offset_scalar:
3780; SI:       ; %bb.0:
3781; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3782; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3783; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3784; SI-NEXT:    s_mov_b64 exec, s[34:35]
3785; SI-NEXT:    s_waitcnt expcnt(0)
3786; SI-NEXT:    v_writelane_b32 v1, s6, 0
3787; SI-NEXT:    v_writelane_b32 v1, s7, 1
3788; SI-NEXT:    s_mov_b32 s34, s6
3789; SI-NEXT:    s_mov_b32 s7, 0xf000
3790; SI-NEXT:    s_mov_b32 s6, -1
3791; SI-NEXT:    v_mov_b32_e32 v0, s34
3792; SI-NEXT:    s_waitcnt vmcnt(0)
3793; SI-NEXT:    buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc
3794; SI-NEXT:    s_waitcnt vmcnt(0)
3795; SI-NEXT:    buffer_wbinvl1
3796; SI-NEXT:    v_readlane_b32 s7, v1, 1
3797; SI-NEXT:    v_readlane_b32 s6, v1, 0
3798; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3799; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3800; SI-NEXT:    s_mov_b64 exec, s[34:35]
3801; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3802; SI-NEXT:    s_setpc_b64 s[30:31]
3803;
3804; VI-LABEL: global_atomic_xor_i32_ret_offset_scalar:
3805; VI:       ; %bb.0:
3806; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3807; VI-NEXT:    s_add_u32 s34, s4, 16
3808; VI-NEXT:    s_addc_u32 s35, s5, 0
3809; VI-NEXT:    v_mov_b32_e32 v0, s34
3810; VI-NEXT:    v_mov_b32_e32 v1, s35
3811; VI-NEXT:    v_mov_b32_e32 v2, s6
3812; VI-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3813; VI-NEXT:    s_waitcnt vmcnt(0)
3814; VI-NEXT:    buffer_wbinvl1_vol
3815; VI-NEXT:    s_setpc_b64 s[30:31]
3816;
3817; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar:
3818; GFX9:       ; %bb.0:
3819; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3820; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3821; GFX9-NEXT:    v_mov_b32_e32 v1, s6
3822; GFX9-NEXT:    global_atomic_xor v0, v0, v1, s[4:5] offset:16 glc
3823; GFX9-NEXT:    s_waitcnt vmcnt(0)
3824; GFX9-NEXT:    buffer_wbinvl1_vol
3825; GFX9-NEXT:    s_setpc_b64 s[30:31]
3826  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3827  %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
3828  ret i32 %result
3829}
3830
; Value-returning seq_cst atomicrmw xor with a constant 0 operand (the stored
; value is left unchanged; only the old value is observed).
; All three run lines expect an atomic add of 0 with glc rather than an xor
; instruction: v2 is set to 0 and fed to buffer_atomic_add / flat_atomic_add /
; global_atomic_add.
3831define i32 @global_atomic_xor_0_i32_ret(ptr addrspace(1) %ptr) {
3832; SI-LABEL: global_atomic_xor_0_i32_ret:
3833; SI:       ; %bb.0:
3834; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3835; SI-NEXT:    s_mov_b32 s7, 0xf000
3836; SI-NEXT:    s_mov_b32 s6, 0
3837; SI-NEXT:    v_mov_b32_e32 v2, 0
3838; SI-NEXT:    s_mov_b32 s4, s6
3839; SI-NEXT:    s_mov_b32 s5, s6
3840; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
3841; SI-NEXT:    s_waitcnt vmcnt(0)
3842; SI-NEXT:    buffer_wbinvl1
3843; SI-NEXT:    v_mov_b32_e32 v0, v2
3844; SI-NEXT:    s_waitcnt expcnt(0)
3845; SI-NEXT:    s_setpc_b64 s[30:31]
3846;
3847; VI-LABEL: global_atomic_xor_0_i32_ret:
3848; VI:       ; %bb.0:
3849; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3850; VI-NEXT:    v_mov_b32_e32 v2, 0
3851; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
3852; VI-NEXT:    s_waitcnt vmcnt(0)
3853; VI-NEXT:    buffer_wbinvl1_vol
3854; VI-NEXT:    s_setpc_b64 s[30:31]
3855;
3856; GFX9-LABEL: global_atomic_xor_0_i32_ret:
3857; GFX9:       ; %bb.0:
3858; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3859; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3860; GFX9-NEXT:    global_atomic_add v0, v[0:1], v2, off glc
3861; GFX9-NEXT:    s_waitcnt vmcnt(0)
3862; GFX9-NEXT:    buffer_wbinvl1_vol
3863; GFX9-NEXT:    s_setpc_b64 s[30:31]
3864  %result = atomicrmw xor ptr addrspace(1) %ptr, i32 0 seq_cst
3865  ret i32 %result
3866}
3867
; seq_cst atomicrmw xor whose result is unused, through a GEP of 4 x i32
; (16 bytes), with !amdgpu.no.remote.memory metadata attached to the atomicrmw.
; The metadata node !0 is defined outside this part of the file.
; Every run line expects a native xor atomic without glc. The default-CPU and
; gfx900 runs fold the 16 bytes as offset:16; the tonga run adds 16 to the
; address in v[0:1] before flat_atomic_xor.
3868define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
3869; SI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
3870; SI:       ; %bb.0:
3871; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3872; SI-NEXT:    s_mov_b32 s6, 0
3873; SI-NEXT:    s_mov_b32 s7, 0xf000
3874; SI-NEXT:    s_mov_b32 s4, s6
3875; SI-NEXT:    s_mov_b32 s5, s6
3876; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16
3877; SI-NEXT:    s_waitcnt vmcnt(0)
3878; SI-NEXT:    buffer_wbinvl1
3879; SI-NEXT:    s_waitcnt expcnt(0)
3880; SI-NEXT:    s_setpc_b64 s[30:31]
3881;
3882; VI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
3883; VI:       ; %bb.0:
3884; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3885; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3886; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3887; VI-NEXT:    flat_atomic_xor v[0:1], v2
3888; VI-NEXT:    s_waitcnt vmcnt(0)
3889; VI-NEXT:    buffer_wbinvl1_vol
3890; VI-NEXT:    s_setpc_b64 s[30:31]
3891;
3892; GFX9-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
3893; GFX9:       ; %bb.0:
3894; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3895; GFX9-NEXT:    global_atomic_xor v[0:1], v2, off offset:16
3896; GFX9-NEXT:    s_waitcnt vmcnt(0)
3897; GFX9-NEXT:    buffer_wbinvl1_vol
3898; GFX9-NEXT:    s_setpc_b64 s[30:31]
3899  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3900  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3901  ret void
3902}
3903
; Value-returning counterpart of the noret no_remote_memory test: seq_cst
; atomicrmw xor through a GEP of 4 x i32 (16 bytes) with
; !amdgpu.no.remote.memory metadata, returning the old value.
; The metadata node !0 is defined outside this part of the file.
; Every run line expects a native xor atomic with glc. The default-CPU run
; copies the result from v2 into v0; the other two runs write it to v0
; directly.
3904define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
3905; SI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3906; SI:       ; %bb.0:
3907; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3908; SI-NEXT:    s_mov_b32 s6, 0
3909; SI-NEXT:    s_mov_b32 s7, 0xf000
3910; SI-NEXT:    s_mov_b32 s4, s6
3911; SI-NEXT:    s_mov_b32 s5, s6
3912; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3913; SI-NEXT:    s_waitcnt vmcnt(0)
3914; SI-NEXT:    buffer_wbinvl1
3915; SI-NEXT:    v_mov_b32_e32 v0, v2
3916; SI-NEXT:    s_waitcnt expcnt(0)
3917; SI-NEXT:    s_setpc_b64 s[30:31]
3918;
3919; VI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3920; VI:       ; %bb.0:
3921; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3922; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
3923; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3924; VI-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
3925; VI-NEXT:    s_waitcnt vmcnt(0)
3926; VI-NEXT:    buffer_wbinvl1_vol
3927; VI-NEXT:    s_setpc_b64 s[30:31]
3928;
3929; GFX9-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3930; GFX9:       ; %bb.0:
3931; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3932; GFX9-NEXT:    global_atomic_xor v0, v[0:1], v2, off offset:16 glc
3933; GFX9-NEXT:    s_waitcnt vmcnt(0)
3934; GFX9-NEXT:    buffer_wbinvl1_vol
3935; GFX9-NEXT:    s_setpc_b64 s[30:31]
3936  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3937  %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3938  ret i32 %result
3939}
3940
3941; ---------------------------------------------------------------------
3942; atomicrmw max
3943; ---------------------------------------------------------------------
3944
; seq_cst signed atomicrmw max on a global pointer, result unused. No
; syncscope is given on the atomicrmw.
; No native max atomic is expected on any run line. Each one checks a
; compare-and-swap loop (%atomicrmw.start): load the current value once, then
; repeatedly compute v_max_i32 against %in and issue a cmpswap with glc.
; Lanes whose returned value equals the compared value are accumulated in a
; mask and removed from exec, and the loop repeats while any lane remains.
3945define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
3946; SI-LABEL: global_atomic_max_i32_noret:
3947; SI:       ; %bb.0:
3948; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3949; SI-NEXT:    s_mov_b32 s6, 0
3950; SI-NEXT:    s_mov_b32 s7, 0xf000
3951; SI-NEXT:    s_mov_b32 s4, s6
3952; SI-NEXT:    s_mov_b32 s5, s6
3953; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
3954; SI-NEXT:    s_mov_b64 s[8:9], 0
3955; SI-NEXT:  .LBB83_1: ; %atomicrmw.start
3956; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
3957; SI-NEXT:    s_waitcnt vmcnt(0)
3958; SI-NEXT:    v_max_i32_e32 v3, v4, v2
3959; SI-NEXT:    s_waitcnt expcnt(0)
3960; SI-NEXT:    v_mov_b32_e32 v6, v4
3961; SI-NEXT:    v_mov_b32_e32 v5, v3
3962; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
3963; SI-NEXT:    s_waitcnt vmcnt(0)
3964; SI-NEXT:    buffer_wbinvl1
3965; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
3966; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
3967; SI-NEXT:    v_mov_b32_e32 v4, v5
3968; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
3969; SI-NEXT:    s_cbranch_execnz .LBB83_1
3970; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
3971; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
3972; SI-NEXT:    s_waitcnt expcnt(0)
3973; SI-NEXT:    s_setpc_b64 s[30:31]
3974;
3975; VI-LABEL: global_atomic_max_i32_noret:
3976; VI:       ; %bb.0:
3977; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3978; VI-NEXT:    flat_load_dword v4, v[0:1]
3979; VI-NEXT:    s_mov_b64 s[4:5], 0
3980; VI-NEXT:  .LBB83_1: ; %atomicrmw.start
3981; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
3982; VI-NEXT:    s_waitcnt vmcnt(0)
3983; VI-NEXT:    v_max_i32_e32 v3, v4, v2
3984; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3985; VI-NEXT:    s_waitcnt vmcnt(0)
3986; VI-NEXT:    buffer_wbinvl1_vol
3987; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
3988; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3989; VI-NEXT:    v_mov_b32_e32 v4, v3
3990; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3991; VI-NEXT:    s_cbranch_execnz .LBB83_1
3992; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
3993; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
3994; VI-NEXT:    s_setpc_b64 s[30:31]
3995;
3996; GFX9-LABEL: global_atomic_max_i32_noret:
3997; GFX9:       ; %bb.0:
3998; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3999; GFX9-NEXT:    global_load_dword v4, v[0:1], off
4000; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4001; GFX9-NEXT:  .LBB83_1: ; %atomicrmw.start
4002; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4003; GFX9-NEXT:    s_waitcnt vmcnt(0)
4004; GFX9-NEXT:    v_max_i32_e32 v3, v4, v2
4005; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
4006; GFX9-NEXT:    s_waitcnt vmcnt(0)
4007; GFX9-NEXT:    buffer_wbinvl1_vol
4008; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4009; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4010; GFX9-NEXT:    v_mov_b32_e32 v4, v3
4011; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4012; GFX9-NEXT:    s_cbranch_execnz .LBB83_1
4013; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4014; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4015; GFX9-NEXT:    s_setpc_b64 s[30:31]
4016  %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
4017  ret void
4018}
4019
; seq_cst signed atomicrmw max, result unused, through a GEP of 4 x i32
; (16 bytes).
; Every run line expects the compare-and-swap loop (load, v_max_i32, cmpswap
; with glc, repeat while any lane's swap has not matched). The default-CPU
; and gfx900 runs carry the 16 bytes as offset:16 on both the initial load
; and the cmpswap; the tonga run adds 16 to v[0:1] once before the loop.
4020define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
4021; SI-LABEL: global_atomic_max_i32_noret_offset:
4022; SI:       ; %bb.0:
4023; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4024; SI-NEXT:    s_mov_b32 s6, 0
4025; SI-NEXT:    s_mov_b32 s7, 0xf000
4026; SI-NEXT:    s_mov_b32 s4, s6
4027; SI-NEXT:    s_mov_b32 s5, s6
4028; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
4029; SI-NEXT:    s_mov_b64 s[8:9], 0
4030; SI-NEXT:  .LBB84_1: ; %atomicrmw.start
4031; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4032; SI-NEXT:    s_waitcnt vmcnt(0)
4033; SI-NEXT:    v_max_i32_e32 v3, v4, v2
4034; SI-NEXT:    s_waitcnt expcnt(0)
4035; SI-NEXT:    v_mov_b32_e32 v6, v4
4036; SI-NEXT:    v_mov_b32_e32 v5, v3
4037; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
4038; SI-NEXT:    s_waitcnt vmcnt(0)
4039; SI-NEXT:    buffer_wbinvl1
4040; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
4041; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
4042; SI-NEXT:    v_mov_b32_e32 v4, v5
4043; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
4044; SI-NEXT:    s_cbranch_execnz .LBB84_1
4045; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4046; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
4047; SI-NEXT:    s_waitcnt expcnt(0)
4048; SI-NEXT:    s_setpc_b64 s[30:31]
4049;
4050; VI-LABEL: global_atomic_max_i32_noret_offset:
4051; VI:       ; %bb.0:
4052; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4053; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
4054; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4055; VI-NEXT:    flat_load_dword v4, v[0:1]
4056; VI-NEXT:    s_mov_b64 s[4:5], 0
4057; VI-NEXT:  .LBB84_1: ; %atomicrmw.start
4058; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4059; VI-NEXT:    s_waitcnt vmcnt(0)
4060; VI-NEXT:    v_max_i32_e32 v3, v4, v2
4061; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4062; VI-NEXT:    s_waitcnt vmcnt(0)
4063; VI-NEXT:    buffer_wbinvl1_vol
4064; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4065; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4066; VI-NEXT:    v_mov_b32_e32 v4, v3
4067; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4068; VI-NEXT:    s_cbranch_execnz .LBB84_1
4069; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4070; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
4071; VI-NEXT:    s_setpc_b64 s[30:31]
4072;
4073; GFX9-LABEL: global_atomic_max_i32_noret_offset:
4074; GFX9:       ; %bb.0:
4075; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4076; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
4077; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4078; GFX9-NEXT:  .LBB84_1: ; %atomicrmw.start
4079; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4080; GFX9-NEXT:    s_waitcnt vmcnt(0)
4081; GFX9-NEXT:    v_max_i32_e32 v3, v4, v2
4082; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
4083; GFX9-NEXT:    s_waitcnt vmcnt(0)
4084; GFX9-NEXT:    buffer_wbinvl1_vol
4085; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4086; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4087; GFX9-NEXT:    v_mov_b32_e32 v4, v3
4088; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4089; GFX9-NEXT:    s_cbranch_execnz .LBB84_1
4090; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4091; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4092; GFX9-NEXT:    s_setpc_b64 s[30:31]
4093  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4094  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4095  ret void
4096}
4097
; Value-returning seq_cst signed atomicrmw max on a global pointer.
; Every run line expects the compare-and-swap loop (load, v_max_i32, cmpswap
; with glc, repeat while any lane's swap has not matched). After the loop
; the last value returned by the cmpswap (v3) is moved into v0 as the
; function result.
4098define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
4099; SI-LABEL: global_atomic_max_i32_ret:
4100; SI:       ; %bb.0:
4101; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4102; SI-NEXT:    s_mov_b32 s6, 0
4103; SI-NEXT:    s_mov_b32 s7, 0xf000
4104; SI-NEXT:    s_mov_b32 s4, s6
4105; SI-NEXT:    s_mov_b32 s5, s6
4106; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
4107; SI-NEXT:    s_mov_b64 s[8:9], 0
4108; SI-NEXT:  .LBB85_1: ; %atomicrmw.start
4109; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4110; SI-NEXT:    s_waitcnt vmcnt(0)
4111; SI-NEXT:    v_mov_b32_e32 v5, v3
4112; SI-NEXT:    s_waitcnt expcnt(0)
4113; SI-NEXT:    v_max_i32_e32 v4, v5, v2
4114; SI-NEXT:    v_mov_b32_e32 v3, v4
4115; SI-NEXT:    v_mov_b32_e32 v4, v5
4116; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
4117; SI-NEXT:    s_waitcnt vmcnt(0)
4118; SI-NEXT:    buffer_wbinvl1
4119; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
4120; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
4121; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
4122; SI-NEXT:    s_cbranch_execnz .LBB85_1
4123; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4124; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
4125; SI-NEXT:    v_mov_b32_e32 v0, v3
4126; SI-NEXT:    s_waitcnt expcnt(0)
4127; SI-NEXT:    s_setpc_b64 s[30:31]
4128;
4129; VI-LABEL: global_atomic_max_i32_ret:
4130; VI:       ; %bb.0:
4131; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4132; VI-NEXT:    flat_load_dword v3, v[0:1]
4133; VI-NEXT:    s_mov_b64 s[4:5], 0
4134; VI-NEXT:  .LBB85_1: ; %atomicrmw.start
4135; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4136; VI-NEXT:    s_waitcnt vmcnt(0)
4137; VI-NEXT:    v_mov_b32_e32 v4, v3
4138; VI-NEXT:    v_max_i32_e32 v3, v4, v2
4139; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4140; VI-NEXT:    s_waitcnt vmcnt(0)
4141; VI-NEXT:    buffer_wbinvl1_vol
4142; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4143; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4144; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4145; VI-NEXT:    s_cbranch_execnz .LBB85_1
4146; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4147; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
4148; VI-NEXT:    v_mov_b32_e32 v0, v3
4149; VI-NEXT:    s_setpc_b64 s[30:31]
4150;
4151; GFX9-LABEL: global_atomic_max_i32_ret:
4152; GFX9:       ; %bb.0:
4153; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4154; GFX9-NEXT:    global_load_dword v3, v[0:1], off
4155; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4156; GFX9-NEXT:  .LBB85_1: ; %atomicrmw.start
4157; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4158; GFX9-NEXT:    s_waitcnt vmcnt(0)
4159; GFX9-NEXT:    v_mov_b32_e32 v4, v3
4160; GFX9-NEXT:    v_max_i32_e32 v3, v4, v2
4161; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
4162; GFX9-NEXT:    s_waitcnt vmcnt(0)
4163; GFX9-NEXT:    buffer_wbinvl1_vol
4164; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4165; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4166; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4167; GFX9-NEXT:    s_cbranch_execnz .LBB85_1
4168; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4169; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4170; GFX9-NEXT:    v_mov_b32_e32 v0, v3
4171; GFX9-NEXT:    s_setpc_b64 s[30:31]
4172  %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
4173  ret i32 %result
4174}
4175
; Value-returning seq_cst signed atomicrmw max through a GEP of 4 x i32
; (16 bytes).
; Every run line expects the compare-and-swap loop. The default-CPU and
; gfx900 runs use offset:16 on the load and cmpswap and copy the result from
; v3 to v0 after the loop. The tonga run computes the address into v[3:4]
; up front, so the cmpswap result already lands in v0 and no final copy is
; expected.
4176define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
4177; SI-LABEL: global_atomic_max_i32_ret_offset:
4178; SI:       ; %bb.0:
4179; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4180; SI-NEXT:    s_mov_b32 s6, 0
4181; SI-NEXT:    s_mov_b32 s7, 0xf000
4182; SI-NEXT:    s_mov_b32 s4, s6
4183; SI-NEXT:    s_mov_b32 s5, s6
4184; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
4185; SI-NEXT:    s_mov_b64 s[8:9], 0
4186; SI-NEXT:  .LBB86_1: ; %atomicrmw.start
4187; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4188; SI-NEXT:    s_waitcnt vmcnt(0)
4189; SI-NEXT:    v_mov_b32_e32 v5, v3
4190; SI-NEXT:    s_waitcnt expcnt(0)
4191; SI-NEXT:    v_max_i32_e32 v4, v5, v2
4192; SI-NEXT:    v_mov_b32_e32 v3, v4
4193; SI-NEXT:    v_mov_b32_e32 v4, v5
4194; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
4195; SI-NEXT:    s_waitcnt vmcnt(0)
4196; SI-NEXT:    buffer_wbinvl1
4197; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
4198; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
4199; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
4200; SI-NEXT:    s_cbranch_execnz .LBB86_1
4201; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4202; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
4203; SI-NEXT:    v_mov_b32_e32 v0, v3
4204; SI-NEXT:    s_waitcnt expcnt(0)
4205; SI-NEXT:    s_setpc_b64 s[30:31]
4206;
4207; VI-LABEL: global_atomic_max_i32_ret_offset:
4208; VI:       ; %bb.0:
4209; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4210; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
4211; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4212; VI-NEXT:    flat_load_dword v0, v[3:4]
4213; VI-NEXT:    s_mov_b64 s[4:5], 0
4214; VI-NEXT:  .LBB86_1: ; %atomicrmw.start
4215; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4216; VI-NEXT:    s_waitcnt vmcnt(0)
4217; VI-NEXT:    v_mov_b32_e32 v1, v0
4218; VI-NEXT:    v_max_i32_e32 v0, v1, v2
4219; VI-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4220; VI-NEXT:    s_waitcnt vmcnt(0)
4221; VI-NEXT:    buffer_wbinvl1_vol
4222; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4223; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4224; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4225; VI-NEXT:    s_cbranch_execnz .LBB86_1
4226; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4227; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
4228; VI-NEXT:    s_setpc_b64 s[30:31]
4229;
4230; GFX9-LABEL: global_atomic_max_i32_ret_offset:
4231; GFX9:       ; %bb.0:
4232; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4233; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
4234; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4235; GFX9-NEXT:  .LBB86_1: ; %atomicrmw.start
4236; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4237; GFX9-NEXT:    s_waitcnt vmcnt(0)
4238; GFX9-NEXT:    v_mov_b32_e32 v4, v3
4239; GFX9-NEXT:    v_max_i32_e32 v3, v4, v2
4240; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
4241; GFX9-NEXT:    s_waitcnt vmcnt(0)
4242; GFX9-NEXT:    buffer_wbinvl1_vol
4243; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
4244; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4245; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4246; GFX9-NEXT:    s_cbranch_execnz .LBB86_1
4247; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4248; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4249; GFX9-NEXT:    v_mov_b32_e32 v0, v3
4250; GFX9-NEXT:    s_setpc_b64 s[30:31]
4251  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4252  %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4253  ret i32 %result
4254}
4255
; seq_cst signed atomicrmw max, result unused, with the pointer and operand
; passed inreg under the amdgpu_gfx calling convention.
; Every run line expects the compare-and-swap loop, with the operand read
; from an SGPR inside v_max_i32 (s34 on the default-CPU run, s6 on the
; others). The default-CPU run also saves s6/s7 in lanes of v4 and restores
; them after the loop, since it overwrites s6/s7 to build the s[4:7] buffer
; resource.
4256define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
4257; SI-LABEL: global_atomic_max_i32_noret_scalar:
4258; SI:       ; %bb.0:
4259; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4260; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4261; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
4262; SI-NEXT:    s_mov_b64 exec, s[34:35]
4263; SI-NEXT:    s_waitcnt expcnt(0)
4264; SI-NEXT:    v_writelane_b32 v4, s6, 0
4265; SI-NEXT:    v_writelane_b32 v4, s7, 1
4266; SI-NEXT:    s_mov_b32 s34, s6
4267; SI-NEXT:    s_mov_b32 s7, 0xf000
4268; SI-NEXT:    s_mov_b32 s6, -1
4269; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
4270; SI-NEXT:    s_mov_b64 s[36:37], 0
4271; SI-NEXT:  .LBB87_1: ; %atomicrmw.start
4272; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4273; SI-NEXT:    s_waitcnt vmcnt(0)
4274; SI-NEXT:    v_max_i32_e32 v0, s34, v1
4275; SI-NEXT:    s_waitcnt expcnt(0)
4276; SI-NEXT:    v_mov_b32_e32 v3, v1
4277; SI-NEXT:    v_mov_b32_e32 v2, v0
4278; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
4279; SI-NEXT:    s_waitcnt vmcnt(0)
4280; SI-NEXT:    buffer_wbinvl1
4281; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4282; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
4283; SI-NEXT:    v_mov_b32_e32 v1, v2
4284; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
4285; SI-NEXT:    s_cbranch_execnz .LBB87_1
4286; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4287; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
4288; SI-NEXT:    v_readlane_b32 s7, v4, 1
4289; SI-NEXT:    v_readlane_b32 s6, v4, 0
4290; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4291; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
4292; SI-NEXT:    s_mov_b64 exec, s[34:35]
4293; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4294; SI-NEXT:    s_setpc_b64 s[30:31]
4295;
4296; VI-LABEL: global_atomic_max_i32_noret_scalar:
4297; VI:       ; %bb.0:
4298; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4299; VI-NEXT:    v_mov_b32_e32 v0, s4
4300; VI-NEXT:    v_mov_b32_e32 v1, s5
4301; VI-NEXT:    flat_load_dword v3, v[0:1]
4302; VI-NEXT:    s_mov_b64 s[34:35], 0
4303; VI-NEXT:  .LBB87_1: ; %atomicrmw.start
4304; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4305; VI-NEXT:    s_waitcnt vmcnt(0)
4306; VI-NEXT:    v_max_i32_e32 v2, s6, v3
4307; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4308; VI-NEXT:    s_waitcnt vmcnt(0)
4309; VI-NEXT:    buffer_wbinvl1_vol
4310; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4311; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4312; VI-NEXT:    v_mov_b32_e32 v3, v2
4313; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4314; VI-NEXT:    s_cbranch_execnz .LBB87_1
4315; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4316; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
4317; VI-NEXT:    s_setpc_b64 s[30:31]
4318;
4319; GFX9-LABEL: global_atomic_max_i32_noret_scalar:
4320; GFX9:       ; %bb.0:
4321; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4322; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4323; GFX9-NEXT:    global_load_dword v1, v2, s[4:5]
4324; GFX9-NEXT:    s_mov_b64 s[34:35], 0
4325; GFX9-NEXT:  .LBB87_1: ; %atomicrmw.start
4326; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4327; GFX9-NEXT:    s_waitcnt vmcnt(0)
4328; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
4329; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
4330; GFX9-NEXT:    s_waitcnt vmcnt(0)
4331; GFX9-NEXT:    buffer_wbinvl1_vol
4332; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4333; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4334; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4335; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4336; GFX9-NEXT:    s_cbranch_execnz .LBB87_1
4337; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4338; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
4339; GFX9-NEXT:    s_setpc_b64 s[30:31]
4340  %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
4341  ret void
4342}
4343
; seq_cst signed atomicrmw max, result unused, through a GEP of 4 x i32
; (16 bytes) off an inreg pointer, amdgpu_gfx calling convention.
; Every run line expects the compare-and-swap loop. The default-CPU and
; gfx900 runs carry the 16 bytes as offset:16 on the load and cmpswap; the
; tonga run forms the address with s_add_u32/s_addc_u32 into s[34:35], copies
; it to v[0:1], and then reuses s[34:35] as the loop's lane mask.
4344define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
4345; SI-LABEL: global_atomic_max_i32_noret_offset_scalar:
4346; SI:       ; %bb.0:
4347; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4348; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4349; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
4350; SI-NEXT:    s_mov_b64 exec, s[34:35]
4351; SI-NEXT:    s_waitcnt expcnt(0)
4352; SI-NEXT:    v_writelane_b32 v4, s6, 0
4353; SI-NEXT:    v_writelane_b32 v4, s7, 1
4354; SI-NEXT:    s_mov_b32 s34, s6
4355; SI-NEXT:    s_mov_b32 s7, 0xf000
4356; SI-NEXT:    s_mov_b32 s6, -1
4357; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
4358; SI-NEXT:    s_mov_b64 s[36:37], 0
4359; SI-NEXT:  .LBB88_1: ; %atomicrmw.start
4360; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4361; SI-NEXT:    s_waitcnt vmcnt(0)
4362; SI-NEXT:    v_max_i32_e32 v0, s34, v1
4363; SI-NEXT:    s_waitcnt expcnt(0)
4364; SI-NEXT:    v_mov_b32_e32 v3, v1
4365; SI-NEXT:    v_mov_b32_e32 v2, v0
4366; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
4367; SI-NEXT:    s_waitcnt vmcnt(0)
4368; SI-NEXT:    buffer_wbinvl1
4369; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4370; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
4371; SI-NEXT:    v_mov_b32_e32 v1, v2
4372; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
4373; SI-NEXT:    s_cbranch_execnz .LBB88_1
4374; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4375; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
4376; SI-NEXT:    v_readlane_b32 s7, v4, 1
4377; SI-NEXT:    v_readlane_b32 s6, v4, 0
4378; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4379; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
4380; SI-NEXT:    s_mov_b64 exec, s[34:35]
4381; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4382; SI-NEXT:    s_setpc_b64 s[30:31]
4383;
4384; VI-LABEL: global_atomic_max_i32_noret_offset_scalar:
4385; VI:       ; %bb.0:
4386; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4387; VI-NEXT:    s_add_u32 s34, s4, 16
4388; VI-NEXT:    s_addc_u32 s35, s5, 0
4389; VI-NEXT:    v_mov_b32_e32 v0, s34
4390; VI-NEXT:    v_mov_b32_e32 v1, s35
4391; VI-NEXT:    flat_load_dword v3, v[0:1]
4392; VI-NEXT:    s_mov_b64 s[34:35], 0
4393; VI-NEXT:  .LBB88_1: ; %atomicrmw.start
4394; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4395; VI-NEXT:    s_waitcnt vmcnt(0)
4396; VI-NEXT:    v_max_i32_e32 v2, s6, v3
4397; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4398; VI-NEXT:    s_waitcnt vmcnt(0)
4399; VI-NEXT:    buffer_wbinvl1_vol
4400; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4401; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4402; VI-NEXT:    v_mov_b32_e32 v3, v2
4403; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4404; VI-NEXT:    s_cbranch_execnz .LBB88_1
4405; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4406; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
4407; VI-NEXT:    s_setpc_b64 s[30:31]
4408;
4409; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar:
4410; GFX9:       ; %bb.0:
4411; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4412; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4413; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:16
4414; GFX9-NEXT:    s_mov_b64 s[34:35], 0
4415; GFX9-NEXT:  .LBB88_1: ; %atomicrmw.start
4416; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4417; GFX9-NEXT:    s_waitcnt vmcnt(0)
4418; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
4419; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
4420; GFX9-NEXT:    s_waitcnt vmcnt(0)
4421; GFX9-NEXT:    buffer_wbinvl1_vol
4422; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4423; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4424; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4425; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4426; GFX9-NEXT:    s_cbranch_execnz .LBB88_1
4427; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4428; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
4429; GFX9-NEXT:    s_setpc_b64 s[30:31]
4430  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4431  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4432  ret void
4433}
4434
; Value-returning seq_cst signed atomicrmw max with the pointer and operand
; passed inreg under the amdgpu_gfx calling convention.
; Every run line expects the compare-and-swap loop, with the cmpswap result
; written to v0 so that no copy is needed after the loop. The default-CPU run
; saves s6/s7 in lanes of v3 and restores them after the loop. The tonga run
; materializes the s[4:5] address twice: into v[0:1] for the initial load and
; into v[1:2] for the cmpswap.
4435define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
4436; SI-LABEL: global_atomic_max_i32_ret_scalar:
4437; SI:       ; %bb.0:
4438; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4439; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4440; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
4441; SI-NEXT:    s_mov_b64 exec, s[34:35]
4442; SI-NEXT:    s_waitcnt expcnt(0)
4443; SI-NEXT:    v_writelane_b32 v3, s6, 0
4444; SI-NEXT:    v_writelane_b32 v3, s7, 1
4445; SI-NEXT:    s_mov_b32 s34, s6
4446; SI-NEXT:    s_mov_b32 s7, 0xf000
4447; SI-NEXT:    s_mov_b32 s6, -1
4448; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
4449; SI-NEXT:    s_mov_b64 s[36:37], 0
4450; SI-NEXT:  .LBB89_1: ; %atomicrmw.start
4451; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4452; SI-NEXT:    s_waitcnt vmcnt(0)
4453; SI-NEXT:    v_mov_b32_e32 v2, v0
4454; SI-NEXT:    s_waitcnt expcnt(0)
4455; SI-NEXT:    v_max_i32_e32 v1, s34, v2
4456; SI-NEXT:    v_mov_b32_e32 v0, v1
4457; SI-NEXT:    v_mov_b32_e32 v1, v2
4458; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
4459; SI-NEXT:    s_waitcnt vmcnt(0)
4460; SI-NEXT:    buffer_wbinvl1
4461; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
4462; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
4463; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
4464; SI-NEXT:    s_cbranch_execnz .LBB89_1
4465; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4466; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
4467; SI-NEXT:    v_readlane_b32 s7, v3, 1
4468; SI-NEXT:    v_readlane_b32 s6, v3, 0
4469; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4470; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
4471; SI-NEXT:    s_mov_b64 exec, s[34:35]
4472; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4473; SI-NEXT:    s_setpc_b64 s[30:31]
4474;
4475; VI-LABEL: global_atomic_max_i32_ret_scalar:
4476; VI:       ; %bb.0:
4477; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4478; VI-NEXT:    v_mov_b32_e32 v0, s4
4479; VI-NEXT:    v_mov_b32_e32 v1, s5
4480; VI-NEXT:    flat_load_dword v0, v[0:1]
4481; VI-NEXT:    v_mov_b32_e32 v1, s4
4482; VI-NEXT:    s_mov_b64 s[34:35], 0
4483; VI-NEXT:    v_mov_b32_e32 v2, s5
4484; VI-NEXT:  .LBB89_1: ; %atomicrmw.start
4485; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4486; VI-NEXT:    s_waitcnt vmcnt(0)
4487; VI-NEXT:    v_mov_b32_e32 v4, v0
4488; VI-NEXT:    v_max_i32_e32 v3, s6, v4
4489; VI-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4490; VI-NEXT:    s_waitcnt vmcnt(0)
4491; VI-NEXT:    buffer_wbinvl1_vol
4492; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
4493; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4494; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4495; VI-NEXT:    s_cbranch_execnz .LBB89_1
4496; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4497; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
4498; VI-NEXT:    s_setpc_b64 s[30:31]
4499;
4500; GFX9-LABEL: global_atomic_max_i32_ret_scalar:
4501; GFX9:       ; %bb.0:
4502; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4503; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4504; GFX9-NEXT:    global_load_dword v0, v1, s[4:5]
4505; GFX9-NEXT:    s_mov_b64 s[34:35], 0
4506; GFX9-NEXT:  .LBB89_1: ; %atomicrmw.start
4507; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4508; GFX9-NEXT:    s_waitcnt vmcnt(0)
4509; GFX9-NEXT:    v_mov_b32_e32 v3, v0
4510; GFX9-NEXT:    v_max_i32_e32 v2, s6, v3
4511; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
4512; GFX9-NEXT:    s_waitcnt vmcnt(0)
4513; GFX9-NEXT:    buffer_wbinvl1_vol
4514; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
4515; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4516; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4517; GFX9-NEXT:    s_cbranch_execnz .LBB89_1
4518; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4519; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
4520; GFX9-NEXT:    s_setpc_b64 s[30:31]
4521  %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
4522  ret i32 %result
4523}
4524
; Returning signed-max atomic using the amdgpu_gfx calling convention: both the
; pointer and the operand are `inreg` arguments, and the address is %out plus
; four i32 elements. The expectations show that displacement as a 16-byte
; immediate offset on the SI and GFX9 runs and as explicit
; s_add_u32/s_addc_u32 on the VI run. The seq_cst atomicrmw carries no
; syncscope, and every run is expected to expand it into an initial load
; followed by a compare-and-swap loop (%atomicrmw.start .. %atomicrmw.end)
; built around v_max_i32. The SI expectations additionally save s6/s7 into
; lanes of v3 before the loop and read them back before returning.
4525define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
4526; SI-LABEL: global_atomic_max_i32_ret_offset_scalar:
4527; SI:       ; %bb.0:
4528; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4529; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4530; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
4531; SI-NEXT:    s_mov_b64 exec, s[34:35]
4532; SI-NEXT:    s_waitcnt expcnt(0)
4533; SI-NEXT:    v_writelane_b32 v3, s6, 0
4534; SI-NEXT:    v_writelane_b32 v3, s7, 1
4535; SI-NEXT:    s_mov_b32 s34, s6
4536; SI-NEXT:    s_mov_b32 s7, 0xf000
4537; SI-NEXT:    s_mov_b32 s6, -1
4538; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
4539; SI-NEXT:    s_mov_b64 s[36:37], 0
4540; SI-NEXT:  .LBB90_1: ; %atomicrmw.start
4541; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4542; SI-NEXT:    s_waitcnt vmcnt(0)
4543; SI-NEXT:    v_mov_b32_e32 v2, v0
4544; SI-NEXT:    s_waitcnt expcnt(0)
4545; SI-NEXT:    v_max_i32_e32 v1, s34, v2
4546; SI-NEXT:    v_mov_b32_e32 v0, v1
4547; SI-NEXT:    v_mov_b32_e32 v1, v2
4548; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4549; SI-NEXT:    s_waitcnt vmcnt(0)
4550; SI-NEXT:    buffer_wbinvl1
4551; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
4552; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
4553; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
4554; SI-NEXT:    s_cbranch_execnz .LBB90_1
4555; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4556; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
4557; SI-NEXT:    v_readlane_b32 s7, v3, 1
4558; SI-NEXT:    v_readlane_b32 s6, v3, 0
4559; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4560; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
4561; SI-NEXT:    s_mov_b64 exec, s[34:35]
4562; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4563; SI-NEXT:    s_setpc_b64 s[30:31]
4564;
4565; VI-LABEL: global_atomic_max_i32_ret_offset_scalar:
4566; VI:       ; %bb.0:
4567; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4568; VI-NEXT:    s_add_u32 s34, s4, 16
4569; VI-NEXT:    s_addc_u32 s35, s5, 0
4570; VI-NEXT:    v_mov_b32_e32 v1, s34
4571; VI-NEXT:    v_mov_b32_e32 v2, s35
4572; VI-NEXT:    flat_load_dword v0, v[1:2]
4573; VI-NEXT:    s_mov_b64 s[34:35], 0
4574; VI-NEXT:  .LBB90_1: ; %atomicrmw.start
4575; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4576; VI-NEXT:    s_waitcnt vmcnt(0)
4577; VI-NEXT:    v_mov_b32_e32 v4, v0
4578; VI-NEXT:    v_max_i32_e32 v3, s6, v4
4579; VI-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4580; VI-NEXT:    s_waitcnt vmcnt(0)
4581; VI-NEXT:    buffer_wbinvl1_vol
4582; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
4583; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4584; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4585; VI-NEXT:    s_cbranch_execnz .LBB90_1
4586; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4587; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
4588; VI-NEXT:    s_setpc_b64 s[30:31]
4589;
4590; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar:
4591; GFX9:       ; %bb.0:
4592; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4593; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4594; GFX9-NEXT:    global_load_dword v0, v1, s[4:5] offset:16
4595; GFX9-NEXT:    s_mov_b64 s[34:35], 0
4596; GFX9-NEXT:  .LBB90_1: ; %atomicrmw.start
4597; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4598; GFX9-NEXT:    s_waitcnt vmcnt(0)
4599; GFX9-NEXT:    v_mov_b32_e32 v3, v0
4600; GFX9-NEXT:    v_max_i32_e32 v2, s6, v3
4601; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
4602; GFX9-NEXT:    s_waitcnt vmcnt(0)
4603; GFX9-NEXT:    buffer_wbinvl1_vol
4604; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
4605; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4606; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4607; GFX9-NEXT:    s_cbranch_execnz .LBB90_1
4608; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4609; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
4610; GFX9-NEXT:    s_setpc_b64 s[30:31]
4611  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4612  %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4613  ret i32 %result
4614}
4615
; Kernel variant with a dynamically indexed address: the target is
; %out[%index] advanced by a further four i32 elements, and the atomicrmw
; result is unused. The expectations still use a compare-and-swap loop
; (cmpswap with glc) around v_max_i32. The extra 16 bytes appear as an
; immediate offset on the SI and GFX9 runs and as explicit scalar adds on the
; VI run. The loop's starting value is fetched with a scalar s_load_dword, and
; the kernel finishes with s_endpgm directly after the loop.
4616define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) {
4617; SI-LABEL: atomic_max_i32_addr64_offset:
4618; SI:       ; %bb.0: ; %entry
4619; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4620; SI-NEXT:    s_waitcnt lgkmcnt(0)
4621; SI-NEXT:    s_ashr_i32 s5, s3, 31
4622; SI-NEXT:    s_mov_b32 s4, s3
4623; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4624; SI-NEXT:    s_add_u32 s4, s0, s4
4625; SI-NEXT:    s_addc_u32 s5, s1, s5
4626; SI-NEXT:    s_load_dword s3, s[4:5], 0x4
4627; SI-NEXT:    s_mov_b64 s[0:1], 0
4628; SI-NEXT:    s_mov_b32 s7, 0xf000
4629; SI-NEXT:    s_waitcnt lgkmcnt(0)
4630; SI-NEXT:    v_mov_b32_e32 v1, s3
4631; SI-NEXT:    s_mov_b32 s6, -1
4632; SI-NEXT:  .LBB91_1: ; %atomicrmw.start
4633; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4634; SI-NEXT:    v_max_i32_e32 v0, s2, v1
4635; SI-NEXT:    s_waitcnt expcnt(0)
4636; SI-NEXT:    v_mov_b32_e32 v3, v1
4637; SI-NEXT:    v_mov_b32_e32 v2, v0
4638; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
4639; SI-NEXT:    s_waitcnt vmcnt(0)
4640; SI-NEXT:    buffer_wbinvl1
4641; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4642; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4643; SI-NEXT:    v_mov_b32_e32 v1, v2
4644; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4645; SI-NEXT:    s_cbranch_execnz .LBB91_1
4646; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4647; SI-NEXT:    s_endpgm
4648;
4649; VI-LABEL: atomic_max_i32_addr64_offset:
4650; VI:       ; %bb.0: ; %entry
4651; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4652; VI-NEXT:    s_waitcnt lgkmcnt(0)
4653; VI-NEXT:    s_ashr_i32 s5, s3, 31
4654; VI-NEXT:    s_mov_b32 s4, s3
4655; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4656; VI-NEXT:    s_add_u32 s4, s0, s4
4657; VI-NEXT:    s_addc_u32 s5, s1, s5
4658; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
4659; VI-NEXT:    s_add_u32 s4, s4, 16
4660; VI-NEXT:    s_addc_u32 s5, s5, 0
4661; VI-NEXT:    v_mov_b32_e32 v0, s4
4662; VI-NEXT:    s_mov_b64 s[0:1], 0
4663; VI-NEXT:    s_waitcnt lgkmcnt(0)
4664; VI-NEXT:    v_mov_b32_e32 v3, s3
4665; VI-NEXT:    v_mov_b32_e32 v1, s5
4666; VI-NEXT:  .LBB91_1: ; %atomicrmw.start
4667; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4668; VI-NEXT:    v_max_i32_e32 v2, s2, v3
4669; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4670; VI-NEXT:    s_waitcnt vmcnt(0)
4671; VI-NEXT:    buffer_wbinvl1_vol
4672; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4673; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4674; VI-NEXT:    v_mov_b32_e32 v3, v2
4675; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4676; VI-NEXT:    s_cbranch_execnz .LBB91_1
4677; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4678; VI-NEXT:    s_endpgm
4679;
4680; GFX9-LABEL: atomic_max_i32_addr64_offset:
4681; GFX9:       ; %bb.0: ; %entry
4682; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4683; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4684; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4685; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
4686; GFX9-NEXT:    s_mov_b32 s4, s3
4687; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4688; GFX9-NEXT:    s_add_u32 s0, s0, s4
4689; GFX9-NEXT:    s_addc_u32 s1, s1, s5
4690; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
4691; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4692; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4693; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4694; GFX9-NEXT:  .LBB91_1: ; %atomicrmw.start
4695; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4696; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
4697; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
4698; GFX9-NEXT:    s_waitcnt vmcnt(0)
4699; GFX9-NEXT:    buffer_wbinvl1_vol
4700; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4701; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4702; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4703; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4704; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
4705; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4706; GFX9-NEXT:    s_endpgm
4707entry:
4708  %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
4709  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
4710  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4711  ret void
4712}
4713
; Returning counterpart of the indexed-plus-offset kernel test: the seq_cst
; signed max targets %out[%index] advanced by four i32 elements, and the value
; produced by the atomicrmw is stored through %out2. The expectations are a
; compare-and-swap loop around v_max_i32; once the loop exits, exec is restored
; with s_or_b64 and the loop result is written out with a plain dword store
; (buffer, flat or global, depending on the run).
4714define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
4715; SI-LABEL: atomic_max_i32_ret_addr64_offset:
4716; SI:       ; %bb.0: ; %entry
4717; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
4718; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4719; SI-NEXT:    s_waitcnt lgkmcnt(0)
4720; SI-NEXT:    s_ashr_i32 s5, s9, 31
4721; SI-NEXT:    s_mov_b32 s4, s9
4722; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4723; SI-NEXT:    s_add_u32 s4, s0, s4
4724; SI-NEXT:    s_addc_u32 s5, s1, s5
4725; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
4726; SI-NEXT:    s_mov_b64 s[0:1], 0
4727; SI-NEXT:    s_mov_b32 s7, 0xf000
4728; SI-NEXT:    s_waitcnt lgkmcnt(0)
4729; SI-NEXT:    v_mov_b32_e32 v1, s6
4730; SI-NEXT:    s_mov_b32 s6, -1
4731; SI-NEXT:  .LBB92_1: ; %atomicrmw.start
4732; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4733; SI-NEXT:    v_max_i32_e32 v0, s8, v1
4734; SI-NEXT:    s_waitcnt expcnt(0)
4735; SI-NEXT:    v_mov_b32_e32 v3, v1
4736; SI-NEXT:    v_mov_b32_e32 v2, v0
4737; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
4738; SI-NEXT:    s_waitcnt vmcnt(0)
4739; SI-NEXT:    buffer_wbinvl1
4740; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4741; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4742; SI-NEXT:    v_mov_b32_e32 v1, v2
4743; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4744; SI-NEXT:    s_cbranch_execnz .LBB92_1
4745; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4746; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
4747; SI-NEXT:    s_mov_b32 s7, 0xf000
4748; SI-NEXT:    s_mov_b32 s6, -1
4749; SI-NEXT:    s_mov_b32 s4, s2
4750; SI-NEXT:    s_mov_b32 s5, s3
4751; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
4752; SI-NEXT:    s_endpgm
4753;
4754; VI-LABEL: atomic_max_i32_ret_addr64_offset:
4755; VI:       ; %bb.0: ; %entry
4756; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4757; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4758; VI-NEXT:    s_waitcnt lgkmcnt(0)
4759; VI-NEXT:    s_ashr_i32 s5, s7, 31
4760; VI-NEXT:    s_mov_b32 s4, s7
4761; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4762; VI-NEXT:    s_add_u32 s4, s0, s4
4763; VI-NEXT:    s_addc_u32 s5, s1, s5
4764; VI-NEXT:    s_load_dword s7, s[4:5], 0x10
4765; VI-NEXT:    s_add_u32 s4, s4, 16
4766; VI-NEXT:    s_addc_u32 s5, s5, 0
4767; VI-NEXT:    v_mov_b32_e32 v0, s4
4768; VI-NEXT:    s_mov_b64 s[0:1], 0
4769; VI-NEXT:    s_waitcnt lgkmcnt(0)
4770; VI-NEXT:    v_mov_b32_e32 v2, s7
4771; VI-NEXT:    v_mov_b32_e32 v1, s5
4772; VI-NEXT:  .LBB92_1: ; %atomicrmw.start
4773; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4774; VI-NEXT:    v_mov_b32_e32 v3, v2
4775; VI-NEXT:    v_max_i32_e32 v2, s6, v3
4776; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4777; VI-NEXT:    s_waitcnt vmcnt(0)
4778; VI-NEXT:    buffer_wbinvl1_vol
4779; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4780; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4781; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4782; VI-NEXT:    s_cbranch_execnz .LBB92_1
4783; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4784; VI-NEXT:    s_or_b64 exec, exec, s[0:1]
4785; VI-NEXT:    v_mov_b32_e32 v0, s2
4786; VI-NEXT:    v_mov_b32_e32 v1, s3
4787; VI-NEXT:    flat_store_dword v[0:1], v2
4788; VI-NEXT:    s_endpgm
4789;
4790; GFX9-LABEL: atomic_max_i32_ret_addr64_offset:
4791; GFX9:       ; %bb.0: ; %entry
4792; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4793; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4794; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4795; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4796; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
4797; GFX9-NEXT:    s_mov_b32 s4, s7
4798; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4799; GFX9-NEXT:    s_add_u32 s0, s0, s4
4800; GFX9-NEXT:    s_addc_u32 s1, s1, s5
4801; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x10
4802; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4803; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4804; GFX9-NEXT:    v_mov_b32_e32 v0, s7
4805; GFX9-NEXT:  .LBB92_1: ; %atomicrmw.start
4806; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4807; GFX9-NEXT:    v_mov_b32_e32 v3, v0
4808; GFX9-NEXT:    v_max_i32_e32 v2, s6, v3
4809; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc
4810; GFX9-NEXT:    s_waitcnt vmcnt(0)
4811; GFX9-NEXT:    buffer_wbinvl1_vol
4812; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
4813; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4814; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4815; GFX9-NEXT:    s_cbranch_execnz .LBB92_1
4816; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4817; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4818; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4819; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
4820; GFX9-NEXT:    s_endpgm
4821entry:
4822  %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
4823  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
4824  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4825  store i32 %tmp0, ptr addrspace(1) %out2
4826  ret void
4827}
4828
; Kernel test for a seq_cst signed max on %out[%index] with no additional
; constant displacement; the atomicrmw result is unused. Every run is expected
; to emit a compare-and-swap loop around v_max_i32, seeded by a scalar
; s_load_dword at offset 0x0, and the cmpswap instructions carry no offset
; modifier. The kernel ends with s_endpgm immediately after the loop.
4829define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i32 %index) {
4830; SI-LABEL: atomic_max_i32_addr64:
4831; SI:       ; %bb.0: ; %entry
4832; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4833; SI-NEXT:    s_waitcnt lgkmcnt(0)
4834; SI-NEXT:    s_ashr_i32 s5, s3, 31
4835; SI-NEXT:    s_mov_b32 s4, s3
4836; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4837; SI-NEXT:    s_add_u32 s4, s0, s4
4838; SI-NEXT:    s_addc_u32 s5, s1, s5
4839; SI-NEXT:    s_load_dword s3, s[4:5], 0x0
4840; SI-NEXT:    s_mov_b64 s[0:1], 0
4841; SI-NEXT:    s_mov_b32 s7, 0xf000
4842; SI-NEXT:    s_waitcnt lgkmcnt(0)
4843; SI-NEXT:    v_mov_b32_e32 v1, s3
4844; SI-NEXT:    s_mov_b32 s6, -1
4845; SI-NEXT:  .LBB93_1: ; %atomicrmw.start
4846; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4847; SI-NEXT:    v_max_i32_e32 v0, s2, v1
4848; SI-NEXT:    s_waitcnt expcnt(0)
4849; SI-NEXT:    v_mov_b32_e32 v3, v1
4850; SI-NEXT:    v_mov_b32_e32 v2, v0
4851; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
4852; SI-NEXT:    s_waitcnt vmcnt(0)
4853; SI-NEXT:    buffer_wbinvl1
4854; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4855; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4856; SI-NEXT:    v_mov_b32_e32 v1, v2
4857; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4858; SI-NEXT:    s_cbranch_execnz .LBB93_1
4859; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4860; SI-NEXT:    s_endpgm
4861;
4862; VI-LABEL: atomic_max_i32_addr64:
4863; VI:       ; %bb.0: ; %entry
4864; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4865; VI-NEXT:    s_waitcnt lgkmcnt(0)
4866; VI-NEXT:    s_ashr_i32 s5, s3, 31
4867; VI-NEXT:    s_mov_b32 s4, s3
4868; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4869; VI-NEXT:    s_add_u32 s4, s0, s4
4870; VI-NEXT:    s_addc_u32 s5, s1, s5
4871; VI-NEXT:    s_load_dword s3, s[4:5], 0x0
4872; VI-NEXT:    v_mov_b32_e32 v0, s4
4873; VI-NEXT:    s_mov_b64 s[0:1], 0
4874; VI-NEXT:    v_mov_b32_e32 v1, s5
4875; VI-NEXT:    s_waitcnt lgkmcnt(0)
4876; VI-NEXT:    v_mov_b32_e32 v3, s3
4877; VI-NEXT:  .LBB93_1: ; %atomicrmw.start
4878; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4879; VI-NEXT:    v_max_i32_e32 v2, s2, v3
4880; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4881; VI-NEXT:    s_waitcnt vmcnt(0)
4882; VI-NEXT:    buffer_wbinvl1_vol
4883; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4884; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4885; VI-NEXT:    v_mov_b32_e32 v3, v2
4886; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4887; VI-NEXT:    s_cbranch_execnz .LBB93_1
4888; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4889; VI-NEXT:    s_endpgm
4890;
4891; GFX9-LABEL: atomic_max_i32_addr64:
4892; GFX9:       ; %bb.0: ; %entry
4893; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4894; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4895; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4896; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
4897; GFX9-NEXT:    s_mov_b32 s4, s3
4898; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4899; GFX9-NEXT:    s_add_u32 s0, s0, s4
4900; GFX9-NEXT:    s_addc_u32 s1, s1, s5
4901; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x0
4902; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4903; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4904; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4905; GFX9-NEXT:  .LBB93_1: ; %atomicrmw.start
4906; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4907; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
4908; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
4909; GFX9-NEXT:    s_waitcnt vmcnt(0)
4910; GFX9-NEXT:    buffer_wbinvl1_vol
4911; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
4912; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4913; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4914; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4915; GFX9-NEXT:    s_cbranch_execnz .LBB93_1
4916; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4917; GFX9-NEXT:    s_endpgm
4918entry:
4919  %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
4920  %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
4921  ret void
4922}
4923
; Returning counterpart of the indexed kernel test without a constant
; displacement: the seq_cst signed max targets %out[%index] and the value
; produced by the atomicrmw is stored through %out2. The expectations are a
; compare-and-swap loop around v_max_i32 with no offset modifier on the
; cmpswap, followed by restoring exec with s_or_b64 and a plain dword store of
; the loop result.
4924define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
4925; SI-LABEL: atomic_max_i32_ret_addr64:
4926; SI:       ; %bb.0: ; %entry
4927; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
4928; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4929; SI-NEXT:    s_waitcnt lgkmcnt(0)
4930; SI-NEXT:    s_ashr_i32 s5, s9, 31
4931; SI-NEXT:    s_mov_b32 s4, s9
4932; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4933; SI-NEXT:    s_add_u32 s4, s0, s4
4934; SI-NEXT:    s_addc_u32 s5, s1, s5
4935; SI-NEXT:    s_load_dword s6, s[4:5], 0x0
4936; SI-NEXT:    s_mov_b64 s[0:1], 0
4937; SI-NEXT:    s_mov_b32 s7, 0xf000
4938; SI-NEXT:    s_waitcnt lgkmcnt(0)
4939; SI-NEXT:    v_mov_b32_e32 v1, s6
4940; SI-NEXT:    s_mov_b32 s6, -1
4941; SI-NEXT:  .LBB94_1: ; %atomicrmw.start
4942; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4943; SI-NEXT:    v_max_i32_e32 v0, s8, v1
4944; SI-NEXT:    s_waitcnt expcnt(0)
4945; SI-NEXT:    v_mov_b32_e32 v3, v1
4946; SI-NEXT:    v_mov_b32_e32 v2, v0
4947; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
4948; SI-NEXT:    s_waitcnt vmcnt(0)
4949; SI-NEXT:    buffer_wbinvl1
4950; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
4951; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4952; SI-NEXT:    v_mov_b32_e32 v1, v2
4953; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4954; SI-NEXT:    s_cbranch_execnz .LBB94_1
4955; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4956; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
4957; SI-NEXT:    s_mov_b32 s7, 0xf000
4958; SI-NEXT:    s_mov_b32 s6, -1
4959; SI-NEXT:    s_mov_b32 s4, s2
4960; SI-NEXT:    s_mov_b32 s5, s3
4961; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
4962; SI-NEXT:    s_endpgm
4963;
4964; VI-LABEL: atomic_max_i32_ret_addr64:
4965; VI:       ; %bb.0: ; %entry
4966; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4967; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4968; VI-NEXT:    s_waitcnt lgkmcnt(0)
4969; VI-NEXT:    s_ashr_i32 s5, s7, 31
4970; VI-NEXT:    s_mov_b32 s4, s7
4971; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
4972; VI-NEXT:    s_add_u32 s4, s0, s4
4973; VI-NEXT:    s_addc_u32 s5, s1, s5
4974; VI-NEXT:    s_load_dword s7, s[4:5], 0x0
4975; VI-NEXT:    v_mov_b32_e32 v0, s4
4976; VI-NEXT:    s_mov_b64 s[0:1], 0
4977; VI-NEXT:    v_mov_b32_e32 v1, s5
4978; VI-NEXT:    s_waitcnt lgkmcnt(0)
4979; VI-NEXT:    v_mov_b32_e32 v2, s7
4980; VI-NEXT:  .LBB94_1: ; %atomicrmw.start
4981; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4982; VI-NEXT:    v_mov_b32_e32 v3, v2
4983; VI-NEXT:    v_max_i32_e32 v2, s6, v3
4984; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4985; VI-NEXT:    s_waitcnt vmcnt(0)
4986; VI-NEXT:    buffer_wbinvl1_vol
4987; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
4988; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4989; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4990; VI-NEXT:    s_cbranch_execnz .LBB94_1
4991; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4992; VI-NEXT:    s_or_b64 exec, exec, s[0:1]
4993; VI-NEXT:    v_mov_b32_e32 v0, s2
4994; VI-NEXT:    v_mov_b32_e32 v1, s3
4995; VI-NEXT:    flat_store_dword v[0:1], v2
4996; VI-NEXT:    s_endpgm
4997;
4998; GFX9-LABEL: atomic_max_i32_ret_addr64:
4999; GFX9:       ; %bb.0: ; %entry
5000; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5001; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5002; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5003; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5004; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
5005; GFX9-NEXT:    s_mov_b32 s4, s7
5006; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5007; GFX9-NEXT:    s_add_u32 s0, s0, s4
5008; GFX9-NEXT:    s_addc_u32 s1, s1, s5
5009; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
5010; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5011; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5012; GFX9-NEXT:    v_mov_b32_e32 v0, s7
5013; GFX9-NEXT:  .LBB94_1: ; %atomicrmw.start
5014; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5015; GFX9-NEXT:    v_mov_b32_e32 v3, v0
5016; GFX9-NEXT:    v_max_i32_e32 v2, s6, v3
5017; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
5018; GFX9-NEXT:    s_waitcnt vmcnt(0)
5019; GFX9-NEXT:    buffer_wbinvl1_vol
5020; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
5021; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5022; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5023; GFX9-NEXT:    s_cbranch_execnz .LBB94_1
5024; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5025; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5026; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5027; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
5028; GFX9-NEXT:    s_endpgm
5029entry:
5030  %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
5031  %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
5032  store i32 %tmp0, ptr addrspace(1) %out2
5033  ret void
5034}
5035
; Non-returning signed max at %out plus four i32 elements, where the atomicrmw
; is annotated with !amdgpu.no.remote.memory (metadata node !0, defined
; elsewhere in this file). The expectations show that, with this annotation
; present, all three runs still produce a load plus compare-and-swap loop
; around v_max_i32 rather than a single atomic max instruction. The 16-byte
; displacement is an immediate offset on the SI and GFX9 runs and an explicit
; v_add_u32/v_addc_u32 pair on the VI run.
5036define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
5037; SI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
5038; SI:       ; %bb.0:
5039; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5040; SI-NEXT:    s_mov_b32 s6, 0
5041; SI-NEXT:    s_mov_b32 s7, 0xf000
5042; SI-NEXT:    s_mov_b32 s4, s6
5043; SI-NEXT:    s_mov_b32 s5, s6
5044; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
5045; SI-NEXT:    s_mov_b64 s[8:9], 0
5046; SI-NEXT:  .LBB95_1: ; %atomicrmw.start
5047; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5048; SI-NEXT:    s_waitcnt vmcnt(0)
5049; SI-NEXT:    v_max_i32_e32 v3, v4, v2
5050; SI-NEXT:    s_waitcnt expcnt(0)
5051; SI-NEXT:    v_mov_b32_e32 v6, v4
5052; SI-NEXT:    v_mov_b32_e32 v5, v3
5053; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
5054; SI-NEXT:    s_waitcnt vmcnt(0)
5055; SI-NEXT:    buffer_wbinvl1
5056; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
5057; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5058; SI-NEXT:    v_mov_b32_e32 v4, v5
5059; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5060; SI-NEXT:    s_cbranch_execnz .LBB95_1
5061; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5062; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5063; SI-NEXT:    s_waitcnt expcnt(0)
5064; SI-NEXT:    s_setpc_b64 s[30:31]
5065;
5066; VI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
5067; VI:       ; %bb.0:
5068; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5069; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
5070; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5071; VI-NEXT:    flat_load_dword v4, v[0:1]
5072; VI-NEXT:    s_mov_b64 s[4:5], 0
5073; VI-NEXT:  .LBB95_1: ; %atomicrmw.start
5074; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5075; VI-NEXT:    s_waitcnt vmcnt(0)
5076; VI-NEXT:    v_max_i32_e32 v3, v4, v2
5077; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5078; VI-NEXT:    s_waitcnt vmcnt(0)
5079; VI-NEXT:    buffer_wbinvl1_vol
5080; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5081; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5082; VI-NEXT:    v_mov_b32_e32 v4, v3
5083; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5084; VI-NEXT:    s_cbranch_execnz .LBB95_1
5085; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5086; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5087; VI-NEXT:    s_setpc_b64 s[30:31]
5088;
5089; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
5090; GFX9:       ; %bb.0:
5091; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5092; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
5093; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5094; GFX9-NEXT:  .LBB95_1: ; %atomicrmw.start
5095; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5096; GFX9-NEXT:    s_waitcnt vmcnt(0)
5097; GFX9-NEXT:    v_max_i32_e32 v3, v4, v2
5098; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
5099; GFX9-NEXT:    s_waitcnt vmcnt(0)
5100; GFX9-NEXT:    buffer_wbinvl1_vol
5101; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5102; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5103; GFX9-NEXT:    v_mov_b32_e32 v4, v3
5104; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5105; GFX9-NEXT:    s_cbranch_execnz .LBB95_1
5106; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5107; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5108; GFX9-NEXT:    s_setpc_b64 s[30:31]
5109  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
5110  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
5111  ret void
5112}
5113
; Returning signed max at %out plus four i32 elements, with the atomicrmw
; annotated by !amdgpu.no.remote.memory (metadata node !0, defined elsewhere in
; this file); the function returns the atomicrmw result. As in the
; non-returning variant, the expectations are a load plus compare-and-swap
; loop around v_max_i32 on every run. The SI and GFX9 expectations copy the
; loop result from v3 into v0 before returning, while the VI expectations
; already produce it in v0.
5114define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
5115; SI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
5116; SI:       ; %bb.0:
5117; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5118; SI-NEXT:    s_mov_b32 s6, 0
5119; SI-NEXT:    s_mov_b32 s7, 0xf000
5120; SI-NEXT:    s_mov_b32 s4, s6
5121; SI-NEXT:    s_mov_b32 s5, s6
5122; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
5123; SI-NEXT:    s_mov_b64 s[8:9], 0
5124; SI-NEXT:  .LBB96_1: ; %atomicrmw.start
5125; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5126; SI-NEXT:    s_waitcnt vmcnt(0)
5127; SI-NEXT:    v_mov_b32_e32 v5, v3
5128; SI-NEXT:    s_waitcnt expcnt(0)
5129; SI-NEXT:    v_max_i32_e32 v4, v5, v2
5130; SI-NEXT:    v_mov_b32_e32 v3, v4
5131; SI-NEXT:    v_mov_b32_e32 v4, v5
5132; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
5133; SI-NEXT:    s_waitcnt vmcnt(0)
5134; SI-NEXT:    buffer_wbinvl1
5135; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
5136; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5137; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5138; SI-NEXT:    s_cbranch_execnz .LBB96_1
5139; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5140; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5141; SI-NEXT:    v_mov_b32_e32 v0, v3
5142; SI-NEXT:    s_waitcnt expcnt(0)
5143; SI-NEXT:    s_setpc_b64 s[30:31]
5144;
5145; VI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
5146; VI:       ; %bb.0:
5147; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5148; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
5149; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
5150; VI-NEXT:    flat_load_dword v0, v[3:4]
5151; VI-NEXT:    s_mov_b64 s[4:5], 0
5152; VI-NEXT:  .LBB96_1: ; %atomicrmw.start
5153; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5154; VI-NEXT:    s_waitcnt vmcnt(0)
5155; VI-NEXT:    v_mov_b32_e32 v1, v0
5156; VI-NEXT:    v_max_i32_e32 v0, v1, v2
5157; VI-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5158; VI-NEXT:    s_waitcnt vmcnt(0)
5159; VI-NEXT:    buffer_wbinvl1_vol
5160; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
5161; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5162; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5163; VI-NEXT:    s_cbranch_execnz .LBB96_1
5164; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5165; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5166; VI-NEXT:    s_setpc_b64 s[30:31]
5167;
5168; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
5169; GFX9:       ; %bb.0:
5170; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5171; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
5172; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5173; GFX9-NEXT:  .LBB96_1: ; %atomicrmw.start
5174; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5175; GFX9-NEXT:    s_waitcnt vmcnt(0)
5176; GFX9-NEXT:    v_mov_b32_e32 v4, v3
5177; GFX9-NEXT:    v_max_i32_e32 v3, v4, v2
5178; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
5179; GFX9-NEXT:    s_waitcnt vmcnt(0)
5180; GFX9-NEXT:    buffer_wbinvl1_vol
5181; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5182; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5183; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5184; GFX9-NEXT:    s_cbranch_execnz .LBB96_1
5185; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5186; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5187; GFX9-NEXT:    v_mov_b32_e32 v0, v3
5188; GFX9-NEXT:    s_setpc_b64 s[30:31]
5189  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
5190  %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
5191  ret i32 %result
5192}
5193
5194; ---------------------------------------------------------------------
5195; atomicrmw umax
5196; ---------------------------------------------------------------------
5197
; Baseline unsigned-max test: a seq_cst atomicrmw umax directly on %ptr with no
; offset and an unused result. All three runs are expected to load the current
; value once and then iterate a compare-and-swap loop
; (%atomicrmw.start .. %atomicrmw.end) that recomputes the candidate with
; v_max_u32, i.e. the unsigned counterpart of the v_max_i32 used by the
; preceding signed max tests.
5198define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
5199; SI-LABEL: global_atomic_umax_i32_noret:
5200; SI:       ; %bb.0:
5201; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5202; SI-NEXT:    s_mov_b32 s6, 0
5203; SI-NEXT:    s_mov_b32 s7, 0xf000
5204; SI-NEXT:    s_mov_b32 s4, s6
5205; SI-NEXT:    s_mov_b32 s5, s6
5206; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
5207; SI-NEXT:    s_mov_b64 s[8:9], 0
5208; SI-NEXT:  .LBB97_1: ; %atomicrmw.start
5209; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5210; SI-NEXT:    s_waitcnt vmcnt(0)
5211; SI-NEXT:    v_max_u32_e32 v3, v4, v2
5212; SI-NEXT:    s_waitcnt expcnt(0)
5213; SI-NEXT:    v_mov_b32_e32 v6, v4
5214; SI-NEXT:    v_mov_b32_e32 v5, v3
5215; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
5216; SI-NEXT:    s_waitcnt vmcnt(0)
5217; SI-NEXT:    buffer_wbinvl1
5218; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
5219; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5220; SI-NEXT:    v_mov_b32_e32 v4, v5
5221; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5222; SI-NEXT:    s_cbranch_execnz .LBB97_1
5223; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5224; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5225; SI-NEXT:    s_waitcnt expcnt(0)
5226; SI-NEXT:    s_setpc_b64 s[30:31]
5227;
5228; VI-LABEL: global_atomic_umax_i32_noret:
5229; VI:       ; %bb.0:
5230; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5231; VI-NEXT:    flat_load_dword v4, v[0:1]
5232; VI-NEXT:    s_mov_b64 s[4:5], 0
5233; VI-NEXT:  .LBB97_1: ; %atomicrmw.start
5234; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5235; VI-NEXT:    s_waitcnt vmcnt(0)
5236; VI-NEXT:    v_max_u32_e32 v3, v4, v2
5237; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5238; VI-NEXT:    s_waitcnt vmcnt(0)
5239; VI-NEXT:    buffer_wbinvl1_vol
5240; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5241; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5242; VI-NEXT:    v_mov_b32_e32 v4, v3
5243; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5244; VI-NEXT:    s_cbranch_execnz .LBB97_1
5245; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5246; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5247; VI-NEXT:    s_setpc_b64 s[30:31]
5248;
5249; GFX9-LABEL: global_atomic_umax_i32_noret:
5250; GFX9:       ; %bb.0:
5251; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5252; GFX9-NEXT:    global_load_dword v4, v[0:1], off
5253; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5254; GFX9-NEXT:  .LBB97_1: ; %atomicrmw.start
5255; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5256; GFX9-NEXT:    s_waitcnt vmcnt(0)
5257; GFX9-NEXT:    v_max_u32_e32 v3, v4, v2
5258; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
5259; GFX9-NEXT:    s_waitcnt vmcnt(0)
5260; GFX9-NEXT:    buffer_wbinvl1_vol
5261; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5262; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5263; GFX9-NEXT:    v_mov_b32_e32 v4, v3
5264; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5265; GFX9-NEXT:    s_cbranch_execnz .LBB97_1
5266; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5267; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5268; GFX9-NEXT:    s_setpc_b64 s[30:31]
5269  %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
5270  ret void
5271}
5272
; Test: same as the no-return umax case, but the atomic targets %out advanced
; by 4 i32 elements (16 bytes). The default-target and gfx900 checks expect the
; 16-byte displacement to be folded into the memory instructions (offset:16);
; the tonga checks expect it to be added to the address with
; v_add_u32/v_addc_u32 before the flat load and cmpswap.
; The check lines are autogenerated; regenerate them instead of editing by hand.
5273define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
5274; SI-LABEL: global_atomic_umax_i32_noret_offset:
5275; SI:       ; %bb.0:
5276; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5277; SI-NEXT:    s_mov_b32 s6, 0
5278; SI-NEXT:    s_mov_b32 s7, 0xf000
5279; SI-NEXT:    s_mov_b32 s4, s6
5280; SI-NEXT:    s_mov_b32 s5, s6
5281; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
5282; SI-NEXT:    s_mov_b64 s[8:9], 0
5283; SI-NEXT:  .LBB98_1: ; %atomicrmw.start
5284; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5285; SI-NEXT:    s_waitcnt vmcnt(0)
5286; SI-NEXT:    v_max_u32_e32 v3, v4, v2
5287; SI-NEXT:    s_waitcnt expcnt(0)
5288; SI-NEXT:    v_mov_b32_e32 v6, v4
5289; SI-NEXT:    v_mov_b32_e32 v5, v3
5290; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
5291; SI-NEXT:    s_waitcnt vmcnt(0)
5292; SI-NEXT:    buffer_wbinvl1
5293; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
5294; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5295; SI-NEXT:    v_mov_b32_e32 v4, v5
5296; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5297; SI-NEXT:    s_cbranch_execnz .LBB98_1
5298; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5299; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5300; SI-NEXT:    s_waitcnt expcnt(0)
5301; SI-NEXT:    s_setpc_b64 s[30:31]
5302;
5303; VI-LABEL: global_atomic_umax_i32_noret_offset:
5304; VI:       ; %bb.0:
5305; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5306; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
5307; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5308; VI-NEXT:    flat_load_dword v4, v[0:1]
5309; VI-NEXT:    s_mov_b64 s[4:5], 0
5310; VI-NEXT:  .LBB98_1: ; %atomicrmw.start
5311; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5312; VI-NEXT:    s_waitcnt vmcnt(0)
5313; VI-NEXT:    v_max_u32_e32 v3, v4, v2
5314; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5315; VI-NEXT:    s_waitcnt vmcnt(0)
5316; VI-NEXT:    buffer_wbinvl1_vol
5317; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5318; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5319; VI-NEXT:    v_mov_b32_e32 v4, v3
5320; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5321; VI-NEXT:    s_cbranch_execnz .LBB98_1
5322; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5323; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5324; VI-NEXT:    s_setpc_b64 s[30:31]
5325;
5326; GFX9-LABEL: global_atomic_umax_i32_noret_offset:
5327; GFX9:       ; %bb.0:
5328; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5329; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
5330; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5331; GFX9-NEXT:  .LBB98_1: ; %atomicrmw.start
5332; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5333; GFX9-NEXT:    s_waitcnt vmcnt(0)
5334; GFX9-NEXT:    v_max_u32_e32 v3, v4, v2
5335; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
5336; GFX9-NEXT:    s_waitcnt vmcnt(0)
5337; GFX9-NEXT:    buffer_wbinvl1_vol
5338; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5339; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5340; GFX9-NEXT:    v_mov_b32_e32 v4, v3
5341; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5342; GFX9-NEXT:    s_cbranch_execnz .LBB98_1
5343; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5344; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5345; GFX9-NEXT:    s_setpc_b64 s[30:31]
5346  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5347  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5348  ret void
5349}
5350
; Test: seq_cst `atomicrmw umax` on a global (addrspace(1)) pointer passed in
; VGPRs, returning the value produced by the atomicrmw. The expected code is
; the same compare-and-swap loop as the no-return variant; after the loop the
; last value returned by the cmpswap is moved into v0 before the
; s_setpc_b64 return.
; The check lines are autogenerated; regenerate them instead of editing by hand.
5351define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
5352; SI-LABEL: global_atomic_umax_i32_ret:
5353; SI:       ; %bb.0:
5354; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5355; SI-NEXT:    s_mov_b32 s6, 0
5356; SI-NEXT:    s_mov_b32 s7, 0xf000
5357; SI-NEXT:    s_mov_b32 s4, s6
5358; SI-NEXT:    s_mov_b32 s5, s6
5359; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
5360; SI-NEXT:    s_mov_b64 s[8:9], 0
5361; SI-NEXT:  .LBB99_1: ; %atomicrmw.start
5362; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5363; SI-NEXT:    s_waitcnt vmcnt(0)
5364; SI-NEXT:    v_mov_b32_e32 v5, v3
5365; SI-NEXT:    s_waitcnt expcnt(0)
5366; SI-NEXT:    v_max_u32_e32 v4, v5, v2
5367; SI-NEXT:    v_mov_b32_e32 v3, v4
5368; SI-NEXT:    v_mov_b32_e32 v4, v5
5369; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
5370; SI-NEXT:    s_waitcnt vmcnt(0)
5371; SI-NEXT:    buffer_wbinvl1
5372; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
5373; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5374; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5375; SI-NEXT:    s_cbranch_execnz .LBB99_1
5376; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5377; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5378; SI-NEXT:    v_mov_b32_e32 v0, v3
5379; SI-NEXT:    s_waitcnt expcnt(0)
5380; SI-NEXT:    s_setpc_b64 s[30:31]
5381;
5382; VI-LABEL: global_atomic_umax_i32_ret:
5383; VI:       ; %bb.0:
5384; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5385; VI-NEXT:    flat_load_dword v3, v[0:1]
5386; VI-NEXT:    s_mov_b64 s[4:5], 0
5387; VI-NEXT:  .LBB99_1: ; %atomicrmw.start
5388; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5389; VI-NEXT:    s_waitcnt vmcnt(0)
5390; VI-NEXT:    v_mov_b32_e32 v4, v3
5391; VI-NEXT:    v_max_u32_e32 v3, v4, v2
5392; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5393; VI-NEXT:    s_waitcnt vmcnt(0)
5394; VI-NEXT:    buffer_wbinvl1_vol
5395; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5396; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5397; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5398; VI-NEXT:    s_cbranch_execnz .LBB99_1
5399; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5400; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5401; VI-NEXT:    v_mov_b32_e32 v0, v3
5402; VI-NEXT:    s_setpc_b64 s[30:31]
5403;
5404; GFX9-LABEL: global_atomic_umax_i32_ret:
5405; GFX9:       ; %bb.0:
5406; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5407; GFX9-NEXT:    global_load_dword v3, v[0:1], off
5408; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5409; GFX9-NEXT:  .LBB99_1: ; %atomicrmw.start
5410; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5411; GFX9-NEXT:    s_waitcnt vmcnt(0)
5412; GFX9-NEXT:    v_mov_b32_e32 v4, v3
5413; GFX9-NEXT:    v_max_u32_e32 v3, v4, v2
5414; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
5415; GFX9-NEXT:    s_waitcnt vmcnt(0)
5416; GFX9-NEXT:    buffer_wbinvl1_vol
5417; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5418; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5419; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5420; GFX9-NEXT:    s_cbranch_execnz .LBB99_1
5421; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5422; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5423; GFX9-NEXT:    v_mov_b32_e32 v0, v3
5424; GFX9-NEXT:    s_setpc_b64 s[30:31]
5425  %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
5426  ret i32 %result
5427}
5428
; Test: returning variant of the umax test with the atomic applied at %out
; advanced by 4 i32 elements (16 bytes). The default-target and gfx900 checks
; expect offset:16 on the memory instructions and a final move of the loop
; result into v0. The tonga checks expect the address to be computed into
; v[3:4] with v_add_u32/v_addc_u32, with the loop result already held in v0,
; so no final move appears there.
; The check lines are autogenerated; regenerate them instead of editing by hand.
5429define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
5430; SI-LABEL: global_atomic_umax_i32_ret_offset:
5431; SI:       ; %bb.0:
5432; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5433; SI-NEXT:    s_mov_b32 s6, 0
5434; SI-NEXT:    s_mov_b32 s7, 0xf000
5435; SI-NEXT:    s_mov_b32 s4, s6
5436; SI-NEXT:    s_mov_b32 s5, s6
5437; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
5438; SI-NEXT:    s_mov_b64 s[8:9], 0
5439; SI-NEXT:  .LBB100_1: ; %atomicrmw.start
5440; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5441; SI-NEXT:    s_waitcnt vmcnt(0)
5442; SI-NEXT:    v_mov_b32_e32 v5, v3
5443; SI-NEXT:    s_waitcnt expcnt(0)
5444; SI-NEXT:    v_max_u32_e32 v4, v5, v2
5445; SI-NEXT:    v_mov_b32_e32 v3, v4
5446; SI-NEXT:    v_mov_b32_e32 v4, v5
5447; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
5448; SI-NEXT:    s_waitcnt vmcnt(0)
5449; SI-NEXT:    buffer_wbinvl1
5450; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
5451; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5452; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5453; SI-NEXT:    s_cbranch_execnz .LBB100_1
5454; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5455; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5456; SI-NEXT:    v_mov_b32_e32 v0, v3
5457; SI-NEXT:    s_waitcnt expcnt(0)
5458; SI-NEXT:    s_setpc_b64 s[30:31]
5459;
5460; VI-LABEL: global_atomic_umax_i32_ret_offset:
5461; VI:       ; %bb.0:
5462; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5463; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
5464; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
5465; VI-NEXT:    flat_load_dword v0, v[3:4]
5466; VI-NEXT:    s_mov_b64 s[4:5], 0
5467; VI-NEXT:  .LBB100_1: ; %atomicrmw.start
5468; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5469; VI-NEXT:    s_waitcnt vmcnt(0)
5470; VI-NEXT:    v_mov_b32_e32 v1, v0
5471; VI-NEXT:    v_max_u32_e32 v0, v1, v2
5472; VI-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5473; VI-NEXT:    s_waitcnt vmcnt(0)
5474; VI-NEXT:    buffer_wbinvl1_vol
5475; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
5476; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5477; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5478; VI-NEXT:    s_cbranch_execnz .LBB100_1
5479; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5480; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5481; VI-NEXT:    s_setpc_b64 s[30:31]
5482;
5483; GFX9-LABEL: global_atomic_umax_i32_ret_offset:
5484; GFX9:       ; %bb.0:
5485; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5486; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
5487; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5488; GFX9-NEXT:  .LBB100_1: ; %atomicrmw.start
5489; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5490; GFX9-NEXT:    s_waitcnt vmcnt(0)
5491; GFX9-NEXT:    v_mov_b32_e32 v4, v3
5492; GFX9-NEXT:    v_max_u32_e32 v3, v4, v2
5493; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
5494; GFX9-NEXT:    s_waitcnt vmcnt(0)
5495; GFX9-NEXT:    buffer_wbinvl1_vol
5496; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
5497; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5498; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5499; GFX9-NEXT:    s_cbranch_execnz .LBB100_1
5500; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5501; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5502; GFX9-NEXT:    v_mov_b32_e32 v0, v3
5503; GFX9-NEXT:    s_setpc_b64 s[30:31]
5504  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5505  %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5506  ret i32 %result
5507}
5508
; Test: no-return seq_cst `atomicrmw umax` with the amdgpu_gfx calling
; convention and both arguments marked inreg. In the expected output the
; pointer is read from s[4:5] and %in from s6. The expected code is still a
; compare-and-swap loop around v_max_u32.
; In the default-target output s6 and s7 are first saved into lanes 0 and 1 of
; v4 (v4 itself is stored to the stack at s32 and reloaded at the end), %in is
; copied to s34, and s6/s7 are then overwritten with -1 and 0xf000 to complete
; the s[4:7] operand of the buffer instructions; v_readlane restores them
; after the loop.
; The check lines are autogenerated; regenerate them instead of editing by hand.
5509define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
5510; SI-LABEL: global_atomic_umax_i32_noret_scalar:
5511; SI:       ; %bb.0:
5512; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5513; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
5514; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
5515; SI-NEXT:    s_mov_b64 exec, s[34:35]
5516; SI-NEXT:    s_waitcnt expcnt(0)
5517; SI-NEXT:    v_writelane_b32 v4, s6, 0
5518; SI-NEXT:    v_writelane_b32 v4, s7, 1
5519; SI-NEXT:    s_mov_b32 s34, s6
5520; SI-NEXT:    s_mov_b32 s7, 0xf000
5521; SI-NEXT:    s_mov_b32 s6, -1
5522; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
5523; SI-NEXT:    s_mov_b64 s[36:37], 0
5524; SI-NEXT:  .LBB101_1: ; %atomicrmw.start
5525; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5526; SI-NEXT:    s_waitcnt vmcnt(0)
5527; SI-NEXT:    v_max_u32_e32 v0, s34, v1
5528; SI-NEXT:    s_waitcnt expcnt(0)
5529; SI-NEXT:    v_mov_b32_e32 v3, v1
5530; SI-NEXT:    v_mov_b32_e32 v2, v0
5531; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
5532; SI-NEXT:    s_waitcnt vmcnt(0)
5533; SI-NEXT:    buffer_wbinvl1
5534; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
5535; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
5536; SI-NEXT:    v_mov_b32_e32 v1, v2
5537; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
5538; SI-NEXT:    s_cbranch_execnz .LBB101_1
5539; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5540; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
5541; SI-NEXT:    v_readlane_b32 s7, v4, 1
5542; SI-NEXT:    v_readlane_b32 s6, v4, 0
5543; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
5544; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
5545; SI-NEXT:    s_mov_b64 exec, s[34:35]
5546; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5547; SI-NEXT:    s_setpc_b64 s[30:31]
5548;
5549; VI-LABEL: global_atomic_umax_i32_noret_scalar:
5550; VI:       ; %bb.0:
5551; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5552; VI-NEXT:    v_mov_b32_e32 v0, s4
5553; VI-NEXT:    v_mov_b32_e32 v1, s5
5554; VI-NEXT:    flat_load_dword v3, v[0:1]
5555; VI-NEXT:    s_mov_b64 s[34:35], 0
5556; VI-NEXT:  .LBB101_1: ; %atomicrmw.start
5557; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5558; VI-NEXT:    s_waitcnt vmcnt(0)
5559; VI-NEXT:    v_max_u32_e32 v2, s6, v3
5560; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5561; VI-NEXT:    s_waitcnt vmcnt(0)
5562; VI-NEXT:    buffer_wbinvl1_vol
5563; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5564; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5565; VI-NEXT:    v_mov_b32_e32 v3, v2
5566; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5567; VI-NEXT:    s_cbranch_execnz .LBB101_1
5568; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5569; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
5570; VI-NEXT:    s_setpc_b64 s[30:31]
5571;
5572; GFX9-LABEL: global_atomic_umax_i32_noret_scalar:
5573; GFX9:       ; %bb.0:
5574; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5575; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5576; GFX9-NEXT:    global_load_dword v1, v2, s[4:5]
5577; GFX9-NEXT:    s_mov_b64 s[34:35], 0
5578; GFX9-NEXT:  .LBB101_1: ; %atomicrmw.start
5579; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5580; GFX9-NEXT:    s_waitcnt vmcnt(0)
5581; GFX9-NEXT:    v_max_u32_e32 v0, s6, v1
5582; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
5583; GFX9-NEXT:    s_waitcnt vmcnt(0)
5584; GFX9-NEXT:    buffer_wbinvl1_vol
5585; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
5586; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5587; GFX9-NEXT:    v_mov_b32_e32 v1, v0
5588; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5589; GFX9-NEXT:    s_cbranch_execnz .LBB101_1
5590; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5591; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
5592; GFX9-NEXT:    s_setpc_b64 s[30:31]
5593  %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
5594  ret void
5595}
5596
; Test: no-return seq_cst `atomicrmw umax` with amdgpu_gfx/inreg arguments,
; applied at %out advanced by 4 i32 elements (16 bytes). The default-target
; and gfx900 checks expect offset:16 on the load and cmpswap; the tonga checks
; expect the 16 to be added with s_add_u32/s_addc_u32 into s[34:35], which is
; copied to v[0:1] for the flat instructions and then reused as the loop's
; exec-mask accumulator.
; As in the non-offset scalar test, the default-target output saves s6/s7 in
; lanes of v4 before overwriting them for the s[4:7] buffer operand.
; The check lines are autogenerated; regenerate them instead of editing by hand.
5597define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
5598; SI-LABEL: global_atomic_umax_i32_noret_offset_scalar:
5599; SI:       ; %bb.0:
5600; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5601; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
5602; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
5603; SI-NEXT:    s_mov_b64 exec, s[34:35]
5604; SI-NEXT:    s_waitcnt expcnt(0)
5605; SI-NEXT:    v_writelane_b32 v4, s6, 0
5606; SI-NEXT:    v_writelane_b32 v4, s7, 1
5607; SI-NEXT:    s_mov_b32 s34, s6
5608; SI-NEXT:    s_mov_b32 s7, 0xf000
5609; SI-NEXT:    s_mov_b32 s6, -1
5610; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
5611; SI-NEXT:    s_mov_b64 s[36:37], 0
5612; SI-NEXT:  .LBB102_1: ; %atomicrmw.start
5613; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5614; SI-NEXT:    s_waitcnt vmcnt(0)
5615; SI-NEXT:    v_max_u32_e32 v0, s34, v1
5616; SI-NEXT:    s_waitcnt expcnt(0)
5617; SI-NEXT:    v_mov_b32_e32 v3, v1
5618; SI-NEXT:    v_mov_b32_e32 v2, v0
5619; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
5620; SI-NEXT:    s_waitcnt vmcnt(0)
5621; SI-NEXT:    buffer_wbinvl1
5622; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
5623; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
5624; SI-NEXT:    v_mov_b32_e32 v1, v2
5625; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
5626; SI-NEXT:    s_cbranch_execnz .LBB102_1
5627; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5628; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
5629; SI-NEXT:    v_readlane_b32 s7, v4, 1
5630; SI-NEXT:    v_readlane_b32 s6, v4, 0
5631; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
5632; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
5633; SI-NEXT:    s_mov_b64 exec, s[34:35]
5634; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5635; SI-NEXT:    s_setpc_b64 s[30:31]
5636;
5637; VI-LABEL: global_atomic_umax_i32_noret_offset_scalar:
5638; VI:       ; %bb.0:
5639; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5640; VI-NEXT:    s_add_u32 s34, s4, 16
5641; VI-NEXT:    s_addc_u32 s35, s5, 0
5642; VI-NEXT:    v_mov_b32_e32 v0, s34
5643; VI-NEXT:    v_mov_b32_e32 v1, s35
5644; VI-NEXT:    flat_load_dword v3, v[0:1]
5645; VI-NEXT:    s_mov_b64 s[34:35], 0
5646; VI-NEXT:  .LBB102_1: ; %atomicrmw.start
5647; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5648; VI-NEXT:    s_waitcnt vmcnt(0)
5649; VI-NEXT:    v_max_u32_e32 v2, s6, v3
5650; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5651; VI-NEXT:    s_waitcnt vmcnt(0)
5652; VI-NEXT:    buffer_wbinvl1_vol
5653; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5654; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5655; VI-NEXT:    v_mov_b32_e32 v3, v2
5656; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5657; VI-NEXT:    s_cbranch_execnz .LBB102_1
5658; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5659; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
5660; VI-NEXT:    s_setpc_b64 s[30:31]
5661;
5662; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar:
5663; GFX9:       ; %bb.0:
5664; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5665; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5666; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:16
5667; GFX9-NEXT:    s_mov_b64 s[34:35], 0
5668; GFX9-NEXT:  .LBB102_1: ; %atomicrmw.start
5669; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5670; GFX9-NEXT:    s_waitcnt vmcnt(0)
5671; GFX9-NEXT:    v_max_u32_e32 v0, s6, v1
5672; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
5673; GFX9-NEXT:    s_waitcnt vmcnt(0)
5674; GFX9-NEXT:    buffer_wbinvl1_vol
5675; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
5676; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5677; GFX9-NEXT:    v_mov_b32_e32 v1, v0
5678; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5679; GFX9-NEXT:    s_cbranch_execnz .LBB102_1
5680; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5681; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
5682; GFX9-NEXT:    s_setpc_b64 s[30:31]
5683  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5684  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5685  ret void
5686}
5687
; Test: returning seq_cst `atomicrmw umax` with amdgpu_gfx/inreg arguments
; (pointer read from s[4:5], %in from s6 in the expected output). The expected
; compare-and-swap loop leaves the value returned by the last cmpswap in v0,
; so no extra move precedes the s_setpc_b64 return.
; In the default-target output s6/s7 are saved in lanes of v3 (v3 is itself
; stored to the stack at s32 and reloaded at the end) before being overwritten
; for the s[4:7] operand of the buffer instructions.
; The check lines are autogenerated; regenerate them instead of editing by hand.
5688define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
5689; SI-LABEL: global_atomic_umax_i32_ret_scalar:
5690; SI:       ; %bb.0:
5691; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5692; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
5693; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
5694; SI-NEXT:    s_mov_b64 exec, s[34:35]
5695; SI-NEXT:    s_waitcnt expcnt(0)
5696; SI-NEXT:    v_writelane_b32 v3, s6, 0
5697; SI-NEXT:    v_writelane_b32 v3, s7, 1
5698; SI-NEXT:    s_mov_b32 s34, s6
5699; SI-NEXT:    s_mov_b32 s7, 0xf000
5700; SI-NEXT:    s_mov_b32 s6, -1
5701; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
5702; SI-NEXT:    s_mov_b64 s[36:37], 0
5703; SI-NEXT:  .LBB103_1: ; %atomicrmw.start
5704; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5705; SI-NEXT:    s_waitcnt vmcnt(0)
5706; SI-NEXT:    v_mov_b32_e32 v2, v0
5707; SI-NEXT:    s_waitcnt expcnt(0)
5708; SI-NEXT:    v_max_u32_e32 v1, s34, v2
5709; SI-NEXT:    v_mov_b32_e32 v0, v1
5710; SI-NEXT:    v_mov_b32_e32 v1, v2
5711; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
5712; SI-NEXT:    s_waitcnt vmcnt(0)
5713; SI-NEXT:    buffer_wbinvl1
5714; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
5715; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
5716; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
5717; SI-NEXT:    s_cbranch_execnz .LBB103_1
5718; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5719; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
5720; SI-NEXT:    v_readlane_b32 s7, v3, 1
5721; SI-NEXT:    v_readlane_b32 s6, v3, 0
5722; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
5723; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
5724; SI-NEXT:    s_mov_b64 exec, s[34:35]
5725; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5726; SI-NEXT:    s_setpc_b64 s[30:31]
5727;
5728; VI-LABEL: global_atomic_umax_i32_ret_scalar:
5729; VI:       ; %bb.0:
5730; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5731; VI-NEXT:    v_mov_b32_e32 v0, s4
5732; VI-NEXT:    v_mov_b32_e32 v1, s5
5733; VI-NEXT:    flat_load_dword v0, v[0:1]
5734; VI-NEXT:    v_mov_b32_e32 v1, s4
5735; VI-NEXT:    s_mov_b64 s[34:35], 0
5736; VI-NEXT:    v_mov_b32_e32 v2, s5
5737; VI-NEXT:  .LBB103_1: ; %atomicrmw.start
5738; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5739; VI-NEXT:    s_waitcnt vmcnt(0)
5740; VI-NEXT:    v_mov_b32_e32 v4, v0
5741; VI-NEXT:    v_max_u32_e32 v3, s6, v4
5742; VI-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5743; VI-NEXT:    s_waitcnt vmcnt(0)
5744; VI-NEXT:    buffer_wbinvl1_vol
5745; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
5746; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5747; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5748; VI-NEXT:    s_cbranch_execnz .LBB103_1
5749; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5750; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
5751; VI-NEXT:    s_setpc_b64 s[30:31]
5752;
5753; GFX9-LABEL: global_atomic_umax_i32_ret_scalar:
5754; GFX9:       ; %bb.0:
5755; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5756; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5757; GFX9-NEXT:    global_load_dword v0, v1, s[4:5]
5758; GFX9-NEXT:    s_mov_b64 s[34:35], 0
5759; GFX9-NEXT:  .LBB103_1: ; %atomicrmw.start
5760; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5761; GFX9-NEXT:    s_waitcnt vmcnt(0)
5762; GFX9-NEXT:    v_mov_b32_e32 v3, v0
5763; GFX9-NEXT:    v_max_u32_e32 v2, s6, v3
5764; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
5765; GFX9-NEXT:    s_waitcnt vmcnt(0)
5766; GFX9-NEXT:    buffer_wbinvl1_vol
5767; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
5768; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5769; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5770; GFX9-NEXT:    s_cbranch_execnz .LBB103_1
5771; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5772; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
5773; GFX9-NEXT:    s_setpc_b64 s[30:31]
5774  %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
5775  ret i32 %result
5776}
5777
; Test: returning seq_cst `atomicrmw umax` with amdgpu_gfx/inreg arguments,
; applied at %out advanced by 4 i32 elements (16 bytes). The default-target
; and gfx900 checks expect offset:16 on the load and cmpswap; the tonga checks
; expect the 16 to be added with s_add_u32/s_addc_u32 and the sum copied into
; v[1:2] for the flat instructions. In all three outputs the loop result is
; left in v0 for the return.
; In the default-target output s6/s7 are saved in lanes of v3 before being
; overwritten for the s[4:7] operand of the buffer instructions.
; The check lines are autogenerated; regenerate them instead of editing by hand.
5778define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
5779; SI-LABEL: global_atomic_umax_i32_ret_offset_scalar:
5780; SI:       ; %bb.0:
5781; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5782; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
5783; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
5784; SI-NEXT:    s_mov_b64 exec, s[34:35]
5785; SI-NEXT:    s_waitcnt expcnt(0)
5786; SI-NEXT:    v_writelane_b32 v3, s6, 0
5787; SI-NEXT:    v_writelane_b32 v3, s7, 1
5788; SI-NEXT:    s_mov_b32 s34, s6
5789; SI-NEXT:    s_mov_b32 s7, 0xf000
5790; SI-NEXT:    s_mov_b32 s6, -1
5791; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
5792; SI-NEXT:    s_mov_b64 s[36:37], 0
5793; SI-NEXT:  .LBB104_1: ; %atomicrmw.start
5794; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5795; SI-NEXT:    s_waitcnt vmcnt(0)
5796; SI-NEXT:    v_mov_b32_e32 v2, v0
5797; SI-NEXT:    s_waitcnt expcnt(0)
5798; SI-NEXT:    v_max_u32_e32 v1, s34, v2
5799; SI-NEXT:    v_mov_b32_e32 v0, v1
5800; SI-NEXT:    v_mov_b32_e32 v1, v2
5801; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5802; SI-NEXT:    s_waitcnt vmcnt(0)
5803; SI-NEXT:    buffer_wbinvl1
5804; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
5805; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
5806; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
5807; SI-NEXT:    s_cbranch_execnz .LBB104_1
5808; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5809; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
5810; SI-NEXT:    v_readlane_b32 s7, v3, 1
5811; SI-NEXT:    v_readlane_b32 s6, v3, 0
5812; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
5813; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
5814; SI-NEXT:    s_mov_b64 exec, s[34:35]
5815; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5816; SI-NEXT:    s_setpc_b64 s[30:31]
5817;
5818; VI-LABEL: global_atomic_umax_i32_ret_offset_scalar:
5819; VI:       ; %bb.0:
5820; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5821; VI-NEXT:    s_add_u32 s34, s4, 16
5822; VI-NEXT:    s_addc_u32 s35, s5, 0
5823; VI-NEXT:    v_mov_b32_e32 v1, s34
5824; VI-NEXT:    v_mov_b32_e32 v2, s35
5825; VI-NEXT:    flat_load_dword v0, v[1:2]
5826; VI-NEXT:    s_mov_b64 s[34:35], 0
5827; VI-NEXT:  .LBB104_1: ; %atomicrmw.start
5828; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5829; VI-NEXT:    s_waitcnt vmcnt(0)
5830; VI-NEXT:    v_mov_b32_e32 v4, v0
5831; VI-NEXT:    v_max_u32_e32 v3, s6, v4
5832; VI-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5833; VI-NEXT:    s_waitcnt vmcnt(0)
5834; VI-NEXT:    buffer_wbinvl1_vol
5835; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
5836; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5837; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5838; VI-NEXT:    s_cbranch_execnz .LBB104_1
5839; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5840; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
5841; VI-NEXT:    s_setpc_b64 s[30:31]
5842;
5843; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar:
5844; GFX9:       ; %bb.0:
5845; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5846; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5847; GFX9-NEXT:    global_load_dword v0, v1, s[4:5] offset:16
5848; GFX9-NEXT:    s_mov_b64 s[34:35], 0
5849; GFX9-NEXT:  .LBB104_1: ; %atomicrmw.start
5850; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5851; GFX9-NEXT:    s_waitcnt vmcnt(0)
5852; GFX9-NEXT:    v_mov_b32_e32 v3, v0
5853; GFX9-NEXT:    v_max_u32_e32 v2, s6, v3
5854; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
5855; GFX9-NEXT:    s_waitcnt vmcnt(0)
5856; GFX9-NEXT:    buffer_wbinvl1_vol
5857; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
5858; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5859; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5860; GFX9-NEXT:    s_cbranch_execnz .LBB104_1
5861; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5862; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
5863; GFX9-NEXT:    s_setpc_b64 s[30:31]
5864  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5865  %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5866  ret i32 %result
5867}
5868
; Test: amdgpu_kernel entry point doing a no-return seq_cst `atomicrmw umax`
; at %out[%index + 4]. In the expected output the kernel arguments are fetched
; with s_load_dwordx4 from s[4:5]; %index is sign-extended (s_ashr_i32 by 31),
; shifted left by 2 and added to %out, and the remaining 4-element step shows
; up as a 16-byte displacement (offset:16, or an explicit s_add_u32/s_addc_u32
; by 16 in the tonga output). The initial value for the compare-and-swap loop
; is read with a scalar s_load_dword, and the function ends in s_endpgm.
; The check lines are autogenerated; regenerate them instead of editing by hand.
5869define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) {
5870; SI-LABEL: atomic_umax_i32_addr64_offset:
5871; SI:       ; %bb.0: ; %entry
5872; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5873; SI-NEXT:    s_waitcnt lgkmcnt(0)
5874; SI-NEXT:    s_ashr_i32 s5, s3, 31
5875; SI-NEXT:    s_mov_b32 s4, s3
5876; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5877; SI-NEXT:    s_add_u32 s4, s0, s4
5878; SI-NEXT:    s_addc_u32 s5, s1, s5
5879; SI-NEXT:    s_load_dword s3, s[4:5], 0x4
5880; SI-NEXT:    s_mov_b64 s[0:1], 0
5881; SI-NEXT:    s_mov_b32 s7, 0xf000
5882; SI-NEXT:    s_waitcnt lgkmcnt(0)
5883; SI-NEXT:    v_mov_b32_e32 v1, s3
5884; SI-NEXT:    s_mov_b32 s6, -1
5885; SI-NEXT:  .LBB105_1: ; %atomicrmw.start
5886; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5887; SI-NEXT:    v_max_u32_e32 v0, s2, v1
5888; SI-NEXT:    s_waitcnt expcnt(0)
5889; SI-NEXT:    v_mov_b32_e32 v3, v1
5890; SI-NEXT:    v_mov_b32_e32 v2, v0
5891; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
5892; SI-NEXT:    s_waitcnt vmcnt(0)
5893; SI-NEXT:    buffer_wbinvl1
5894; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
5895; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5896; SI-NEXT:    v_mov_b32_e32 v1, v2
5897; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5898; SI-NEXT:    s_cbranch_execnz .LBB105_1
5899; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5900; SI-NEXT:    s_endpgm
5901;
5902; VI-LABEL: atomic_umax_i32_addr64_offset:
5903; VI:       ; %bb.0: ; %entry
5904; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5905; VI-NEXT:    s_waitcnt lgkmcnt(0)
5906; VI-NEXT:    s_ashr_i32 s5, s3, 31
5907; VI-NEXT:    s_mov_b32 s4, s3
5908; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5909; VI-NEXT:    s_add_u32 s4, s0, s4
5910; VI-NEXT:    s_addc_u32 s5, s1, s5
5911; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
5912; VI-NEXT:    s_add_u32 s4, s4, 16
5913; VI-NEXT:    s_addc_u32 s5, s5, 0
5914; VI-NEXT:    v_mov_b32_e32 v0, s4
5915; VI-NEXT:    s_mov_b64 s[0:1], 0
5916; VI-NEXT:    s_waitcnt lgkmcnt(0)
5917; VI-NEXT:    v_mov_b32_e32 v3, s3
5918; VI-NEXT:    v_mov_b32_e32 v1, s5
5919; VI-NEXT:  .LBB105_1: ; %atomicrmw.start
5920; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5921; VI-NEXT:    v_max_u32_e32 v2, s2, v3
5922; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5923; VI-NEXT:    s_waitcnt vmcnt(0)
5924; VI-NEXT:    buffer_wbinvl1_vol
5925; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
5926; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5927; VI-NEXT:    v_mov_b32_e32 v3, v2
5928; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5929; VI-NEXT:    s_cbranch_execnz .LBB105_1
5930; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5931; VI-NEXT:    s_endpgm
5932;
5933; GFX9-LABEL: atomic_umax_i32_addr64_offset:
5934; GFX9:       ; %bb.0: ; %entry
5935; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5936; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5937; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5938; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
5939; GFX9-NEXT:    s_mov_b32 s4, s3
5940; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5941; GFX9-NEXT:    s_add_u32 s0, s0, s4
5942; GFX9-NEXT:    s_addc_u32 s1, s1, s5
5943; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
5944; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5945; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5946; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5947; GFX9-NEXT:  .LBB105_1: ; %atomicrmw.start
5948; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5949; GFX9-NEXT:    v_max_u32_e32 v0, s2, v1
5950; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
5951; GFX9-NEXT:    s_waitcnt vmcnt(0)
5952; GFX9-NEXT:    buffer_wbinvl1_vol
5953; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
5954; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5955; GFX9-NEXT:    v_mov_b32_e32 v1, v0
5956; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5957; GFX9-NEXT:    s_cbranch_execnz .LBB105_1
5958; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5959; GFX9-NEXT:    s_endpgm
5960entry:
5961  %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
5962  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
5963  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5964  ret void
5965}
5966
; Kernel test: seq_cst `atomicrmw umax` on the global (addrspace(1)) i32 at
; %out[%index + 4], with the atomicrmw result stored to %out2.
; The i32 4 GEP step is a 16-byte displacement, which the expectations below
; show as offset:16 (buffer/global forms) or as an explicit add of 16 (flat form).
; All three run lines expect a v_max_u32 + atomic cmpswap retry loop rather
; than a single umax atomic instruction, followed by a store of the result.
5967define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
5968; SI-LABEL: atomic_umax_i32_ret_addr64_offset:
5969; SI:       ; %bb.0: ; %entry
5970; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
5971; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5972; SI-NEXT:    s_waitcnt lgkmcnt(0)
5973; SI-NEXT:    s_ashr_i32 s5, s9, 31
5974; SI-NEXT:    s_mov_b32 s4, s9
5975; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
5976; SI-NEXT:    s_add_u32 s4, s0, s4
5977; SI-NEXT:    s_addc_u32 s5, s1, s5
5978; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
5979; SI-NEXT:    s_mov_b64 s[0:1], 0
5980; SI-NEXT:    s_mov_b32 s7, 0xf000
5981; SI-NEXT:    s_waitcnt lgkmcnt(0)
5982; SI-NEXT:    v_mov_b32_e32 v1, s6
5983; SI-NEXT:    s_mov_b32 s6, -1
5984; SI-NEXT:  .LBB106_1: ; %atomicrmw.start
5985; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5986; SI-NEXT:    v_max_u32_e32 v0, s8, v1
5987; SI-NEXT:    s_waitcnt expcnt(0)
5988; SI-NEXT:    v_mov_b32_e32 v3, v1
5989; SI-NEXT:    v_mov_b32_e32 v2, v0
5990; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
5991; SI-NEXT:    s_waitcnt vmcnt(0)
5992; SI-NEXT:    buffer_wbinvl1
5993; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
5994; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5995; SI-NEXT:    v_mov_b32_e32 v1, v2
5996; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5997; SI-NEXT:    s_cbranch_execnz .LBB106_1
5998; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5999; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
6000; SI-NEXT:    s_mov_b32 s7, 0xf000
6001; SI-NEXT:    s_mov_b32 s6, -1
6002; SI-NEXT:    s_mov_b32 s4, s2
6003; SI-NEXT:    s_mov_b32 s5, s3
6004; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
6005; SI-NEXT:    s_endpgm
6006;
6007; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
6008; VI:       ; %bb.0: ; %entry
6009; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6010; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6011; VI-NEXT:    s_waitcnt lgkmcnt(0)
6012; VI-NEXT:    s_ashr_i32 s5, s7, 31
6013; VI-NEXT:    s_mov_b32 s4, s7
6014; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6015; VI-NEXT:    s_add_u32 s4, s0, s4
6016; VI-NEXT:    s_addc_u32 s5, s1, s5
6017; VI-NEXT:    s_load_dword s7, s[4:5], 0x10
6018; VI-NEXT:    s_add_u32 s4, s4, 16
6019; VI-NEXT:    s_addc_u32 s5, s5, 0
6020; VI-NEXT:    v_mov_b32_e32 v0, s4
6021; VI-NEXT:    s_mov_b64 s[0:1], 0
6022; VI-NEXT:    s_waitcnt lgkmcnt(0)
6023; VI-NEXT:    v_mov_b32_e32 v2, s7
6024; VI-NEXT:    v_mov_b32_e32 v1, s5
6025; VI-NEXT:  .LBB106_1: ; %atomicrmw.start
6026; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6027; VI-NEXT:    v_mov_b32_e32 v3, v2
6028; VI-NEXT:    v_max_u32_e32 v2, s6, v3
6029; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6030; VI-NEXT:    s_waitcnt vmcnt(0)
6031; VI-NEXT:    buffer_wbinvl1_vol
6032; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6033; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6034; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6035; VI-NEXT:    s_cbranch_execnz .LBB106_1
6036; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6037; VI-NEXT:    s_or_b64 exec, exec, s[0:1]
6038; VI-NEXT:    v_mov_b32_e32 v0, s2
6039; VI-NEXT:    v_mov_b32_e32 v1, s3
6040; VI-NEXT:    flat_store_dword v[0:1], v2
6041; VI-NEXT:    s_endpgm
6042;
6043; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset:
6044; GFX9:       ; %bb.0: ; %entry
6045; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6046; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6047; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6048; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6049; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
6050; GFX9-NEXT:    s_mov_b32 s4, s7
6051; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6052; GFX9-NEXT:    s_add_u32 s0, s0, s4
6053; GFX9-NEXT:    s_addc_u32 s1, s1, s5
6054; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x10
6055; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6056; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6057; GFX9-NEXT:    v_mov_b32_e32 v0, s7
6058; GFX9-NEXT:  .LBB106_1: ; %atomicrmw.start
6059; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6060; GFX9-NEXT:    v_mov_b32_e32 v3, v0
6061; GFX9-NEXT:    v_max_u32_e32 v2, s6, v3
6062; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc
6063; GFX9-NEXT:    s_waitcnt vmcnt(0)
6064; GFX9-NEXT:    buffer_wbinvl1_vol
6065; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
6066; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6067; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6068; GFX9-NEXT:    s_cbranch_execnz .LBB106_1
6069; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6070; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6071; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6072; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
6073; GFX9-NEXT:    s_endpgm
6074entry:
6075  %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
6076  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
6077  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
6078  store i32 %tmp0, ptr addrspace(1) %out2
6079  ret void
6080}
6081
; Kernel test: seq_cst `atomicrmw umax` on the global (addrspace(1)) i32 at
; %out[%index] (no additional constant offset), with the atomicrmw result
; stored to %out2.
; All three run lines expect a v_max_u32 + atomic cmpswap retry loop whose
; cmpswap carries no offset modifier, followed by a store of the result.
6082define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
6083; SI-LABEL: atomic_umax_i32_ret_addr64:
6084; SI:       ; %bb.0: ; %entry
6085; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
6086; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6087; SI-NEXT:    s_waitcnt lgkmcnt(0)
6088; SI-NEXT:    s_ashr_i32 s5, s9, 31
6089; SI-NEXT:    s_mov_b32 s4, s9
6090; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6091; SI-NEXT:    s_add_u32 s4, s0, s4
6092; SI-NEXT:    s_addc_u32 s5, s1, s5
6093; SI-NEXT:    s_load_dword s6, s[4:5], 0x0
6094; SI-NEXT:    s_mov_b64 s[0:1], 0
6095; SI-NEXT:    s_mov_b32 s7, 0xf000
6096; SI-NEXT:    s_waitcnt lgkmcnt(0)
6097; SI-NEXT:    v_mov_b32_e32 v1, s6
6098; SI-NEXT:    s_mov_b32 s6, -1
6099; SI-NEXT:  .LBB107_1: ; %atomicrmw.start
6100; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6101; SI-NEXT:    v_max_u32_e32 v0, s8, v1
6102; SI-NEXT:    s_waitcnt expcnt(0)
6103; SI-NEXT:    v_mov_b32_e32 v3, v1
6104; SI-NEXT:    v_mov_b32_e32 v2, v0
6105; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
6106; SI-NEXT:    s_waitcnt vmcnt(0)
6107; SI-NEXT:    buffer_wbinvl1
6108; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
6109; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6110; SI-NEXT:    v_mov_b32_e32 v1, v2
6111; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6112; SI-NEXT:    s_cbranch_execnz .LBB107_1
6113; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6114; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
6115; SI-NEXT:    s_mov_b32 s7, 0xf000
6116; SI-NEXT:    s_mov_b32 s6, -1
6117; SI-NEXT:    s_mov_b32 s4, s2
6118; SI-NEXT:    s_mov_b32 s5, s3
6119; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
6120; SI-NEXT:    s_endpgm
6121;
6122; VI-LABEL: atomic_umax_i32_ret_addr64:
6123; VI:       ; %bb.0: ; %entry
6124; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6125; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6126; VI-NEXT:    s_waitcnt lgkmcnt(0)
6127; VI-NEXT:    s_ashr_i32 s5, s7, 31
6128; VI-NEXT:    s_mov_b32 s4, s7
6129; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6130; VI-NEXT:    s_add_u32 s4, s0, s4
6131; VI-NEXT:    s_addc_u32 s5, s1, s5
6132; VI-NEXT:    s_load_dword s7, s[4:5], 0x0
6133; VI-NEXT:    v_mov_b32_e32 v0, s4
6134; VI-NEXT:    s_mov_b64 s[0:1], 0
6135; VI-NEXT:    v_mov_b32_e32 v1, s5
6136; VI-NEXT:    s_waitcnt lgkmcnt(0)
6137; VI-NEXT:    v_mov_b32_e32 v2, s7
6138; VI-NEXT:  .LBB107_1: ; %atomicrmw.start
6139; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6140; VI-NEXT:    v_mov_b32_e32 v3, v2
6141; VI-NEXT:    v_max_u32_e32 v2, s6, v3
6142; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6143; VI-NEXT:    s_waitcnt vmcnt(0)
6144; VI-NEXT:    buffer_wbinvl1_vol
6145; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6146; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6147; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6148; VI-NEXT:    s_cbranch_execnz .LBB107_1
6149; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6150; VI-NEXT:    s_or_b64 exec, exec, s[0:1]
6151; VI-NEXT:    v_mov_b32_e32 v0, s2
6152; VI-NEXT:    v_mov_b32_e32 v1, s3
6153; VI-NEXT:    flat_store_dword v[0:1], v2
6154; VI-NEXT:    s_endpgm
6155;
6156; GFX9-LABEL: atomic_umax_i32_ret_addr64:
6157; GFX9:       ; %bb.0: ; %entry
6158; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6159; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6160; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6161; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6162; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
6163; GFX9-NEXT:    s_mov_b32 s4, s7
6164; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
6165; GFX9-NEXT:    s_add_u32 s0, s0, s4
6166; GFX9-NEXT:    s_addc_u32 s1, s1, s5
6167; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
6168; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6170; GFX9-NEXT:    v_mov_b32_e32 v0, s7
6171; GFX9-NEXT:  .LBB107_1: ; %atomicrmw.start
6172; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6173; GFX9-NEXT:    v_mov_b32_e32 v3, v0
6174; GFX9-NEXT:    v_max_u32_e32 v2, s6, v3
6175; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
6176; GFX9-NEXT:    s_waitcnt vmcnt(0)
6177; GFX9-NEXT:    buffer_wbinvl1_vol
6178; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
6179; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6180; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6181; GFX9-NEXT:    s_cbranch_execnz .LBB107_1
6182; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6183; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6184; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6185; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
6186; GFX9-NEXT:    s_endpgm
6187entry:
6188  %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
6189  %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
6190  store i32 %tmp0, ptr addrspace(1) %out2
6191  ret void
6192}
6193
; Function test: seq_cst `atomicrmw umax` on the global (addrspace(1)) i32 at
; %out + 4 elements (16 bytes), tagged with !amdgpu.no.remote.memory metadata;
; the atomicrmw result is unused.
; With that metadata present, the expectations for all three run lines still
; show an initial load followed by a v_max_u32 + atomic cmpswap retry loop.
6194define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
6195; SI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
6196; SI:       ; %bb.0:
6197; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6198; SI-NEXT:    s_mov_b32 s6, 0
6199; SI-NEXT:    s_mov_b32 s7, 0xf000
6200; SI-NEXT:    s_mov_b32 s4, s6
6201; SI-NEXT:    s_mov_b32 s5, s6
6202; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
6203; SI-NEXT:    s_mov_b64 s[8:9], 0
6204; SI-NEXT:  .LBB108_1: ; %atomicrmw.start
6205; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6206; SI-NEXT:    s_waitcnt vmcnt(0)
6207; SI-NEXT:    v_max_u32_e32 v3, v4, v2
6208; SI-NEXT:    s_waitcnt expcnt(0)
6209; SI-NEXT:    v_mov_b32_e32 v6, v4
6210; SI-NEXT:    v_mov_b32_e32 v5, v3
6211; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
6212; SI-NEXT:    s_waitcnt vmcnt(0)
6213; SI-NEXT:    buffer_wbinvl1
6214; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
6215; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6216; SI-NEXT:    v_mov_b32_e32 v4, v5
6217; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6218; SI-NEXT:    s_cbranch_execnz .LBB108_1
6219; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6220; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
6221; SI-NEXT:    s_waitcnt expcnt(0)
6222; SI-NEXT:    s_setpc_b64 s[30:31]
6223;
6224; VI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
6225; VI:       ; %bb.0:
6226; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6227; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
6228; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6229; VI-NEXT:    flat_load_dword v4, v[0:1]
6230; VI-NEXT:    s_mov_b64 s[4:5], 0
6231; VI-NEXT:  .LBB108_1: ; %atomicrmw.start
6232; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6233; VI-NEXT:    s_waitcnt vmcnt(0)
6234; VI-NEXT:    v_max_u32_e32 v3, v4, v2
6235; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6236; VI-NEXT:    s_waitcnt vmcnt(0)
6237; VI-NEXT:    buffer_wbinvl1_vol
6238; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6239; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6240; VI-NEXT:    v_mov_b32_e32 v4, v3
6241; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6242; VI-NEXT:    s_cbranch_execnz .LBB108_1
6243; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6244; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
6245; VI-NEXT:    s_setpc_b64 s[30:31]
6246;
6247; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
6248; GFX9:       ; %bb.0:
6249; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6250; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
6251; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6252; GFX9-NEXT:  .LBB108_1: ; %atomicrmw.start
6253; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6254; GFX9-NEXT:    s_waitcnt vmcnt(0)
6255; GFX9-NEXT:    v_max_u32_e32 v3, v4, v2
6256; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6257; GFX9-NEXT:    s_waitcnt vmcnt(0)
6258; GFX9-NEXT:    buffer_wbinvl1_vol
6259; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6260; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6261; GFX9-NEXT:    v_mov_b32_e32 v4, v3
6262; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6263; GFX9-NEXT:    s_cbranch_execnz .LBB108_1
6264; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6265; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6266; GFX9-NEXT:    s_setpc_b64 s[30:31]
6267  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
6268  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
6269  ret void
6270}
6271
; Function test: seq_cst `atomicrmw umax` on the global (addrspace(1)) i32 at
; %out + 4 elements (16 bytes), tagged with !amdgpu.no.remote.memory metadata;
; the atomicrmw result is returned to the caller.
; With that metadata present, the expectations for all three run lines still
; show an initial load followed by a v_max_u32 + atomic cmpswap retry loop,
; with the cmpswap result ending up in v0 before the return.
6272define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
6273; SI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
6274; SI:       ; %bb.0:
6275; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6276; SI-NEXT:    s_mov_b32 s6, 0
6277; SI-NEXT:    s_mov_b32 s7, 0xf000
6278; SI-NEXT:    s_mov_b32 s4, s6
6279; SI-NEXT:    s_mov_b32 s5, s6
6280; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
6281; SI-NEXT:    s_mov_b64 s[8:9], 0
6282; SI-NEXT:  .LBB109_1: ; %atomicrmw.start
6283; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6284; SI-NEXT:    s_waitcnt vmcnt(0)
6285; SI-NEXT:    v_mov_b32_e32 v5, v3
6286; SI-NEXT:    s_waitcnt expcnt(0)
6287; SI-NEXT:    v_max_u32_e32 v4, v5, v2
6288; SI-NEXT:    v_mov_b32_e32 v3, v4
6289; SI-NEXT:    v_mov_b32_e32 v4, v5
6290; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
6291; SI-NEXT:    s_waitcnt vmcnt(0)
6292; SI-NEXT:    buffer_wbinvl1
6293; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
6294; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6295; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6296; SI-NEXT:    s_cbranch_execnz .LBB109_1
6297; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6298; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
6299; SI-NEXT:    v_mov_b32_e32 v0, v3
6300; SI-NEXT:    s_waitcnt expcnt(0)
6301; SI-NEXT:    s_setpc_b64 s[30:31]
6302;
6303; VI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
6304; VI:       ; %bb.0:
6305; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6306; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
6307; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
6308; VI-NEXT:    flat_load_dword v0, v[3:4]
6309; VI-NEXT:    s_mov_b64 s[4:5], 0
6310; VI-NEXT:  .LBB109_1: ; %atomicrmw.start
6311; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6312; VI-NEXT:    s_waitcnt vmcnt(0)
6313; VI-NEXT:    v_mov_b32_e32 v1, v0
6314; VI-NEXT:    v_max_u32_e32 v0, v1, v2
6315; VI-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6316; VI-NEXT:    s_waitcnt vmcnt(0)
6317; VI-NEXT:    buffer_wbinvl1_vol
6318; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
6319; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6320; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6321; VI-NEXT:    s_cbranch_execnz .LBB109_1
6322; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6323; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
6324; VI-NEXT:    s_setpc_b64 s[30:31]
6325;
6326; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
6327; GFX9:       ; %bb.0:
6328; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6329; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
6330; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6331; GFX9-NEXT:  .LBB109_1: ; %atomicrmw.start
6332; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6333; GFX9-NEXT:    s_waitcnt vmcnt(0)
6334; GFX9-NEXT:    v_mov_b32_e32 v4, v3
6335; GFX9-NEXT:    v_max_u32_e32 v3, v4, v2
6336; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6337; GFX9-NEXT:    s_waitcnt vmcnt(0)
6338; GFX9-NEXT:    buffer_wbinvl1_vol
6339; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6340; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6341; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6342; GFX9-NEXT:    s_cbranch_execnz .LBB109_1
6343; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6344; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6345; GFX9-NEXT:    v_mov_b32_e32 v0, v3
6346; GFX9-NEXT:    s_setpc_b64 s[30:31]
6347  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
6348  %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
6349  ret i32 %result
6350}
6351
6352; ---------------------------------------------------------------------
6353; atomicrmw umin
6354; ---------------------------------------------------------------------
6355
; Function test: seq_cst `atomicrmw umin` directly on the global
; (addrspace(1)) pointer %ptr with operand %in; the atomicrmw result is unused.
; All three run lines expect an initial load followed by a v_min_u32 + atomic
; cmpswap retry loop (no offset modifier) rather than a single umin atomic.
6356define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
6357; SI-LABEL: global_atomic_umin_i32_noret:
6358; SI:       ; %bb.0:
6359; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6360; SI-NEXT:    s_mov_b32 s6, 0
6361; SI-NEXT:    s_mov_b32 s7, 0xf000
6362; SI-NEXT:    s_mov_b32 s4, s6
6363; SI-NEXT:    s_mov_b32 s5, s6
6364; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
6365; SI-NEXT:    s_mov_b64 s[8:9], 0
6366; SI-NEXT:  .LBB110_1: ; %atomicrmw.start
6367; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6368; SI-NEXT:    s_waitcnt vmcnt(0)
6369; SI-NEXT:    v_min_u32_e32 v3, v4, v2
6370; SI-NEXT:    s_waitcnt expcnt(0)
6371; SI-NEXT:    v_mov_b32_e32 v6, v4
6372; SI-NEXT:    v_mov_b32_e32 v5, v3
6373; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
6374; SI-NEXT:    s_waitcnt vmcnt(0)
6375; SI-NEXT:    buffer_wbinvl1
6376; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
6377; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6378; SI-NEXT:    v_mov_b32_e32 v4, v5
6379; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6380; SI-NEXT:    s_cbranch_execnz .LBB110_1
6381; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6382; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
6383; SI-NEXT:    s_waitcnt expcnt(0)
6384; SI-NEXT:    s_setpc_b64 s[30:31]
6385;
6386; VI-LABEL: global_atomic_umin_i32_noret:
6387; VI:       ; %bb.0:
6388; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6389; VI-NEXT:    flat_load_dword v4, v[0:1]
6390; VI-NEXT:    s_mov_b64 s[4:5], 0
6391; VI-NEXT:  .LBB110_1: ; %atomicrmw.start
6392; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6393; VI-NEXT:    s_waitcnt vmcnt(0)
6394; VI-NEXT:    v_min_u32_e32 v3, v4, v2
6395; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6396; VI-NEXT:    s_waitcnt vmcnt(0)
6397; VI-NEXT:    buffer_wbinvl1_vol
6398; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6399; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6400; VI-NEXT:    v_mov_b32_e32 v4, v3
6401; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6402; VI-NEXT:    s_cbranch_execnz .LBB110_1
6403; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6404; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
6405; VI-NEXT:    s_setpc_b64 s[30:31]
6406;
6407; GFX9-LABEL: global_atomic_umin_i32_noret:
6408; GFX9:       ; %bb.0:
6409; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6410; GFX9-NEXT:    global_load_dword v4, v[0:1], off
6411; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6412; GFX9-NEXT:  .LBB110_1: ; %atomicrmw.start
6413; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6414; GFX9-NEXT:    s_waitcnt vmcnt(0)
6415; GFX9-NEXT:    v_min_u32_e32 v3, v4, v2
6416; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
6417; GFX9-NEXT:    s_waitcnt vmcnt(0)
6418; GFX9-NEXT:    buffer_wbinvl1_vol
6419; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6420; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6421; GFX9-NEXT:    v_mov_b32_e32 v4, v3
6422; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6423; GFX9-NEXT:    s_cbranch_execnz .LBB110_1
6424; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6425; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6426; GFX9-NEXT:    s_setpc_b64 s[30:31]
6427  %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
6428  ret void
6429}
6430
; Function test: seq_cst `atomicrmw umin` on the global (addrspace(1)) i32 at
; %out + 4 elements (16 bytes); the atomicrmw result is unused.
; The 16-byte displacement appears in the expectations as offset:16
; (buffer/global forms) or as an explicit 64-bit add of 16 (flat form).
; All three run lines expect an initial load followed by a v_min_u32 + atomic
; cmpswap retry loop rather than a single umin atomic.
6431define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
6432; SI-LABEL: global_atomic_umin_i32_noret_offset:
6433; SI:       ; %bb.0:
6434; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6435; SI-NEXT:    s_mov_b32 s6, 0
6436; SI-NEXT:    s_mov_b32 s7, 0xf000
6437; SI-NEXT:    s_mov_b32 s4, s6
6438; SI-NEXT:    s_mov_b32 s5, s6
6439; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
6440; SI-NEXT:    s_mov_b64 s[8:9], 0
6441; SI-NEXT:  .LBB111_1: ; %atomicrmw.start
6442; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6443; SI-NEXT:    s_waitcnt vmcnt(0)
6444; SI-NEXT:    v_min_u32_e32 v3, v4, v2
6445; SI-NEXT:    s_waitcnt expcnt(0)
6446; SI-NEXT:    v_mov_b32_e32 v6, v4
6447; SI-NEXT:    v_mov_b32_e32 v5, v3
6448; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
6449; SI-NEXT:    s_waitcnt vmcnt(0)
6450; SI-NEXT:    buffer_wbinvl1
6451; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
6452; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6453; SI-NEXT:    v_mov_b32_e32 v4, v5
6454; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6455; SI-NEXT:    s_cbranch_execnz .LBB111_1
6456; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6457; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
6458; SI-NEXT:    s_waitcnt expcnt(0)
6459; SI-NEXT:    s_setpc_b64 s[30:31]
6460;
6461; VI-LABEL: global_atomic_umin_i32_noret_offset:
6462; VI:       ; %bb.0:
6463; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6464; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
6465; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6466; VI-NEXT:    flat_load_dword v4, v[0:1]
6467; VI-NEXT:    s_mov_b64 s[4:5], 0
6468; VI-NEXT:  .LBB111_1: ; %atomicrmw.start
6469; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6470; VI-NEXT:    s_waitcnt vmcnt(0)
6471; VI-NEXT:    v_min_u32_e32 v3, v4, v2
6472; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6473; VI-NEXT:    s_waitcnt vmcnt(0)
6474; VI-NEXT:    buffer_wbinvl1_vol
6475; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6476; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6477; VI-NEXT:    v_mov_b32_e32 v4, v3
6478; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6479; VI-NEXT:    s_cbranch_execnz .LBB111_1
6480; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6481; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
6482; VI-NEXT:    s_setpc_b64 s[30:31]
6483;
6484; GFX9-LABEL: global_atomic_umin_i32_noret_offset:
6485; GFX9:       ; %bb.0:
6486; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6487; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
6488; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6489; GFX9-NEXT:  .LBB111_1: ; %atomicrmw.start
6490; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6491; GFX9-NEXT:    s_waitcnt vmcnt(0)
6492; GFX9-NEXT:    v_min_u32_e32 v3, v4, v2
6493; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6494; GFX9-NEXT:    s_waitcnt vmcnt(0)
6495; GFX9-NEXT:    buffer_wbinvl1_vol
6496; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6497; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6498; GFX9-NEXT:    v_mov_b32_e32 v4, v3
6499; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6500; GFX9-NEXT:    s_cbranch_execnz .LBB111_1
6501; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6502; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6503; GFX9-NEXT:    s_setpc_b64 s[30:31]
6504  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6505  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
6506  ret void
6507}
6508
; Function test: seq_cst `atomicrmw umin` directly on the global
; (addrspace(1)) pointer %ptr with operand %in; the atomicrmw result is
; returned to the caller.
; All three run lines expect an initial load followed by a v_min_u32 + atomic
; cmpswap retry loop (no offset modifier), then a copy of the cmpswap result
; into v0 before the return.
6509define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
6510; SI-LABEL: global_atomic_umin_i32_ret:
6511; SI:       ; %bb.0:
6512; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6513; SI-NEXT:    s_mov_b32 s6, 0
6514; SI-NEXT:    s_mov_b32 s7, 0xf000
6515; SI-NEXT:    s_mov_b32 s4, s6
6516; SI-NEXT:    s_mov_b32 s5, s6
6517; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
6518; SI-NEXT:    s_mov_b64 s[8:9], 0
6519; SI-NEXT:  .LBB112_1: ; %atomicrmw.start
6520; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6521; SI-NEXT:    s_waitcnt vmcnt(0)
6522; SI-NEXT:    v_mov_b32_e32 v5, v3
6523; SI-NEXT:    s_waitcnt expcnt(0)
6524; SI-NEXT:    v_min_u32_e32 v4, v5, v2
6525; SI-NEXT:    v_mov_b32_e32 v3, v4
6526; SI-NEXT:    v_mov_b32_e32 v4, v5
6527; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
6528; SI-NEXT:    s_waitcnt vmcnt(0)
6529; SI-NEXT:    buffer_wbinvl1
6530; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
6531; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6532; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6533; SI-NEXT:    s_cbranch_execnz .LBB112_1
6534; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6535; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
6536; SI-NEXT:    v_mov_b32_e32 v0, v3
6537; SI-NEXT:    s_waitcnt expcnt(0)
6538; SI-NEXT:    s_setpc_b64 s[30:31]
6539;
6540; VI-LABEL: global_atomic_umin_i32_ret:
6541; VI:       ; %bb.0:
6542; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6543; VI-NEXT:    flat_load_dword v3, v[0:1]
6544; VI-NEXT:    s_mov_b64 s[4:5], 0
6545; VI-NEXT:  .LBB112_1: ; %atomicrmw.start
6546; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6547; VI-NEXT:    s_waitcnt vmcnt(0)
6548; VI-NEXT:    v_mov_b32_e32 v4, v3
6549; VI-NEXT:    v_min_u32_e32 v3, v4, v2
6550; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6551; VI-NEXT:    s_waitcnt vmcnt(0)
6552; VI-NEXT:    buffer_wbinvl1_vol
6553; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6554; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6555; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6556; VI-NEXT:    s_cbranch_execnz .LBB112_1
6557; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6558; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
6559; VI-NEXT:    v_mov_b32_e32 v0, v3
6560; VI-NEXT:    s_setpc_b64 s[30:31]
6561;
6562; GFX9-LABEL: global_atomic_umin_i32_ret:
6563; GFX9:       ; %bb.0:
6564; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6565; GFX9-NEXT:    global_load_dword v3, v[0:1], off
6566; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6567; GFX9-NEXT:  .LBB112_1: ; %atomicrmw.start
6568; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6569; GFX9-NEXT:    s_waitcnt vmcnt(0)
6570; GFX9-NEXT:    v_mov_b32_e32 v4, v3
6571; GFX9-NEXT:    v_min_u32_e32 v3, v4, v2
6572; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
6573; GFX9-NEXT:    s_waitcnt vmcnt(0)
6574; GFX9-NEXT:    buffer_wbinvl1_vol
6575; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6576; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6577; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6578; GFX9-NEXT:    s_cbranch_execnz .LBB112_1
6579; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6580; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6581; GFX9-NEXT:    v_mov_b32_e32 v0, v3
6582; GFX9-NEXT:    s_setpc_b64 s[30:31]
6583  %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
6584  ret i32 %result
6585}
6586
; Function test: seq_cst `atomicrmw umin` on the global (addrspace(1)) i32 at
; %out + 4 elements (16 bytes); the atomicrmw result is returned to the caller.
; The 16-byte displacement appears in the expectations as offset:16
; (buffer/global forms) or as an explicit 64-bit add of 16 (flat form).
; All three run lines expect an initial load followed by a v_min_u32 + atomic
; cmpswap retry loop, with the cmpswap result ending up in v0 before the return.
6587define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
6588; SI-LABEL: global_atomic_umin_i32_ret_offset:
6589; SI:       ; %bb.0:
6590; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6591; SI-NEXT:    s_mov_b32 s6, 0
6592; SI-NEXT:    s_mov_b32 s7, 0xf000
6593; SI-NEXT:    s_mov_b32 s4, s6
6594; SI-NEXT:    s_mov_b32 s5, s6
6595; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
6596; SI-NEXT:    s_mov_b64 s[8:9], 0
6597; SI-NEXT:  .LBB113_1: ; %atomicrmw.start
6598; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6599; SI-NEXT:    s_waitcnt vmcnt(0)
6600; SI-NEXT:    v_mov_b32_e32 v5, v3
6601; SI-NEXT:    s_waitcnt expcnt(0)
6602; SI-NEXT:    v_min_u32_e32 v4, v5, v2
6603; SI-NEXT:    v_mov_b32_e32 v3, v4
6604; SI-NEXT:    v_mov_b32_e32 v4, v5
6605; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
6606; SI-NEXT:    s_waitcnt vmcnt(0)
6607; SI-NEXT:    buffer_wbinvl1
6608; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
6609; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6610; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6611; SI-NEXT:    s_cbranch_execnz .LBB113_1
6612; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6613; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
6614; SI-NEXT:    v_mov_b32_e32 v0, v3
6615; SI-NEXT:    s_waitcnt expcnt(0)
6616; SI-NEXT:    s_setpc_b64 s[30:31]
6617;
6618; VI-LABEL: global_atomic_umin_i32_ret_offset:
6619; VI:       ; %bb.0:
6620; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6621; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
6622; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
6623; VI-NEXT:    flat_load_dword v0, v[3:4]
6624; VI-NEXT:    s_mov_b64 s[4:5], 0
6625; VI-NEXT:  .LBB113_1: ; %atomicrmw.start
6626; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6627; VI-NEXT:    s_waitcnt vmcnt(0)
6628; VI-NEXT:    v_mov_b32_e32 v1, v0
6629; VI-NEXT:    v_min_u32_e32 v0, v1, v2
6630; VI-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6631; VI-NEXT:    s_waitcnt vmcnt(0)
6632; VI-NEXT:    buffer_wbinvl1_vol
6633; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
6634; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6635; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6636; VI-NEXT:    s_cbranch_execnz .LBB113_1
6637; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6638; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
6639; VI-NEXT:    s_setpc_b64 s[30:31]
6640;
6641; GFX9-LABEL: global_atomic_umin_i32_ret_offset:
6642; GFX9:       ; %bb.0:
6643; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6644; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
6645; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6646; GFX9-NEXT:  .LBB113_1: ; %atomicrmw.start
6647; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6648; GFX9-NEXT:    s_waitcnt vmcnt(0)
6649; GFX9-NEXT:    v_mov_b32_e32 v4, v3
6650; GFX9-NEXT:    v_min_u32_e32 v3, v4, v2
6651; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6652; GFX9-NEXT:    s_waitcnt vmcnt(0)
6653; GFX9-NEXT:    buffer_wbinvl1_vol
6654; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
6655; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6656; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6657; GFX9-NEXT:    s_cbranch_execnz .LBB113_1
6658; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6659; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6660; GFX9-NEXT:    v_mov_b32_e32 v0, v3
6661; GFX9-NEXT:    s_setpc_b64 s[30:31]
6662  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6663  %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
6664  ret i32 %result
6665}
6666
; Unsigned-min atomicrmw (seq_cst, no syncscope) whose result is unused, with
; the amdgpu_gfx calling convention and both arguments marked inreg.
; The checks for all three run lines expect a compare-and-swap retry loop
; (v_min_u32 followed by an atomic cmpswap) instead of a single umin atomic.
; The SI checks also save s6/s7 into lanes of v4 with v_writelane and restore
; them with v_readlane, because s6/s7 are overwritten to build the s[4:7]
; operand of the buffer instructions; the incoming value is kept in s34.
6667define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
6668; SI-LABEL: global_atomic_umin_i32_noret_scalar:
6669; SI:       ; %bb.0:
6670; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6671; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6672; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
6673; SI-NEXT:    s_mov_b64 exec, s[34:35]
6674; SI-NEXT:    s_waitcnt expcnt(0)
6675; SI-NEXT:    v_writelane_b32 v4, s6, 0
6676; SI-NEXT:    v_writelane_b32 v4, s7, 1
6677; SI-NEXT:    s_mov_b32 s34, s6
6678; SI-NEXT:    s_mov_b32 s7, 0xf000
6679; SI-NEXT:    s_mov_b32 s6, -1
6680; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
6681; SI-NEXT:    s_mov_b64 s[36:37], 0
6682; SI-NEXT:  .LBB114_1: ; %atomicrmw.start
6683; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6684; SI-NEXT:    s_waitcnt vmcnt(0)
6685; SI-NEXT:    v_min_u32_e32 v0, s34, v1
6686; SI-NEXT:    s_waitcnt expcnt(0)
6687; SI-NEXT:    v_mov_b32_e32 v3, v1
6688; SI-NEXT:    v_mov_b32_e32 v2, v0
6689; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
6690; SI-NEXT:    s_waitcnt vmcnt(0)
6691; SI-NEXT:    buffer_wbinvl1
6692; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
6693; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
6694; SI-NEXT:    v_mov_b32_e32 v1, v2
6695; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
6696; SI-NEXT:    s_cbranch_execnz .LBB114_1
6697; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6698; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
6699; SI-NEXT:    v_readlane_b32 s7, v4, 1
6700; SI-NEXT:    v_readlane_b32 s6, v4, 0
6701; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6702; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
6703; SI-NEXT:    s_mov_b64 exec, s[34:35]
6704; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6705; SI-NEXT:    s_setpc_b64 s[30:31]
6706;
6707; VI-LABEL: global_atomic_umin_i32_noret_scalar:
6708; VI:       ; %bb.0:
6709; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6710; VI-NEXT:    v_mov_b32_e32 v0, s4
6711; VI-NEXT:    v_mov_b32_e32 v1, s5
6712; VI-NEXT:    flat_load_dword v3, v[0:1]
6713; VI-NEXT:    s_mov_b64 s[34:35], 0
6714; VI-NEXT:  .LBB114_1: ; %atomicrmw.start
6715; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6716; VI-NEXT:    s_waitcnt vmcnt(0)
6717; VI-NEXT:    v_min_u32_e32 v2, s6, v3
6718; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6719; VI-NEXT:    s_waitcnt vmcnt(0)
6720; VI-NEXT:    buffer_wbinvl1_vol
6721; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6722; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6723; VI-NEXT:    v_mov_b32_e32 v3, v2
6724; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6725; VI-NEXT:    s_cbranch_execnz .LBB114_1
6726; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6727; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
6728; VI-NEXT:    s_setpc_b64 s[30:31]
6729;
6730; GFX9-LABEL: global_atomic_umin_i32_noret_scalar:
6731; GFX9:       ; %bb.0:
6732; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6733; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6734; GFX9-NEXT:    global_load_dword v1, v2, s[4:5]
6735; GFX9-NEXT:    s_mov_b64 s[34:35], 0
6736; GFX9-NEXT:  .LBB114_1: ; %atomicrmw.start
6737; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6738; GFX9-NEXT:    s_waitcnt vmcnt(0)
6739; GFX9-NEXT:    v_min_u32_e32 v0, s6, v1
6740; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
6741; GFX9-NEXT:    s_waitcnt vmcnt(0)
6742; GFX9-NEXT:    buffer_wbinvl1_vol
6743; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
6744; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6745; GFX9-NEXT:    v_mov_b32_e32 v1, v0
6746; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6747; GFX9-NEXT:    s_cbranch_execnz .LBB114_1
6748; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6749; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
6750; GFX9-NEXT:    s_setpc_b64 s[30:31]
6751  %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
6752  ret void
6753}
6754
; Same as the no-return inreg umin case, but the atomic addresses %out plus
; four i32 elements (a 16-byte offset). The SI and GFX9 checks expect that
; offset folded into the memory instructions as offset:16, while the VI checks
; expect it added to the address up front with s_add_u32/s_addc_u32.
; All three still expect a v_min_u32 + cmpswap retry loop.
6755define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
6756; SI-LABEL: global_atomic_umin_i32_noret_offset_scalar:
6757; SI:       ; %bb.0:
6758; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6759; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6760; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
6761; SI-NEXT:    s_mov_b64 exec, s[34:35]
6762; SI-NEXT:    s_waitcnt expcnt(0)
6763; SI-NEXT:    v_writelane_b32 v4, s6, 0
6764; SI-NEXT:    v_writelane_b32 v4, s7, 1
6765; SI-NEXT:    s_mov_b32 s34, s6
6766; SI-NEXT:    s_mov_b32 s7, 0xf000
6767; SI-NEXT:    s_mov_b32 s6, -1
6768; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
6769; SI-NEXT:    s_mov_b64 s[36:37], 0
6770; SI-NEXT:  .LBB115_1: ; %atomicrmw.start
6771; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6772; SI-NEXT:    s_waitcnt vmcnt(0)
6773; SI-NEXT:    v_min_u32_e32 v0, s34, v1
6774; SI-NEXT:    s_waitcnt expcnt(0)
6775; SI-NEXT:    v_mov_b32_e32 v3, v1
6776; SI-NEXT:    v_mov_b32_e32 v2, v0
6777; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
6778; SI-NEXT:    s_waitcnt vmcnt(0)
6779; SI-NEXT:    buffer_wbinvl1
6780; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
6781; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
6782; SI-NEXT:    v_mov_b32_e32 v1, v2
6783; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
6784; SI-NEXT:    s_cbranch_execnz .LBB115_1
6785; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6786; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
6787; SI-NEXT:    v_readlane_b32 s7, v4, 1
6788; SI-NEXT:    v_readlane_b32 s6, v4, 0
6789; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6790; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
6791; SI-NEXT:    s_mov_b64 exec, s[34:35]
6792; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6793; SI-NEXT:    s_setpc_b64 s[30:31]
6794;
6795; VI-LABEL: global_atomic_umin_i32_noret_offset_scalar:
6796; VI:       ; %bb.0:
6797; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6798; VI-NEXT:    s_add_u32 s34, s4, 16
6799; VI-NEXT:    s_addc_u32 s35, s5, 0
6800; VI-NEXT:    v_mov_b32_e32 v0, s34
6801; VI-NEXT:    v_mov_b32_e32 v1, s35
6802; VI-NEXT:    flat_load_dword v3, v[0:1]
6803; VI-NEXT:    s_mov_b64 s[34:35], 0
6804; VI-NEXT:  .LBB115_1: ; %atomicrmw.start
6805; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6806; VI-NEXT:    s_waitcnt vmcnt(0)
6807; VI-NEXT:    v_min_u32_e32 v2, s6, v3
6808; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6809; VI-NEXT:    s_waitcnt vmcnt(0)
6810; VI-NEXT:    buffer_wbinvl1_vol
6811; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
6812; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6813; VI-NEXT:    v_mov_b32_e32 v3, v2
6814; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6815; VI-NEXT:    s_cbranch_execnz .LBB115_1
6816; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6817; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
6818; VI-NEXT:    s_setpc_b64 s[30:31]
6819;
6820; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar:
6821; GFX9:       ; %bb.0:
6822; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6823; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6824; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:16
6825; GFX9-NEXT:    s_mov_b64 s[34:35], 0
6826; GFX9-NEXT:  .LBB115_1: ; %atomicrmw.start
6827; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6828; GFX9-NEXT:    s_waitcnt vmcnt(0)
6829; GFX9-NEXT:    v_min_u32_e32 v0, s6, v1
6830; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
6831; GFX9-NEXT:    s_waitcnt vmcnt(0)
6832; GFX9-NEXT:    buffer_wbinvl1_vol
6833; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
6834; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6835; GFX9-NEXT:    v_mov_b32_e32 v1, v0
6836; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6837; GFX9-NEXT:    s_cbranch_execnz .LBB115_1
6838; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6839; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
6840; GFX9-NEXT:    s_setpc_b64 s[30:31]
6841  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6842  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
6843  ret void
6844}
6845
; Unsigned-min atomicrmw (seq_cst) with inreg arguments under amdgpu_gfx,
; returning the value the atomic produced. The checks expect a v_min_u32 +
; cmpswap retry loop on every run line, with the cmpswap writing its result
; directly to v0 so no extra copy appears before s_setpc_b64.
; As in the no-return variant, the SI checks preserve s6/s7 in lanes of a
; spilled VGPR (v3 here) while s[4:7] is used by the buffer instructions.
6846define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
6847; SI-LABEL: global_atomic_umin_i32_ret_scalar:
6848; SI:       ; %bb.0:
6849; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6850; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6851; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
6852; SI-NEXT:    s_mov_b64 exec, s[34:35]
6853; SI-NEXT:    s_waitcnt expcnt(0)
6854; SI-NEXT:    v_writelane_b32 v3, s6, 0
6855; SI-NEXT:    v_writelane_b32 v3, s7, 1
6856; SI-NEXT:    s_mov_b32 s34, s6
6857; SI-NEXT:    s_mov_b32 s7, 0xf000
6858; SI-NEXT:    s_mov_b32 s6, -1
6859; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
6860; SI-NEXT:    s_mov_b64 s[36:37], 0
6861; SI-NEXT:  .LBB116_1: ; %atomicrmw.start
6862; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6863; SI-NEXT:    s_waitcnt vmcnt(0)
6864; SI-NEXT:    v_mov_b32_e32 v2, v0
6865; SI-NEXT:    s_waitcnt expcnt(0)
6866; SI-NEXT:    v_min_u32_e32 v1, s34, v2
6867; SI-NEXT:    v_mov_b32_e32 v0, v1
6868; SI-NEXT:    v_mov_b32_e32 v1, v2
6869; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
6870; SI-NEXT:    s_waitcnt vmcnt(0)
6871; SI-NEXT:    buffer_wbinvl1
6872; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
6873; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
6874; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
6875; SI-NEXT:    s_cbranch_execnz .LBB116_1
6876; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6877; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
6878; SI-NEXT:    v_readlane_b32 s7, v3, 1
6879; SI-NEXT:    v_readlane_b32 s6, v3, 0
6880; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6881; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
6882; SI-NEXT:    s_mov_b64 exec, s[34:35]
6883; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6884; SI-NEXT:    s_setpc_b64 s[30:31]
6885;
6886; VI-LABEL: global_atomic_umin_i32_ret_scalar:
6887; VI:       ; %bb.0:
6888; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6889; VI-NEXT:    v_mov_b32_e32 v0, s4
6890; VI-NEXT:    v_mov_b32_e32 v1, s5
6891; VI-NEXT:    flat_load_dword v0, v[0:1]
6892; VI-NEXT:    v_mov_b32_e32 v1, s4
6893; VI-NEXT:    s_mov_b64 s[34:35], 0
6894; VI-NEXT:    v_mov_b32_e32 v2, s5
6895; VI-NEXT:  .LBB116_1: ; %atomicrmw.start
6896; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6897; VI-NEXT:    s_waitcnt vmcnt(0)
6898; VI-NEXT:    v_mov_b32_e32 v4, v0
6899; VI-NEXT:    v_min_u32_e32 v3, s6, v4
6900; VI-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6901; VI-NEXT:    s_waitcnt vmcnt(0)
6902; VI-NEXT:    buffer_wbinvl1_vol
6903; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
6904; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6905; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6906; VI-NEXT:    s_cbranch_execnz .LBB116_1
6907; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6908; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
6909; VI-NEXT:    s_setpc_b64 s[30:31]
6910;
6911; GFX9-LABEL: global_atomic_umin_i32_ret_scalar:
6912; GFX9:       ; %bb.0:
6913; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6914; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6915; GFX9-NEXT:    global_load_dword v0, v1, s[4:5]
6916; GFX9-NEXT:    s_mov_b64 s[34:35], 0
6917; GFX9-NEXT:  .LBB116_1: ; %atomicrmw.start
6918; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6919; GFX9-NEXT:    s_waitcnt vmcnt(0)
6920; GFX9-NEXT:    v_mov_b32_e32 v3, v0
6921; GFX9-NEXT:    v_min_u32_e32 v2, s6, v3
6922; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
6923; GFX9-NEXT:    s_waitcnt vmcnt(0)
6924; GFX9-NEXT:    buffer_wbinvl1_vol
6925; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
6926; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6927; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6928; GFX9-NEXT:    s_cbranch_execnz .LBB116_1
6929; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6930; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
6931; GFX9-NEXT:    s_setpc_b64 s[30:31]
6932  %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
6933  ret i32 %result
6934}
6935
; Returning umin atomicrmw with inreg arguments under amdgpu_gfx, applied at
; %out plus four i32 elements (16 bytes). The SI and GFX9 checks expect the
; offset encoded as offset:16 on the load and the cmpswap; the VI checks expect
; an explicit s_add_u32/s_addc_u32 before the flat instructions.
; The atomic's result is expected in v0 at the return on every run line.
6936define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
6937; SI-LABEL: global_atomic_umin_i32_ret_offset_scalar:
6938; SI:       ; %bb.0:
6939; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6940; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6941; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
6942; SI-NEXT:    s_mov_b64 exec, s[34:35]
6943; SI-NEXT:    s_waitcnt expcnt(0)
6944; SI-NEXT:    v_writelane_b32 v3, s6, 0
6945; SI-NEXT:    v_writelane_b32 v3, s7, 1
6946; SI-NEXT:    s_mov_b32 s34, s6
6947; SI-NEXT:    s_mov_b32 s7, 0xf000
6948; SI-NEXT:    s_mov_b32 s6, -1
6949; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
6950; SI-NEXT:    s_mov_b64 s[36:37], 0
6951; SI-NEXT:  .LBB117_1: ; %atomicrmw.start
6952; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6953; SI-NEXT:    s_waitcnt vmcnt(0)
6954; SI-NEXT:    v_mov_b32_e32 v2, v0
6955; SI-NEXT:    s_waitcnt expcnt(0)
6956; SI-NEXT:    v_min_u32_e32 v1, s34, v2
6957; SI-NEXT:    v_mov_b32_e32 v0, v1
6958; SI-NEXT:    v_mov_b32_e32 v1, v2
6959; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6960; SI-NEXT:    s_waitcnt vmcnt(0)
6961; SI-NEXT:    buffer_wbinvl1
6962; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
6963; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
6964; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
6965; SI-NEXT:    s_cbranch_execnz .LBB117_1
6966; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6967; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
6968; SI-NEXT:    v_readlane_b32 s7, v3, 1
6969; SI-NEXT:    v_readlane_b32 s6, v3, 0
6970; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6971; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
6972; SI-NEXT:    s_mov_b64 exec, s[34:35]
6973; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6974; SI-NEXT:    s_setpc_b64 s[30:31]
6975;
6976; VI-LABEL: global_atomic_umin_i32_ret_offset_scalar:
6977; VI:       ; %bb.0:
6978; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6979; VI-NEXT:    s_add_u32 s34, s4, 16
6980; VI-NEXT:    s_addc_u32 s35, s5, 0
6981; VI-NEXT:    v_mov_b32_e32 v1, s34
6982; VI-NEXT:    v_mov_b32_e32 v2, s35
6983; VI-NEXT:    flat_load_dword v0, v[1:2]
6984; VI-NEXT:    s_mov_b64 s[34:35], 0
6985; VI-NEXT:  .LBB117_1: ; %atomicrmw.start
6986; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6987; VI-NEXT:    s_waitcnt vmcnt(0)
6988; VI-NEXT:    v_mov_b32_e32 v4, v0
6989; VI-NEXT:    v_min_u32_e32 v3, s6, v4
6990; VI-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6991; VI-NEXT:    s_waitcnt vmcnt(0)
6992; VI-NEXT:    buffer_wbinvl1_vol
6993; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
6994; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6995; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6996; VI-NEXT:    s_cbranch_execnz .LBB117_1
6997; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6998; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
6999; VI-NEXT:    s_setpc_b64 s[30:31]
7000;
7001; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar:
7002; GFX9:       ; %bb.0:
7003; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7004; GFX9-NEXT:    v_mov_b32_e32 v1, 0
7005; GFX9-NEXT:    global_load_dword v0, v1, s[4:5] offset:16
7006; GFX9-NEXT:    s_mov_b64 s[34:35], 0
7007; GFX9-NEXT:  .LBB117_1: ; %atomicrmw.start
7008; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7009; GFX9-NEXT:    s_waitcnt vmcnt(0)
7010; GFX9-NEXT:    v_mov_b32_e32 v3, v0
7011; GFX9-NEXT:    v_min_u32_e32 v2, s6, v3
7012; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
7013; GFX9-NEXT:    s_waitcnt vmcnt(0)
7014; GFX9-NEXT:    buffer_wbinvl1_vol
7015; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
7016; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7017; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7018; GFX9-NEXT:    s_cbranch_execnz .LBB117_1
7019; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7020; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
7021; GFX9-NEXT:    s_setpc_b64 s[30:31]
7022  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7023  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
7024  ret i32 %result
7025}
7026
; No-return umin atomicrmw at %out plus four i32 elements (16 bytes), using
; the default calling convention and carrying !amdgpu.no.remote.memory
; metadata (node !0 is defined outside this function).
; The checks show that, even with the metadata attached, all three run lines
; still expect a v_min_u32 + cmpswap retry loop. The offset appears as
; offset:16 in the SI and GFX9 checks and as a v_add_u32/v_addc_u32 pair in
; the VI checks.
7027define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
7028; SI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
7029; SI:       ; %bb.0:
7030; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7031; SI-NEXT:    s_mov_b32 s6, 0
7032; SI-NEXT:    s_mov_b32 s7, 0xf000
7033; SI-NEXT:    s_mov_b32 s4, s6
7034; SI-NEXT:    s_mov_b32 s5, s6
7035; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
7036; SI-NEXT:    s_mov_b64 s[8:9], 0
7037; SI-NEXT:  .LBB118_1: ; %atomicrmw.start
7038; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7039; SI-NEXT:    s_waitcnt vmcnt(0)
7040; SI-NEXT:    v_min_u32_e32 v3, v4, v2
7041; SI-NEXT:    s_waitcnt expcnt(0)
7042; SI-NEXT:    v_mov_b32_e32 v6, v4
7043; SI-NEXT:    v_mov_b32_e32 v5, v3
7044; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
7045; SI-NEXT:    s_waitcnt vmcnt(0)
7046; SI-NEXT:    buffer_wbinvl1
7047; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
7048; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7049; SI-NEXT:    v_mov_b32_e32 v4, v5
7050; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7051; SI-NEXT:    s_cbranch_execnz .LBB118_1
7052; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7053; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7054; SI-NEXT:    s_waitcnt expcnt(0)
7055; SI-NEXT:    s_setpc_b64 s[30:31]
7056;
7057; VI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
7058; VI:       ; %bb.0:
7059; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7060; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7061; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7062; VI-NEXT:    flat_load_dword v4, v[0:1]
7063; VI-NEXT:    s_mov_b64 s[4:5], 0
7064; VI-NEXT:  .LBB118_1: ; %atomicrmw.start
7065; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7066; VI-NEXT:    s_waitcnt vmcnt(0)
7067; VI-NEXT:    v_min_u32_e32 v3, v4, v2
7068; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7069; VI-NEXT:    s_waitcnt vmcnt(0)
7070; VI-NEXT:    buffer_wbinvl1_vol
7071; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7072; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7073; VI-NEXT:    v_mov_b32_e32 v4, v3
7074; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7075; VI-NEXT:    s_cbranch_execnz .LBB118_1
7076; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7077; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7078; VI-NEXT:    s_setpc_b64 s[30:31]
7079;
7080; GFX9-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
7081; GFX9:       ; %bb.0:
7082; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7083; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
7084; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7085; GFX9-NEXT:  .LBB118_1: ; %atomicrmw.start
7086; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7087; GFX9-NEXT:    s_waitcnt vmcnt(0)
7088; GFX9-NEXT:    v_min_u32_e32 v3, v4, v2
7089; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
7090; GFX9-NEXT:    s_waitcnt vmcnt(0)
7091; GFX9-NEXT:    buffer_wbinvl1_vol
7092; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7093; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7094; GFX9-NEXT:    v_mov_b32_e32 v4, v3
7095; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7096; GFX9-NEXT:    s_cbranch_execnz .LBB118_1
7097; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7098; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7099; GFX9-NEXT:    s_setpc_b64 s[30:31]
7100  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
7101  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7102  ret void
7103}
7104
; Returning umin atomicrmw at %out plus four i32 elements (16 bytes), using
; the default calling convention and carrying !amdgpu.no.remote.memory
; metadata (node !0 is defined outside this function).
; Every run line still expects a v_min_u32 + cmpswap retry loop. The SI and
; GFX9 checks end with a copy of the loop result from v3 into v0, whereas the
; VI checks have the cmpswap write v0 directly.
7105define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
7106; SI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
7107; SI:       ; %bb.0:
7108; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7109; SI-NEXT:    s_mov_b32 s6, 0
7110; SI-NEXT:    s_mov_b32 s7, 0xf000
7111; SI-NEXT:    s_mov_b32 s4, s6
7112; SI-NEXT:    s_mov_b32 s5, s6
7113; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
7114; SI-NEXT:    s_mov_b64 s[8:9], 0
7115; SI-NEXT:  .LBB119_1: ; %atomicrmw.start
7116; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7117; SI-NEXT:    s_waitcnt vmcnt(0)
7118; SI-NEXT:    v_mov_b32_e32 v5, v3
7119; SI-NEXT:    s_waitcnt expcnt(0)
7120; SI-NEXT:    v_min_u32_e32 v4, v5, v2
7121; SI-NEXT:    v_mov_b32_e32 v3, v4
7122; SI-NEXT:    v_mov_b32_e32 v4, v5
7123; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
7124; SI-NEXT:    s_waitcnt vmcnt(0)
7125; SI-NEXT:    buffer_wbinvl1
7126; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
7127; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7128; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7129; SI-NEXT:    s_cbranch_execnz .LBB119_1
7130; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7131; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7132; SI-NEXT:    v_mov_b32_e32 v0, v3
7133; SI-NEXT:    s_waitcnt expcnt(0)
7134; SI-NEXT:    s_setpc_b64 s[30:31]
7135;
7136; VI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
7137; VI:       ; %bb.0:
7138; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7139; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
7140; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
7141; VI-NEXT:    flat_load_dword v0, v[3:4]
7142; VI-NEXT:    s_mov_b64 s[4:5], 0
7143; VI-NEXT:  .LBB119_1: ; %atomicrmw.start
7144; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7145; VI-NEXT:    s_waitcnt vmcnt(0)
7146; VI-NEXT:    v_mov_b32_e32 v1, v0
7147; VI-NEXT:    v_min_u32_e32 v0, v1, v2
7148; VI-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7149; VI-NEXT:    s_waitcnt vmcnt(0)
7150; VI-NEXT:    buffer_wbinvl1_vol
7151; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
7152; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7153; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7154; VI-NEXT:    s_cbranch_execnz .LBB119_1
7155; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7156; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7157; VI-NEXT:    s_setpc_b64 s[30:31]
7158;
7159; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
7160; GFX9:       ; %bb.0:
7161; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7162; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
7163; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7164; GFX9-NEXT:  .LBB119_1: ; %atomicrmw.start
7165; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7166; GFX9-NEXT:    s_waitcnt vmcnt(0)
7167; GFX9-NEXT:    v_mov_b32_e32 v4, v3
7168; GFX9-NEXT:    v_min_u32_e32 v3, v4, v2
7169; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
7170; GFX9-NEXT:    s_waitcnt vmcnt(0)
7171; GFX9-NEXT:    buffer_wbinvl1_vol
7172; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7173; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7174; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7175; GFX9-NEXT:    s_cbranch_execnz .LBB119_1
7176; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7177; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7178; GFX9-NEXT:    v_mov_b32_e32 v0, v3
7179; GFX9-NEXT:    s_setpc_b64 s[30:31]
7180  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
7181  %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7182  ret i32 %result
7183}
7184
7185; ---------------------------------------------------------------------
7186; atomicrmw min
7187; ---------------------------------------------------------------------
7188
; Signed-min atomicrmw (seq_cst, no syncscope) directly on %ptr with the
; result unused, using the default calling convention.
; The checks for all three run lines expect a compare-and-swap retry loop
; built from v_min_i32 (the signed counterpart of the umin tests) and an
; atomic cmpswap, looping until the value returned by the cmpswap equals the
; previously loaded value.
7189define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
7190; SI-LABEL: global_atomic_min_i32_noret:
7191; SI:       ; %bb.0:
7192; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7193; SI-NEXT:    s_mov_b32 s6, 0
7194; SI-NEXT:    s_mov_b32 s7, 0xf000
7195; SI-NEXT:    s_mov_b32 s4, s6
7196; SI-NEXT:    s_mov_b32 s5, s6
7197; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
7198; SI-NEXT:    s_mov_b64 s[8:9], 0
7199; SI-NEXT:  .LBB120_1: ; %atomicrmw.start
7200; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7201; SI-NEXT:    s_waitcnt vmcnt(0)
7202; SI-NEXT:    v_min_i32_e32 v3, v4, v2
7203; SI-NEXT:    s_waitcnt expcnt(0)
7204; SI-NEXT:    v_mov_b32_e32 v6, v4
7205; SI-NEXT:    v_mov_b32_e32 v5, v3
7206; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
7207; SI-NEXT:    s_waitcnt vmcnt(0)
7208; SI-NEXT:    buffer_wbinvl1
7209; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
7210; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7211; SI-NEXT:    v_mov_b32_e32 v4, v5
7212; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7213; SI-NEXT:    s_cbranch_execnz .LBB120_1
7214; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7215; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7216; SI-NEXT:    s_waitcnt expcnt(0)
7217; SI-NEXT:    s_setpc_b64 s[30:31]
7218;
7219; VI-LABEL: global_atomic_min_i32_noret:
7220; VI:       ; %bb.0:
7221; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7222; VI-NEXT:    flat_load_dword v4, v[0:1]
7223; VI-NEXT:    s_mov_b64 s[4:5], 0
7224; VI-NEXT:  .LBB120_1: ; %atomicrmw.start
7225; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7226; VI-NEXT:    s_waitcnt vmcnt(0)
7227; VI-NEXT:    v_min_i32_e32 v3, v4, v2
7228; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7229; VI-NEXT:    s_waitcnt vmcnt(0)
7230; VI-NEXT:    buffer_wbinvl1_vol
7231; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7232; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7233; VI-NEXT:    v_mov_b32_e32 v4, v3
7234; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7235; VI-NEXT:    s_cbranch_execnz .LBB120_1
7236; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7237; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7238; VI-NEXT:    s_setpc_b64 s[30:31]
7239;
7240; GFX9-LABEL: global_atomic_min_i32_noret:
7241; GFX9:       ; %bb.0:
7242; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7243; GFX9-NEXT:    global_load_dword v4, v[0:1], off
7244; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7245; GFX9-NEXT:  .LBB120_1: ; %atomicrmw.start
7246; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7247; GFX9-NEXT:    s_waitcnt vmcnt(0)
7248; GFX9-NEXT:    v_min_i32_e32 v3, v4, v2
7249; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
7250; GFX9-NEXT:    s_waitcnt vmcnt(0)
7251; GFX9-NEXT:    buffer_wbinvl1_vol
7252; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7253; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7254; GFX9-NEXT:    v_mov_b32_e32 v4, v3
7255; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7256; GFX9-NEXT:    s_cbranch_execnz .LBB120_1
7257; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7258; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7259; GFX9-NEXT:    s_setpc_b64 s[30:31]
7260  %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
7261  ret void
7262}
7263
; Signed-min atomicrmw with the result unused, applied at %out plus four i32
; elements (16 bytes). The SI and GFX9 checks expect the offset folded into
; the load and the cmpswap as offset:16; the VI checks expect it added to the
; address first with v_add_u32/v_addc_u32. All three expect a v_min_i32 +
; cmpswap retry loop.
7264define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
7265; SI-LABEL: global_atomic_min_i32_noret_offset:
7266; SI:       ; %bb.0:
7267; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7268; SI-NEXT:    s_mov_b32 s6, 0
7269; SI-NEXT:    s_mov_b32 s7, 0xf000
7270; SI-NEXT:    s_mov_b32 s4, s6
7271; SI-NEXT:    s_mov_b32 s5, s6
7272; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
7273; SI-NEXT:    s_mov_b64 s[8:9], 0
7274; SI-NEXT:  .LBB121_1: ; %atomicrmw.start
7275; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7276; SI-NEXT:    s_waitcnt vmcnt(0)
7277; SI-NEXT:    v_min_i32_e32 v3, v4, v2
7278; SI-NEXT:    s_waitcnt expcnt(0)
7279; SI-NEXT:    v_mov_b32_e32 v6, v4
7280; SI-NEXT:    v_mov_b32_e32 v5, v3
7281; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
7282; SI-NEXT:    s_waitcnt vmcnt(0)
7283; SI-NEXT:    buffer_wbinvl1
7284; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
7285; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7286; SI-NEXT:    v_mov_b32_e32 v4, v5
7287; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7288; SI-NEXT:    s_cbranch_execnz .LBB121_1
7289; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7290; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7291; SI-NEXT:    s_waitcnt expcnt(0)
7292; SI-NEXT:    s_setpc_b64 s[30:31]
7293;
7294; VI-LABEL: global_atomic_min_i32_noret_offset:
7295; VI:       ; %bb.0:
7296; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7297; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
7298; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7299; VI-NEXT:    flat_load_dword v4, v[0:1]
7300; VI-NEXT:    s_mov_b64 s[4:5], 0
7301; VI-NEXT:  .LBB121_1: ; %atomicrmw.start
7302; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7303; VI-NEXT:    s_waitcnt vmcnt(0)
7304; VI-NEXT:    v_min_i32_e32 v3, v4, v2
7305; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7306; VI-NEXT:    s_waitcnt vmcnt(0)
7307; VI-NEXT:    buffer_wbinvl1_vol
7308; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7309; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7310; VI-NEXT:    v_mov_b32_e32 v4, v3
7311; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7312; VI-NEXT:    s_cbranch_execnz .LBB121_1
7313; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7314; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7315; VI-NEXT:    s_setpc_b64 s[30:31]
7316;
7317; GFX9-LABEL: global_atomic_min_i32_noret_offset:
7318; GFX9:       ; %bb.0:
7319; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7320; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
7321; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7322; GFX9-NEXT:  .LBB121_1: ; %atomicrmw.start
7323; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7324; GFX9-NEXT:    s_waitcnt vmcnt(0)
7325; GFX9-NEXT:    v_min_i32_e32 v3, v4, v2
7326; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
7327; GFX9-NEXT:    s_waitcnt vmcnt(0)
7328; GFX9-NEXT:    buffer_wbinvl1_vol
7329; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7330; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7331; GFX9-NEXT:    v_mov_b32_e32 v4, v3
7332; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7333; GFX9-NEXT:    s_cbranch_execnz .LBB121_1
7334; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7335; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7336; GFX9-NEXT:    s_setpc_b64 s[30:31]
7337  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7338  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
7339  ret void
7340}
7341
; Signed-min atomicrmw (seq_cst) directly on %ptr, returning the value the
; atomic produced. The checks for all three run lines expect a v_min_i32 +
; cmpswap retry loop whose result lives in v3, followed by a copy of v3 into
; v0 after the loop exits.
7342define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
7343; SI-LABEL: global_atomic_min_i32_ret:
7344; SI:       ; %bb.0:
7345; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7346; SI-NEXT:    s_mov_b32 s6, 0
7347; SI-NEXT:    s_mov_b32 s7, 0xf000
7348; SI-NEXT:    s_mov_b32 s4, s6
7349; SI-NEXT:    s_mov_b32 s5, s6
7350; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
7351; SI-NEXT:    s_mov_b64 s[8:9], 0
7352; SI-NEXT:  .LBB122_1: ; %atomicrmw.start
7353; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7354; SI-NEXT:    s_waitcnt vmcnt(0)
7355; SI-NEXT:    v_mov_b32_e32 v5, v3
7356; SI-NEXT:    s_waitcnt expcnt(0)
7357; SI-NEXT:    v_min_i32_e32 v4, v5, v2
7358; SI-NEXT:    v_mov_b32_e32 v3, v4
7359; SI-NEXT:    v_mov_b32_e32 v4, v5
7360; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
7361; SI-NEXT:    s_waitcnt vmcnt(0)
7362; SI-NEXT:    buffer_wbinvl1
7363; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
7364; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7365; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7366; SI-NEXT:    s_cbranch_execnz .LBB122_1
7367; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7368; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7369; SI-NEXT:    v_mov_b32_e32 v0, v3
7370; SI-NEXT:    s_waitcnt expcnt(0)
7371; SI-NEXT:    s_setpc_b64 s[30:31]
7372;
7373; VI-LABEL: global_atomic_min_i32_ret:
7374; VI:       ; %bb.0:
7375; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7376; VI-NEXT:    flat_load_dword v3, v[0:1]
7377; VI-NEXT:    s_mov_b64 s[4:5], 0
7378; VI-NEXT:  .LBB122_1: ; %atomicrmw.start
7379; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7380; VI-NEXT:    s_waitcnt vmcnt(0)
7381; VI-NEXT:    v_mov_b32_e32 v4, v3
7382; VI-NEXT:    v_min_i32_e32 v3, v4, v2
7383; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7384; VI-NEXT:    s_waitcnt vmcnt(0)
7385; VI-NEXT:    buffer_wbinvl1_vol
7386; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7387; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7388; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7389; VI-NEXT:    s_cbranch_execnz .LBB122_1
7390; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7391; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7392; VI-NEXT:    v_mov_b32_e32 v0, v3
7393; VI-NEXT:    s_setpc_b64 s[30:31]
7394;
7395; GFX9-LABEL: global_atomic_min_i32_ret:
7396; GFX9:       ; %bb.0:
7397; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7398; GFX9-NEXT:    global_load_dword v3, v[0:1], off
7399; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7400; GFX9-NEXT:  .LBB122_1: ; %atomicrmw.start
7401; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7402; GFX9-NEXT:    s_waitcnt vmcnt(0)
7403; GFX9-NEXT:    v_mov_b32_e32 v4, v3
7404; GFX9-NEXT:    v_min_i32_e32 v3, v4, v2
7405; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
7406; GFX9-NEXT:    s_waitcnt vmcnt(0)
7407; GFX9-NEXT:    buffer_wbinvl1_vol
7408; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7409; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7410; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7411; GFX9-NEXT:    s_cbranch_execnz .LBB122_1
7412; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7413; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7414; GFX9-NEXT:    v_mov_b32_e32 v0, v3
7415; GFX9-NEXT:    s_setpc_b64 s[30:31]
7416  %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
7417  ret i32 %result
7418}
7419
; Returning variant with a constant offset: seq_cst `atomicrmw min` on the
; global (addrspace 1) address four i32 elements past %out, i.e. byte offset 16
; as seen in the `offset:16` operands below. The previous memory value is the
; function result.
; All three check sets expect the operation to be expanded into a load followed
; by a compare-and-swap retry loop (%atomicrmw.start / %atomicrmw.end) that
; computes the new value with v_min_i32. The SI and GFX9 checks fold the offset
; into the memory instructions, while the VI checks add 16 to the pointer with
; v_add_u32/v_addc_u32 before the flat access.
7420define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
7421; SI-LABEL: global_atomic_min_i32_ret_offset:
7422; SI:       ; %bb.0:
7423; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7424; SI-NEXT:    s_mov_b32 s6, 0
7425; SI-NEXT:    s_mov_b32 s7, 0xf000
7426; SI-NEXT:    s_mov_b32 s4, s6
7427; SI-NEXT:    s_mov_b32 s5, s6
7428; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
7429; SI-NEXT:    s_mov_b64 s[8:9], 0
7430; SI-NEXT:  .LBB123_1: ; %atomicrmw.start
7431; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7432; SI-NEXT:    s_waitcnt vmcnt(0)
7433; SI-NEXT:    v_mov_b32_e32 v5, v3
7434; SI-NEXT:    s_waitcnt expcnt(0)
7435; SI-NEXT:    v_min_i32_e32 v4, v5, v2
7436; SI-NEXT:    v_mov_b32_e32 v3, v4
7437; SI-NEXT:    v_mov_b32_e32 v4, v5
7438; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
7439; SI-NEXT:    s_waitcnt vmcnt(0)
7440; SI-NEXT:    buffer_wbinvl1
7441; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
7442; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7443; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7444; SI-NEXT:    s_cbranch_execnz .LBB123_1
7445; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7446; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7447; SI-NEXT:    v_mov_b32_e32 v0, v3
7448; SI-NEXT:    s_waitcnt expcnt(0)
7449; SI-NEXT:    s_setpc_b64 s[30:31]
7450;
7451; VI-LABEL: global_atomic_min_i32_ret_offset:
7452; VI:       ; %bb.0:
7453; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7454; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
7455; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
7456; VI-NEXT:    flat_load_dword v0, v[3:4]
7457; VI-NEXT:    s_mov_b64 s[4:5], 0
7458; VI-NEXT:  .LBB123_1: ; %atomicrmw.start
7459; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7460; VI-NEXT:    s_waitcnt vmcnt(0)
7461; VI-NEXT:    v_mov_b32_e32 v1, v0
7462; VI-NEXT:    v_min_i32_e32 v0, v1, v2
7463; VI-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7464; VI-NEXT:    s_waitcnt vmcnt(0)
7465; VI-NEXT:    buffer_wbinvl1_vol
7466; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
7467; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7468; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7469; VI-NEXT:    s_cbranch_execnz .LBB123_1
7470; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7471; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7472; VI-NEXT:    s_setpc_b64 s[30:31]
7473;
7474; GFX9-LABEL: global_atomic_min_i32_ret_offset:
7475; GFX9:       ; %bb.0:
7476; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7477; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
7478; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7479; GFX9-NEXT:  .LBB123_1: ; %atomicrmw.start
7480; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7481; GFX9-NEXT:    s_waitcnt vmcnt(0)
7482; GFX9-NEXT:    v_mov_b32_e32 v4, v3
7483; GFX9-NEXT:    v_min_i32_e32 v3, v4, v2
7484; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
7485; GFX9-NEXT:    s_waitcnt vmcnt(0)
7486; GFX9-NEXT:    buffer_wbinvl1_vol
7487; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7488; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7489; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7490; GFX9-NEXT:    s_cbranch_execnz .LBB123_1
7491; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7492; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7493; GFX9-NEXT:    v_mov_b32_e32 v0, v3
7494; GFX9-NEXT:    s_setpc_b64 s[30:31]
7495  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7496  %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
7497  ret i32 %result
7498}
7499
; Scalar-argument, no-return variant: an amdgpu_gfx function whose pointer and
; operand are both marked `inreg`. It performs a seq_cst `atomicrmw min` directly
; on %ptr and discards the result (the function returns void).
; The checks expect a load plus compare-and-swap retry loop using v_min_i32 with
; the operand read from an SGPR. In the SI checks the incoming s6/s7 are saved
; into lanes of v4 (v_writelane/v_readlane) because s[4:7] is reused as the
; buffer resource descriptor; the VI checks copy the pointer into VGPRs for a
; flat access, and the GFX9 checks address memory through s[4:5] with a zero
; VGPR offset.
7500define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
7501; SI-LABEL: global_atomic_min_i32_noret_scalar:
7502; SI:       ; %bb.0:
7503; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7504; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7505; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
7506; SI-NEXT:    s_mov_b64 exec, s[34:35]
7507; SI-NEXT:    s_waitcnt expcnt(0)
7508; SI-NEXT:    v_writelane_b32 v4, s6, 0
7509; SI-NEXT:    v_writelane_b32 v4, s7, 1
7510; SI-NEXT:    s_mov_b32 s34, s6
7511; SI-NEXT:    s_mov_b32 s7, 0xf000
7512; SI-NEXT:    s_mov_b32 s6, -1
7513; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
7514; SI-NEXT:    s_mov_b64 s[36:37], 0
7515; SI-NEXT:  .LBB124_1: ; %atomicrmw.start
7516; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7517; SI-NEXT:    s_waitcnt vmcnt(0)
7518; SI-NEXT:    v_min_i32_e32 v0, s34, v1
7519; SI-NEXT:    s_waitcnt expcnt(0)
7520; SI-NEXT:    v_mov_b32_e32 v3, v1
7521; SI-NEXT:    v_mov_b32_e32 v2, v0
7522; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
7523; SI-NEXT:    s_waitcnt vmcnt(0)
7524; SI-NEXT:    buffer_wbinvl1
7525; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
7526; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
7527; SI-NEXT:    v_mov_b32_e32 v1, v2
7528; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
7529; SI-NEXT:    s_cbranch_execnz .LBB124_1
7530; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7531; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
7532; SI-NEXT:    v_readlane_b32 s7, v4, 1
7533; SI-NEXT:    v_readlane_b32 s6, v4, 0
7534; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7535; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
7536; SI-NEXT:    s_mov_b64 exec, s[34:35]
7537; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
7538; SI-NEXT:    s_setpc_b64 s[30:31]
7539;
7540; VI-LABEL: global_atomic_min_i32_noret_scalar:
7541; VI:       ; %bb.0:
7542; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7543; VI-NEXT:    v_mov_b32_e32 v0, s4
7544; VI-NEXT:    v_mov_b32_e32 v1, s5
7545; VI-NEXT:    flat_load_dword v3, v[0:1]
7546; VI-NEXT:    s_mov_b64 s[34:35], 0
7547; VI-NEXT:  .LBB124_1: ; %atomicrmw.start
7548; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7549; VI-NEXT:    s_waitcnt vmcnt(0)
7550; VI-NEXT:    v_min_i32_e32 v2, s6, v3
7551; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7552; VI-NEXT:    s_waitcnt vmcnt(0)
7553; VI-NEXT:    buffer_wbinvl1_vol
7554; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7555; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7556; VI-NEXT:    v_mov_b32_e32 v3, v2
7557; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7558; VI-NEXT:    s_cbranch_execnz .LBB124_1
7559; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7560; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
7561; VI-NEXT:    s_setpc_b64 s[30:31]
7562;
7563; GFX9-LABEL: global_atomic_min_i32_noret_scalar:
7564; GFX9:       ; %bb.0:
7565; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7566; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7567; GFX9-NEXT:    global_load_dword v1, v2, s[4:5]
7568; GFX9-NEXT:    s_mov_b64 s[34:35], 0
7569; GFX9-NEXT:  .LBB124_1: ; %atomicrmw.start
7570; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7571; GFX9-NEXT:    s_waitcnt vmcnt(0)
7572; GFX9-NEXT:    v_min_i32_e32 v0, s6, v1
7573; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
7574; GFX9-NEXT:    s_waitcnt vmcnt(0)
7575; GFX9-NEXT:    buffer_wbinvl1_vol
7576; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
7577; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7578; GFX9-NEXT:    v_mov_b32_e32 v1, v0
7579; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7580; GFX9-NEXT:    s_cbranch_execnz .LBB124_1
7581; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7582; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
7583; GFX9-NEXT:    s_setpc_b64 s[30:31]
7584  %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
7585  ret void
7586}
7587
; Scalar-argument, no-return variant with a constant offset: an amdgpu_gfx
; function with `inreg` pointer and operand that performs a seq_cst
; `atomicrmw min` four i32 elements (16 bytes) past %out and ignores the result.
; The expected code is a load plus compare-and-swap retry loop around v_min_i32.
; The SI and GFX9 checks carry the displacement as `offset:16` on the memory
; instructions; the VI checks instead add 16 to the pointer with
; s_add_u32/s_addc_u32 before moving it into VGPRs for the flat access.
7588define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
7589; SI-LABEL: global_atomic_min_i32_noret_offset_scalar:
7590; SI:       ; %bb.0:
7591; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7592; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7593; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
7594; SI-NEXT:    s_mov_b64 exec, s[34:35]
7595; SI-NEXT:    s_waitcnt expcnt(0)
7596; SI-NEXT:    v_writelane_b32 v4, s6, 0
7597; SI-NEXT:    v_writelane_b32 v4, s7, 1
7598; SI-NEXT:    s_mov_b32 s34, s6
7599; SI-NEXT:    s_mov_b32 s7, 0xf000
7600; SI-NEXT:    s_mov_b32 s6, -1
7601; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
7602; SI-NEXT:    s_mov_b64 s[36:37], 0
7603; SI-NEXT:  .LBB125_1: ; %atomicrmw.start
7604; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7605; SI-NEXT:    s_waitcnt vmcnt(0)
7606; SI-NEXT:    v_min_i32_e32 v0, s34, v1
7607; SI-NEXT:    s_waitcnt expcnt(0)
7608; SI-NEXT:    v_mov_b32_e32 v3, v1
7609; SI-NEXT:    v_mov_b32_e32 v2, v0
7610; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
7611; SI-NEXT:    s_waitcnt vmcnt(0)
7612; SI-NEXT:    buffer_wbinvl1
7613; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
7614; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
7615; SI-NEXT:    v_mov_b32_e32 v1, v2
7616; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
7617; SI-NEXT:    s_cbranch_execnz .LBB125_1
7618; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7619; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
7620; SI-NEXT:    v_readlane_b32 s7, v4, 1
7621; SI-NEXT:    v_readlane_b32 s6, v4, 0
7622; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7623; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
7624; SI-NEXT:    s_mov_b64 exec, s[34:35]
7625; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
7626; SI-NEXT:    s_setpc_b64 s[30:31]
7627;
7628; VI-LABEL: global_atomic_min_i32_noret_offset_scalar:
7629; VI:       ; %bb.0:
7630; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7631; VI-NEXT:    s_add_u32 s34, s4, 16
7632; VI-NEXT:    s_addc_u32 s35, s5, 0
7633; VI-NEXT:    v_mov_b32_e32 v0, s34
7634; VI-NEXT:    v_mov_b32_e32 v1, s35
7635; VI-NEXT:    flat_load_dword v3, v[0:1]
7636; VI-NEXT:    s_mov_b64 s[34:35], 0
7637; VI-NEXT:  .LBB125_1: ; %atomicrmw.start
7638; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7639; VI-NEXT:    s_waitcnt vmcnt(0)
7640; VI-NEXT:    v_min_i32_e32 v2, s6, v3
7641; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7642; VI-NEXT:    s_waitcnt vmcnt(0)
7643; VI-NEXT:    buffer_wbinvl1_vol
7644; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7645; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7646; VI-NEXT:    v_mov_b32_e32 v3, v2
7647; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7648; VI-NEXT:    s_cbranch_execnz .LBB125_1
7649; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7650; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
7651; VI-NEXT:    s_setpc_b64 s[30:31]
7652;
7653; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar:
7654; GFX9:       ; %bb.0:
7655; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7656; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7657; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:16
7658; GFX9-NEXT:    s_mov_b64 s[34:35], 0
7659; GFX9-NEXT:  .LBB125_1: ; %atomicrmw.start
7660; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7661; GFX9-NEXT:    s_waitcnt vmcnt(0)
7662; GFX9-NEXT:    v_min_i32_e32 v0, s6, v1
7663; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
7664; GFX9-NEXT:    s_waitcnt vmcnt(0)
7665; GFX9-NEXT:    buffer_wbinvl1_vol
7666; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
7667; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7668; GFX9-NEXT:    v_mov_b32_e32 v1, v0
7669; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7670; GFX9-NEXT:    s_cbranch_execnz .LBB125_1
7671; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7672; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
7673; GFX9-NEXT:    s_setpc_b64 s[30:31]
7674  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7675  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
7676  ret void
7677}
7678
; Scalar-argument, returning variant: an amdgpu_gfx function with `inreg`
; pointer and operand that performs a seq_cst `atomicrmw min` directly on %ptr
; and returns the previous memory value.
; The checks expect a load plus compare-and-swap retry loop around v_min_i32,
; with the compare-and-swap result landing in v0 on every target. Because v0
; carries the result, the VI checks keep the pointer in v[1:2] for the loop, and
; the SI checks preserve the incoming s6/s7 in lanes of v3 while s[4:7] serves as
; the buffer resource descriptor.
7679define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
7680; SI-LABEL: global_atomic_min_i32_ret_scalar:
7681; SI:       ; %bb.0:
7682; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7683; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7684; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
7685; SI-NEXT:    s_mov_b64 exec, s[34:35]
7686; SI-NEXT:    s_waitcnt expcnt(0)
7687; SI-NEXT:    v_writelane_b32 v3, s6, 0
7688; SI-NEXT:    v_writelane_b32 v3, s7, 1
7689; SI-NEXT:    s_mov_b32 s34, s6
7690; SI-NEXT:    s_mov_b32 s7, 0xf000
7691; SI-NEXT:    s_mov_b32 s6, -1
7692; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
7693; SI-NEXT:    s_mov_b64 s[36:37], 0
7694; SI-NEXT:  .LBB126_1: ; %atomicrmw.start
7695; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7696; SI-NEXT:    s_waitcnt vmcnt(0)
7697; SI-NEXT:    v_mov_b32_e32 v2, v0
7698; SI-NEXT:    s_waitcnt expcnt(0)
7699; SI-NEXT:    v_min_i32_e32 v1, s34, v2
7700; SI-NEXT:    v_mov_b32_e32 v0, v1
7701; SI-NEXT:    v_mov_b32_e32 v1, v2
7702; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
7703; SI-NEXT:    s_waitcnt vmcnt(0)
7704; SI-NEXT:    buffer_wbinvl1
7705; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
7706; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
7707; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
7708; SI-NEXT:    s_cbranch_execnz .LBB126_1
7709; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7710; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
7711; SI-NEXT:    v_readlane_b32 s7, v3, 1
7712; SI-NEXT:    v_readlane_b32 s6, v3, 0
7713; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7714; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
7715; SI-NEXT:    s_mov_b64 exec, s[34:35]
7716; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
7717; SI-NEXT:    s_setpc_b64 s[30:31]
7718;
7719; VI-LABEL: global_atomic_min_i32_ret_scalar:
7720; VI:       ; %bb.0:
7721; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7722; VI-NEXT:    v_mov_b32_e32 v0, s4
7723; VI-NEXT:    v_mov_b32_e32 v1, s5
7724; VI-NEXT:    flat_load_dword v0, v[0:1]
7725; VI-NEXT:    v_mov_b32_e32 v1, s4
7726; VI-NEXT:    s_mov_b64 s[34:35], 0
7727; VI-NEXT:    v_mov_b32_e32 v2, s5
7728; VI-NEXT:  .LBB126_1: ; %atomicrmw.start
7729; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7730; VI-NEXT:    s_waitcnt vmcnt(0)
7731; VI-NEXT:    v_mov_b32_e32 v4, v0
7732; VI-NEXT:    v_min_i32_e32 v3, s6, v4
7733; VI-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
7734; VI-NEXT:    s_waitcnt vmcnt(0)
7735; VI-NEXT:    buffer_wbinvl1_vol
7736; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
7737; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7738; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7739; VI-NEXT:    s_cbranch_execnz .LBB126_1
7740; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7741; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
7742; VI-NEXT:    s_setpc_b64 s[30:31]
7743;
7744; GFX9-LABEL: global_atomic_min_i32_ret_scalar:
7745; GFX9:       ; %bb.0:
7746; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7747; GFX9-NEXT:    v_mov_b32_e32 v1, 0
7748; GFX9-NEXT:    global_load_dword v0, v1, s[4:5]
7749; GFX9-NEXT:    s_mov_b64 s[34:35], 0
7750; GFX9-NEXT:  .LBB126_1: ; %atomicrmw.start
7751; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7752; GFX9-NEXT:    s_waitcnt vmcnt(0)
7753; GFX9-NEXT:    v_mov_b32_e32 v3, v0
7754; GFX9-NEXT:    v_min_i32_e32 v2, s6, v3
7755; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
7756; GFX9-NEXT:    s_waitcnt vmcnt(0)
7757; GFX9-NEXT:    buffer_wbinvl1_vol
7758; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
7759; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7760; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7761; GFX9-NEXT:    s_cbranch_execnz .LBB126_1
7762; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7763; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
7764; GFX9-NEXT:    s_setpc_b64 s[30:31]
7765  %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
7766  ret i32 %result
7767}
7768
; Scalar-argument, returning variant with a constant offset: an amdgpu_gfx
; function with `inreg` pointer and operand that performs a seq_cst
; `atomicrmw min` four i32 elements (16 bytes) past %out and returns the previous
; memory value.
; The checks expect a load plus compare-and-swap retry loop around v_min_i32
; with the result in v0. The SI and GFX9 checks encode the displacement as
; `offset:16`; the VI checks add 16 to the pointer with s_add_u32/s_addc_u32 and
; keep the resulting address in v[1:2].
7769define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
7770; SI-LABEL: global_atomic_min_i32_ret_offset_scalar:
7771; SI:       ; %bb.0:
7772; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7773; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7774; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
7775; SI-NEXT:    s_mov_b64 exec, s[34:35]
7776; SI-NEXT:    s_waitcnt expcnt(0)
7777; SI-NEXT:    v_writelane_b32 v3, s6, 0
7778; SI-NEXT:    v_writelane_b32 v3, s7, 1
7779; SI-NEXT:    s_mov_b32 s34, s6
7780; SI-NEXT:    s_mov_b32 s7, 0xf000
7781; SI-NEXT:    s_mov_b32 s6, -1
7782; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
7783; SI-NEXT:    s_mov_b64 s[36:37], 0
7784; SI-NEXT:  .LBB127_1: ; %atomicrmw.start
7785; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7786; SI-NEXT:    s_waitcnt vmcnt(0)
7787; SI-NEXT:    v_mov_b32_e32 v2, v0
7788; SI-NEXT:    s_waitcnt expcnt(0)
7789; SI-NEXT:    v_min_i32_e32 v1, s34, v2
7790; SI-NEXT:    v_mov_b32_e32 v0, v1
7791; SI-NEXT:    v_mov_b32_e32 v1, v2
7792; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7793; SI-NEXT:    s_waitcnt vmcnt(0)
7794; SI-NEXT:    buffer_wbinvl1
7795; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
7796; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
7797; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
7798; SI-NEXT:    s_cbranch_execnz .LBB127_1
7799; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7800; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
7801; SI-NEXT:    v_readlane_b32 s7, v3, 1
7802; SI-NEXT:    v_readlane_b32 s6, v3, 0
7803; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7804; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
7805; SI-NEXT:    s_mov_b64 exec, s[34:35]
7806; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
7807; SI-NEXT:    s_setpc_b64 s[30:31]
7808;
7809; VI-LABEL: global_atomic_min_i32_ret_offset_scalar:
7810; VI:       ; %bb.0:
7811; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7812; VI-NEXT:    s_add_u32 s34, s4, 16
7813; VI-NEXT:    s_addc_u32 s35, s5, 0
7814; VI-NEXT:    v_mov_b32_e32 v1, s34
7815; VI-NEXT:    v_mov_b32_e32 v2, s35
7816; VI-NEXT:    flat_load_dword v0, v[1:2]
7817; VI-NEXT:    s_mov_b64 s[34:35], 0
7818; VI-NEXT:  .LBB127_1: ; %atomicrmw.start
7819; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7820; VI-NEXT:    s_waitcnt vmcnt(0)
7821; VI-NEXT:    v_mov_b32_e32 v4, v0
7822; VI-NEXT:    v_min_i32_e32 v3, s6, v4
7823; VI-NEXT:    flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
7824; VI-NEXT:    s_waitcnt vmcnt(0)
7825; VI-NEXT:    buffer_wbinvl1_vol
7826; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
7827; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7828; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7829; VI-NEXT:    s_cbranch_execnz .LBB127_1
7830; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7831; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
7832; VI-NEXT:    s_setpc_b64 s[30:31]
7833;
7834; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar:
7835; GFX9:       ; %bb.0:
7836; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7837; GFX9-NEXT:    v_mov_b32_e32 v1, 0
7838; GFX9-NEXT:    global_load_dword v0, v1, s[4:5] offset:16
7839; GFX9-NEXT:    s_mov_b64 s[34:35], 0
7840; GFX9-NEXT:  .LBB127_1: ; %atomicrmw.start
7841; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7842; GFX9-NEXT:    s_waitcnt vmcnt(0)
7843; GFX9-NEXT:    v_mov_b32_e32 v3, v0
7844; GFX9-NEXT:    v_min_i32_e32 v2, s6, v3
7845; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
7846; GFX9-NEXT:    s_waitcnt vmcnt(0)
7847; GFX9-NEXT:    buffer_wbinvl1_vol
7848; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
7849; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7850; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7851; GFX9-NEXT:    s_cbranch_execnz .LBB127_1
7852; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7853; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
7854; GFX9-NEXT:    s_setpc_b64 s[30:31]
7855  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7856  %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
7857  ret i32 %result
7858}
7859
; Kernel variant with a runtime index plus a constant offset: the address is
; %out advanced by %index i32 elements and then by 4 more (16 bytes). A seq_cst
; `atomicrmw min` is performed there and the result is unused.
; The checks expect the kernel arguments to be fetched with s_load_dwordx4, the
; index to be sign-extended (s_ashr_i32 by 31) and scaled by 4 (s_lshl_b64 by 2),
; and the initial memory value to be read with a scalar s_load_dword before
; entering the compare-and-swap retry loop built around v_min_i32. The function
; ends with s_endpgm straight after the loop.
7860define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) {
7861; SI-LABEL: atomic_min_i32_addr64_offset:
7862; SI:       ; %bb.0: ; %entry
7863; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7864; SI-NEXT:    s_waitcnt lgkmcnt(0)
7865; SI-NEXT:    s_ashr_i32 s5, s3, 31
7866; SI-NEXT:    s_mov_b32 s4, s3
7867; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
7868; SI-NEXT:    s_add_u32 s4, s0, s4
7869; SI-NEXT:    s_addc_u32 s5, s1, s5
7870; SI-NEXT:    s_load_dword s3, s[4:5], 0x4
7871; SI-NEXT:    s_mov_b64 s[0:1], 0
7872; SI-NEXT:    s_mov_b32 s7, 0xf000
7873; SI-NEXT:    s_waitcnt lgkmcnt(0)
7874; SI-NEXT:    v_mov_b32_e32 v1, s3
7875; SI-NEXT:    s_mov_b32 s6, -1
7876; SI-NEXT:  .LBB128_1: ; %atomicrmw.start
7877; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7878; SI-NEXT:    v_min_i32_e32 v0, s2, v1
7879; SI-NEXT:    s_waitcnt expcnt(0)
7880; SI-NEXT:    v_mov_b32_e32 v3, v1
7881; SI-NEXT:    v_mov_b32_e32 v2, v0
7882; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
7883; SI-NEXT:    s_waitcnt vmcnt(0)
7884; SI-NEXT:    buffer_wbinvl1
7885; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
7886; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7887; SI-NEXT:    v_mov_b32_e32 v1, v2
7888; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7889; SI-NEXT:    s_cbranch_execnz .LBB128_1
7890; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7891; SI-NEXT:    s_endpgm
7892;
7893; VI-LABEL: atomic_min_i32_addr64_offset:
7894; VI:       ; %bb.0: ; %entry
7895; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7896; VI-NEXT:    s_waitcnt lgkmcnt(0)
7897; VI-NEXT:    s_ashr_i32 s5, s3, 31
7898; VI-NEXT:    s_mov_b32 s4, s3
7899; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
7900; VI-NEXT:    s_add_u32 s4, s0, s4
7901; VI-NEXT:    s_addc_u32 s5, s1, s5
7902; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
7903; VI-NEXT:    s_add_u32 s4, s4, 16
7904; VI-NEXT:    s_addc_u32 s5, s5, 0
7905; VI-NEXT:    v_mov_b32_e32 v0, s4
7906; VI-NEXT:    s_mov_b64 s[0:1], 0
7907; VI-NEXT:    s_waitcnt lgkmcnt(0)
7908; VI-NEXT:    v_mov_b32_e32 v3, s3
7909; VI-NEXT:    v_mov_b32_e32 v1, s5
7910; VI-NEXT:  .LBB128_1: ; %atomicrmw.start
7911; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7912; VI-NEXT:    v_min_i32_e32 v2, s2, v3
7913; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7914; VI-NEXT:    s_waitcnt vmcnt(0)
7915; VI-NEXT:    buffer_wbinvl1_vol
7916; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7917; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7918; VI-NEXT:    v_mov_b32_e32 v3, v2
7919; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7920; VI-NEXT:    s_cbranch_execnz .LBB128_1
7921; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7922; VI-NEXT:    s_endpgm
7923;
7924; GFX9-LABEL: atomic_min_i32_addr64_offset:
7925; GFX9:       ; %bb.0: ; %entry
7926; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7927; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7928; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7929; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
7930; GFX9-NEXT:    s_mov_b32 s4, s3
7931; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
7932; GFX9-NEXT:    s_add_u32 s0, s0, s4
7933; GFX9-NEXT:    s_addc_u32 s1, s1, s5
7934; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
7935; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7936; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7937; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7938; GFX9-NEXT:  .LBB128_1: ; %atomicrmw.start
7939; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7940; GFX9-NEXT:    v_min_i32_e32 v0, s2, v1
7941; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7942; GFX9-NEXT:    s_waitcnt vmcnt(0)
7943; GFX9-NEXT:    buffer_wbinvl1_vol
7944; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
7945; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7946; GFX9-NEXT:    v_mov_b32_e32 v1, v0
7947; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7948; GFX9-NEXT:    s_cbranch_execnz .LBB128_1
7949; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7950; GFX9-NEXT:    s_endpgm
7951entry:
7952  %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
7953  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
7954  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
7955  ret void
7956}
7957
; Kernel variant with a runtime index, a constant offset and a used result: the
; seq_cst `atomicrmw min` targets %out advanced by %index i32 elements plus 4
; more (16 bytes), and the previous memory value is stored to %out2.
; The checks expect two scalar argument loads (s_load_dwordx2 for %in/%index and
; s_load_dwordx4 for the two pointers), sign-extension and scaling of the index,
; a scalar load of the initial value, and a compare-and-swap retry loop around
; v_min_i32. After the loop exec is restored with s_or_b64 and the final
; compare-and-swap result is written through the pointer held in s[2:3].
7958define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
7959; SI-LABEL: atomic_min_i32_ret_addr64_offset:
7960; SI:       ; %bb.0: ; %entry
7961; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
7962; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7963; SI-NEXT:    s_waitcnt lgkmcnt(0)
7964; SI-NEXT:    s_ashr_i32 s5, s9, 31
7965; SI-NEXT:    s_mov_b32 s4, s9
7966; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
7967; SI-NEXT:    s_add_u32 s4, s0, s4
7968; SI-NEXT:    s_addc_u32 s5, s1, s5
7969; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
7970; SI-NEXT:    s_mov_b64 s[0:1], 0
7971; SI-NEXT:    s_mov_b32 s7, 0xf000
7972; SI-NEXT:    s_waitcnt lgkmcnt(0)
7973; SI-NEXT:    v_mov_b32_e32 v1, s6
7974; SI-NEXT:    s_mov_b32 s6, -1
7975; SI-NEXT:  .LBB129_1: ; %atomicrmw.start
7976; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7977; SI-NEXT:    v_min_i32_e32 v0, s8, v1
7978; SI-NEXT:    s_waitcnt expcnt(0)
7979; SI-NEXT:    v_mov_b32_e32 v3, v1
7980; SI-NEXT:    v_mov_b32_e32 v2, v0
7981; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
7982; SI-NEXT:    s_waitcnt vmcnt(0)
7983; SI-NEXT:    buffer_wbinvl1
7984; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
7985; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7986; SI-NEXT:    v_mov_b32_e32 v1, v2
7987; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7988; SI-NEXT:    s_cbranch_execnz .LBB129_1
7989; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7990; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
7991; SI-NEXT:    s_mov_b32 s7, 0xf000
7992; SI-NEXT:    s_mov_b32 s6, -1
7993; SI-NEXT:    s_mov_b32 s4, s2
7994; SI-NEXT:    s_mov_b32 s5, s3
7995; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
7996; SI-NEXT:    s_endpgm
7997;
7998; VI-LABEL: atomic_min_i32_ret_addr64_offset:
7999; VI:       ; %bb.0: ; %entry
8000; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
8001; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8002; VI-NEXT:    s_waitcnt lgkmcnt(0)
8003; VI-NEXT:    s_ashr_i32 s5, s7, 31
8004; VI-NEXT:    s_mov_b32 s4, s7
8005; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
8006; VI-NEXT:    s_add_u32 s4, s0, s4
8007; VI-NEXT:    s_addc_u32 s5, s1, s5
8008; VI-NEXT:    s_load_dword s7, s[4:5], 0x10
8009; VI-NEXT:    s_add_u32 s4, s4, 16
8010; VI-NEXT:    s_addc_u32 s5, s5, 0
8011; VI-NEXT:    v_mov_b32_e32 v0, s4
8012; VI-NEXT:    s_mov_b64 s[0:1], 0
8013; VI-NEXT:    s_waitcnt lgkmcnt(0)
8014; VI-NEXT:    v_mov_b32_e32 v2, s7
8015; VI-NEXT:    v_mov_b32_e32 v1, s5
8016; VI-NEXT:  .LBB129_1: ; %atomicrmw.start
8017; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8018; VI-NEXT:    v_mov_b32_e32 v3, v2
8019; VI-NEXT:    v_min_i32_e32 v2, s6, v3
8020; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8021; VI-NEXT:    s_waitcnt vmcnt(0)
8022; VI-NEXT:    buffer_wbinvl1_vol
8023; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8024; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8025; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8026; VI-NEXT:    s_cbranch_execnz .LBB129_1
8027; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8028; VI-NEXT:    s_or_b64 exec, exec, s[0:1]
8029; VI-NEXT:    v_mov_b32_e32 v0, s2
8030; VI-NEXT:    v_mov_b32_e32 v1, s3
8031; VI-NEXT:    flat_store_dword v[0:1], v2
8032; VI-NEXT:    s_endpgm
8033;
8034; GFX9-LABEL: atomic_min_i32_ret_addr64_offset:
8035; GFX9:       ; %bb.0: ; %entry
8036; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
8037; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8038; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8039; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8040; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
8041; GFX9-NEXT:    s_mov_b32 s4, s7
8042; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
8043; GFX9-NEXT:    s_add_u32 s0, s0, s4
8044; GFX9-NEXT:    s_addc_u32 s1, s1, s5
8045; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x10
8046; GFX9-NEXT:    s_mov_b64 s[4:5], 0
8047; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8048; GFX9-NEXT:    v_mov_b32_e32 v0, s7
8049; GFX9-NEXT:  .LBB129_1: ; %atomicrmw.start
8050; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8051; GFX9-NEXT:    v_mov_b32_e32 v3, v0
8052; GFX9-NEXT:    v_min_i32_e32 v2, s6, v3
8053; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc
8054; GFX9-NEXT:    s_waitcnt vmcnt(0)
8055; GFX9-NEXT:    buffer_wbinvl1_vol
8056; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
8057; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8058; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8059; GFX9-NEXT:    s_cbranch_execnz .LBB129_1
8060; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8061; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
8062; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8063; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
8064; GFX9-NEXT:    s_endpgm
8065entry:
8066  %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
8067  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
8068  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
8069  store i32 %tmp0, ptr addrspace(1) %out2
8070  ret void
8071}
8072
; Simplest kernel variant: a seq_cst `atomicrmw min` performed directly on %out
; with no index or offset; the result is unused.
; The checks expect the pointer and operand to arrive via scalar kernel-argument
; loads (s_load_dwordx2 plus s_load_dword), the initial memory value to be read
; with s_load_dword at offset 0x0, and the update to be carried out by a
; compare-and-swap retry loop around v_min_i32 that runs until exec becomes
; empty, followed immediately by s_endpgm.
8073define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
8074; SI-LABEL: atomic_min_i32:
8075; SI:       ; %bb.0: ; %entry
8076; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
8077; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
8078; SI-NEXT:    s_waitcnt lgkmcnt(0)
8079; SI-NEXT:    s_load_dword s2, s[0:1], 0x0
8080; SI-NEXT:    s_mov_b64 s[4:5], 0
8081; SI-NEXT:    s_mov_b32 s3, 0xf000
8082; SI-NEXT:    s_waitcnt lgkmcnt(0)
8083; SI-NEXT:    v_mov_b32_e32 v1, s2
8084; SI-NEXT:    s_mov_b32 s2, -1
8085; SI-NEXT:  .LBB130_1: ; %atomicrmw.start
8086; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8087; SI-NEXT:    v_min_i32_e32 v0, s6, v1
8088; SI-NEXT:    s_waitcnt expcnt(0)
8089; SI-NEXT:    v_mov_b32_e32 v3, v1
8090; SI-NEXT:    v_mov_b32_e32 v2, v0
8091; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
8092; SI-NEXT:    s_waitcnt vmcnt(0)
8093; SI-NEXT:    buffer_wbinvl1
8094; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
8095; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8096; SI-NEXT:    v_mov_b32_e32 v1, v2
8097; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8098; SI-NEXT:    s_cbranch_execnz .LBB130_1
8099; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8100; SI-NEXT:    s_endpgm
8101;
8102; VI-LABEL: atomic_min_i32:
8103; VI:       ; %bb.0: ; %entry
8104; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
8105; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
8106; VI-NEXT:    s_mov_b64 s[0:1], 0
8107; VI-NEXT:    s_waitcnt lgkmcnt(0)
8108; VI-NEXT:    s_load_dword s3, s[6:7], 0x0
8109; VI-NEXT:    v_mov_b32_e32 v0, s6
8110; VI-NEXT:    v_mov_b32_e32 v1, s7
8111; VI-NEXT:    s_waitcnt lgkmcnt(0)
8112; VI-NEXT:    v_mov_b32_e32 v3, s3
8113; VI-NEXT:  .LBB130_1: ; %atomicrmw.start
8114; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8115; VI-NEXT:    v_min_i32_e32 v2, s2, v3
8116; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8117; VI-NEXT:    s_waitcnt vmcnt(0)
8118; VI-NEXT:    buffer_wbinvl1_vol
8119; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8120; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8121; VI-NEXT:    v_mov_b32_e32 v3, v2
8122; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8123; VI-NEXT:    s_cbranch_execnz .LBB130_1
8124; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8125; VI-NEXT:    s_endpgm
8126;
8127; GFX9-LABEL: atomic_min_i32:
8128; GFX9:       ; %bb.0: ; %entry
8129; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8130; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
8131; GFX9-NEXT:    s_mov_b64 s[2:3], 0
8132; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8133; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8134; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
8135; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8136; GFX9-NEXT:    v_mov_b32_e32 v1, s4
8137; GFX9-NEXT:  .LBB130_1: ; %atomicrmw.start
8138; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8139; GFX9-NEXT:    v_min_i32_e32 v0, s6, v1
8140; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
8141; GFX9-NEXT:    s_waitcnt vmcnt(0)
8142; GFX9-NEXT:    buffer_wbinvl1_vol
8143; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
8144; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
8145; GFX9-NEXT:    v_mov_b32_e32 v1, v0
8146; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
8147; GFX9-NEXT:    s_cbranch_execnz .LBB130_1
8148; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8149; GFX9-NEXT:    s_endpgm
8150entry:
8151  %tmp0 = atomicrmw min ptr addrspace(1) %out, i32 %in seq_cst
8152  ret void
8153}
8154
; Kernel test: seq_cst `atomicrmw min` (no syncscope given) on the i32 at
; %out[%index]; the old value returned by the atomic is stored to %out2.
; The checks for SI, VI and GFX9 all expect a compare-and-swap retry loop
; (.LBB131_1, %atomicrmw.start): v_min_i32 computes the candidate value, a
; *_atomic_cmpswap ... glc attempts the update, and v_cmp_eq_u32 plus the
; exec-mask updates leave the loop once the returned value equals the value
; that was expected. The store to %out2 follows the loop.
8155define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
8156; SI-LABEL: atomic_min_i32_ret_addr64:
8157; SI:       ; %bb.0: ; %entry
8158; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
8159; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
8160; SI-NEXT:    s_waitcnt lgkmcnt(0)
8161; SI-NEXT:    s_ashr_i32 s5, s9, 31
8162; SI-NEXT:    s_mov_b32 s4, s9
8163; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
8164; SI-NEXT:    s_add_u32 s4, s0, s4
8165; SI-NEXT:    s_addc_u32 s5, s1, s5
8166; SI-NEXT:    s_load_dword s6, s[4:5], 0x0
8167; SI-NEXT:    s_mov_b64 s[0:1], 0
8168; SI-NEXT:    s_mov_b32 s7, 0xf000
8169; SI-NEXT:    s_waitcnt lgkmcnt(0)
8170; SI-NEXT:    v_mov_b32_e32 v1, s6
8171; SI-NEXT:    s_mov_b32 s6, -1
8172; SI-NEXT:  .LBB131_1: ; %atomicrmw.start
8173; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8174; SI-NEXT:    v_min_i32_e32 v0, s8, v1
8175; SI-NEXT:    s_waitcnt expcnt(0)
8176; SI-NEXT:    v_mov_b32_e32 v3, v1
8177; SI-NEXT:    v_mov_b32_e32 v2, v0
8178; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
8179; SI-NEXT:    s_waitcnt vmcnt(0)
8180; SI-NEXT:    buffer_wbinvl1
8181; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
8182; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8183; SI-NEXT:    v_mov_b32_e32 v1, v2
8184; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8185; SI-NEXT:    s_cbranch_execnz .LBB131_1
8186; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8187; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
8188; SI-NEXT:    s_mov_b32 s7, 0xf000
8189; SI-NEXT:    s_mov_b32 s6, -1
8190; SI-NEXT:    s_mov_b32 s4, s2
8191; SI-NEXT:    s_mov_b32 s5, s3
8192; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
8193; SI-NEXT:    s_endpgm
8194;
8195; VI-LABEL: atomic_min_i32_ret_addr64:
8196; VI:       ; %bb.0: ; %entry
8197; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
8198; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8199; VI-NEXT:    s_waitcnt lgkmcnt(0)
8200; VI-NEXT:    s_ashr_i32 s5, s7, 31
8201; VI-NEXT:    s_mov_b32 s4, s7
8202; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
8203; VI-NEXT:    s_add_u32 s4, s0, s4
8204; VI-NEXT:    s_addc_u32 s5, s1, s5
8205; VI-NEXT:    s_load_dword s7, s[4:5], 0x0
8206; VI-NEXT:    v_mov_b32_e32 v0, s4
8207; VI-NEXT:    s_mov_b64 s[0:1], 0
8208; VI-NEXT:    v_mov_b32_e32 v1, s5
8209; VI-NEXT:    s_waitcnt lgkmcnt(0)
8210; VI-NEXT:    v_mov_b32_e32 v2, s7
8211; VI-NEXT:  .LBB131_1: ; %atomicrmw.start
8212; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8213; VI-NEXT:    v_mov_b32_e32 v3, v2
8214; VI-NEXT:    v_min_i32_e32 v2, s6, v3
8215; VI-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8216; VI-NEXT:    s_waitcnt vmcnt(0)
8217; VI-NEXT:    buffer_wbinvl1_vol
8218; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8219; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8220; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8221; VI-NEXT:    s_cbranch_execnz .LBB131_1
8222; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8223; VI-NEXT:    s_or_b64 exec, exec, s[0:1]
8224; VI-NEXT:    v_mov_b32_e32 v0, s2
8225; VI-NEXT:    v_mov_b32_e32 v1, s3
8226; VI-NEXT:    flat_store_dword v[0:1], v2
8227; VI-NEXT:    s_endpgm
8228;
8229; GFX9-LABEL: atomic_min_i32_ret_addr64:
8230; GFX9:       ; %bb.0: ; %entry
8231; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
8232; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8233; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8234; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8235; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
8236; GFX9-NEXT:    s_mov_b32 s4, s7
8237; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
8238; GFX9-NEXT:    s_add_u32 s0, s0, s4
8239; GFX9-NEXT:    s_addc_u32 s1, s1, s5
8240; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
8241; GFX9-NEXT:    s_mov_b64 s[4:5], 0
8242; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8243; GFX9-NEXT:    v_mov_b32_e32 v0, s7
8244; GFX9-NEXT:  .LBB131_1: ; %atomicrmw.start
8245; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8246; GFX9-NEXT:    v_mov_b32_e32 v3, v0
8247; GFX9-NEXT:    v_min_i32_e32 v2, s6, v3
8248; GFX9-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
8249; GFX9-NEXT:    s_waitcnt vmcnt(0)
8250; GFX9-NEXT:    buffer_wbinvl1_vol
8251; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
8252; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8253; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8254; GFX9-NEXT:    s_cbranch_execnz .LBB131_1
8255; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8256; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
8257; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8258; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
8259; GFX9-NEXT:    s_endpgm
8260entry:
8261  %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
8262  %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
8263  store i32 %tmp0, ptr addrspace(1) %out2
8264  ret void
8265}
8266
; Non-returning seq_cst `atomicrmw min` at %out + 4 x i32 (16 bytes), tagged
; with !amdgpu.no.remote.memory (!0 is defined outside this part of the file).
; With this metadata the checks still expect a cmpswap retry loop
; (.LBB132_1) on SI, VI and GFX9. SI and GFX9 carry the 16-byte displacement
; as `offset:16` on the load and the cmpswap; VI adds 16 to the address with
; v_add_u32/v_addc_u32 before the loop.
8267define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
8268; SI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
8269; SI:       ; %bb.0:
8270; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8271; SI-NEXT:    s_mov_b32 s6, 0
8272; SI-NEXT:    s_mov_b32 s7, 0xf000
8273; SI-NEXT:    s_mov_b32 s4, s6
8274; SI-NEXT:    s_mov_b32 s5, s6
8275; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
8276; SI-NEXT:    s_mov_b64 s[8:9], 0
8277; SI-NEXT:  .LBB132_1: ; %atomicrmw.start
8278; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8279; SI-NEXT:    s_waitcnt vmcnt(0)
8280; SI-NEXT:    v_min_i32_e32 v3, v4, v2
8281; SI-NEXT:    s_waitcnt expcnt(0)
8282; SI-NEXT:    v_mov_b32_e32 v6, v4
8283; SI-NEXT:    v_mov_b32_e32 v5, v3
8284; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
8285; SI-NEXT:    s_waitcnt vmcnt(0)
8286; SI-NEXT:    buffer_wbinvl1
8287; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
8288; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
8289; SI-NEXT:    v_mov_b32_e32 v4, v5
8290; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
8291; SI-NEXT:    s_cbranch_execnz .LBB132_1
8292; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8293; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
8294; SI-NEXT:    s_waitcnt expcnt(0)
8295; SI-NEXT:    s_setpc_b64 s[30:31]
8296;
8297; VI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
8298; VI:       ; %bb.0:
8299; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8300; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
8301; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8302; VI-NEXT:    flat_load_dword v4, v[0:1]
8303; VI-NEXT:    s_mov_b64 s[4:5], 0
8304; VI-NEXT:  .LBB132_1: ; %atomicrmw.start
8305; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8306; VI-NEXT:    s_waitcnt vmcnt(0)
8307; VI-NEXT:    v_min_i32_e32 v3, v4, v2
8308; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
8309; VI-NEXT:    s_waitcnt vmcnt(0)
8310; VI-NEXT:    buffer_wbinvl1_vol
8311; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
8312; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8313; VI-NEXT:    v_mov_b32_e32 v4, v3
8314; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8315; VI-NEXT:    s_cbranch_execnz .LBB132_1
8316; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8317; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
8318; VI-NEXT:    s_setpc_b64 s[30:31]
8319;
8320; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
8321; GFX9:       ; %bb.0:
8322; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8323; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
8324; GFX9-NEXT:    s_mov_b64 s[4:5], 0
8325; GFX9-NEXT:  .LBB132_1: ; %atomicrmw.start
8326; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8327; GFX9-NEXT:    s_waitcnt vmcnt(0)
8328; GFX9-NEXT:    v_min_i32_e32 v3, v4, v2
8329; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
8330; GFX9-NEXT:    s_waitcnt vmcnt(0)
8331; GFX9-NEXT:    buffer_wbinvl1_vol
8332; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
8333; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8334; GFX9-NEXT:    v_mov_b32_e32 v4, v3
8335; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8336; GFX9-NEXT:    s_cbranch_execnz .LBB132_1
8337; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8338; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
8339; GFX9-NEXT:    s_setpc_b64 s[30:31]
8340  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
8341  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
8342  ret void
8343}
8344
; Returning seq_cst `atomicrmw min` at %out + 4 x i32 (16 bytes), tagged with
; !amdgpu.no.remote.memory (!0 is defined outside this part of the file).
; The checks expect a cmpswap retry loop (.LBB133_1) on SI, VI and GFX9.
; After the loop SI and GFX9 copy the old value from v3 into v0 for the
; return; on VI the cmpswap already writes its result to v0. VI also
; materialises the +16 address in v[3:4] instead of using an `offset:` field.
8345define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
8346; SI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
8347; SI:       ; %bb.0:
8348; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8349; SI-NEXT:    s_mov_b32 s6, 0
8350; SI-NEXT:    s_mov_b32 s7, 0xf000
8351; SI-NEXT:    s_mov_b32 s4, s6
8352; SI-NEXT:    s_mov_b32 s5, s6
8353; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
8354; SI-NEXT:    s_mov_b64 s[8:9], 0
8355; SI-NEXT:  .LBB133_1: ; %atomicrmw.start
8356; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8357; SI-NEXT:    s_waitcnt vmcnt(0)
8358; SI-NEXT:    v_mov_b32_e32 v5, v3
8359; SI-NEXT:    s_waitcnt expcnt(0)
8360; SI-NEXT:    v_min_i32_e32 v4, v5, v2
8361; SI-NEXT:    v_mov_b32_e32 v3, v4
8362; SI-NEXT:    v_mov_b32_e32 v4, v5
8363; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
8364; SI-NEXT:    s_waitcnt vmcnt(0)
8365; SI-NEXT:    buffer_wbinvl1
8366; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
8367; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
8368; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
8369; SI-NEXT:    s_cbranch_execnz .LBB133_1
8370; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8371; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
8372; SI-NEXT:    v_mov_b32_e32 v0, v3
8373; SI-NEXT:    s_waitcnt expcnt(0)
8374; SI-NEXT:    s_setpc_b64 s[30:31]
8375;
8376; VI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
8377; VI:       ; %bb.0:
8378; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8379; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
8380; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
8381; VI-NEXT:    flat_load_dword v0, v[3:4]
8382; VI-NEXT:    s_mov_b64 s[4:5], 0
8383; VI-NEXT:  .LBB133_1: ; %atomicrmw.start
8384; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8385; VI-NEXT:    s_waitcnt vmcnt(0)
8386; VI-NEXT:    v_mov_b32_e32 v1, v0
8387; VI-NEXT:    v_min_i32_e32 v0, v1, v2
8388; VI-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
8389; VI-NEXT:    s_waitcnt vmcnt(0)
8390; VI-NEXT:    buffer_wbinvl1_vol
8391; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
8392; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8393; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8394; VI-NEXT:    s_cbranch_execnz .LBB133_1
8395; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8396; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
8397; VI-NEXT:    s_setpc_b64 s[30:31]
8398;
8399; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
8400; GFX9:       ; %bb.0:
8401; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8402; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
8403; GFX9-NEXT:    s_mov_b64 s[4:5], 0
8404; GFX9-NEXT:  .LBB133_1: ; %atomicrmw.start
8405; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8406; GFX9-NEXT:    s_waitcnt vmcnt(0)
8407; GFX9-NEXT:    v_mov_b32_e32 v4, v3
8408; GFX9-NEXT:    v_min_i32_e32 v3, v4, v2
8409; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
8410; GFX9-NEXT:    s_waitcnt vmcnt(0)
8411; GFX9-NEXT:    buffer_wbinvl1_vol
8412; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
8413; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8414; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8415; GFX9-NEXT:    s_cbranch_execnz .LBB133_1
8416; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8417; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
8418; GFX9-NEXT:    v_mov_b32_e32 v0, v3
8419; GFX9-NEXT:    s_setpc_b64 s[30:31]
8420  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
8421  %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
8422  ret i32 %result
8423}
8424
8425; ---------------------------------------------------------------------
8426; atomicrmw uinc_wrap
8427; ---------------------------------------------------------------------
8428
; Non-returning seq_cst `atomicrmw uinc_wrap` directly on %ptr (VGPR
; arguments, no offset). The checks expect a single hardware increment
; atomic with no retry loop: buffer_atomic_inc (SI, addr64),
; flat_atomic_inc (VI), global_atomic_inc (GFX9), each followed by
; s_waitcnt vmcnt(0) and a buffer_wbinvl1 / buffer_wbinvl1_vol.
8429define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
8430; SI-LABEL: global_atomic_uinc_wrap_i32_noret:
8431; SI:       ; %bb.0:
8432; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8433; SI-NEXT:    s_mov_b32 s6, 0
8434; SI-NEXT:    s_mov_b32 s7, 0xf000
8435; SI-NEXT:    s_mov_b32 s4, s6
8436; SI-NEXT:    s_mov_b32 s5, s6
8437; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64
8438; SI-NEXT:    s_waitcnt vmcnt(0)
8439; SI-NEXT:    buffer_wbinvl1
8440; SI-NEXT:    s_waitcnt expcnt(0)
8441; SI-NEXT:    s_setpc_b64 s[30:31]
8442;
8443; VI-LABEL: global_atomic_uinc_wrap_i32_noret:
8444; VI:       ; %bb.0:
8445; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8446; VI-NEXT:    flat_atomic_inc v[0:1], v2
8447; VI-NEXT:    s_waitcnt vmcnt(0)
8448; VI-NEXT:    buffer_wbinvl1_vol
8449; VI-NEXT:    s_setpc_b64 s[30:31]
8450;
8451; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret:
8452; GFX9:       ; %bb.0:
8453; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8454; GFX9-NEXT:    global_atomic_inc v[0:1], v2, off
8455; GFX9-NEXT:    s_waitcnt vmcnt(0)
8456; GFX9-NEXT:    buffer_wbinvl1_vol
8457; GFX9-NEXT:    s_setpc_b64 s[30:31]
8458  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8459  ret void
8460}
8461
; Non-returning seq_cst `atomicrmw uinc_wrap` at %out + 4 x i32 (16 bytes).
; The checks expect a single increment atomic with no retry loop. SI and
; GFX9 encode the displacement as `offset:16` on the atomic; VI first adds
; 16 to the 64-bit address with v_add_u32/v_addc_u32 and then issues
; flat_atomic_inc without an offset.
8462define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
8463; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
8464; SI:       ; %bb.0:
8465; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8466; SI-NEXT:    s_mov_b32 s6, 0
8467; SI-NEXT:    s_mov_b32 s7, 0xf000
8468; SI-NEXT:    s_mov_b32 s4, s6
8469; SI-NEXT:    s_mov_b32 s5, s6
8470; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16
8471; SI-NEXT:    s_waitcnt vmcnt(0)
8472; SI-NEXT:    buffer_wbinvl1
8473; SI-NEXT:    s_waitcnt expcnt(0)
8474; SI-NEXT:    s_setpc_b64 s[30:31]
8475;
8476; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
8477; VI:       ; %bb.0:
8478; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8479; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
8480; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8481; VI-NEXT:    flat_atomic_inc v[0:1], v2
8482; VI-NEXT:    s_waitcnt vmcnt(0)
8483; VI-NEXT:    buffer_wbinvl1_vol
8484; VI-NEXT:    s_setpc_b64 s[30:31]
8485;
8486; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
8487; GFX9:       ; %bb.0:
8488; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8489; GFX9-NEXT:    global_atomic_inc v[0:1], v2, off offset:16
8490; GFX9-NEXT:    s_waitcnt vmcnt(0)
8491; GFX9-NEXT:    buffer_wbinvl1_vol
8492; GFX9-NEXT:    s_setpc_b64 s[30:31]
8493  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8494  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8495  ret void
8496}
8497
; Returning seq_cst `atomicrmw uinc_wrap` directly on %ptr; the old value is
; the function result. The checks expect a single increment atomic carrying
; the `glc` modifier and no retry loop. On SI the atomic's data register is
; v2, which is then copied to v0 for the return; on VI and GFX9 the atomic
; writes its result straight into v0.
8498define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
8499; SI-LABEL: global_atomic_uinc_wrap_i32_ret:
8500; SI:       ; %bb.0:
8501; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8502; SI-NEXT:    s_mov_b32 s6, 0
8503; SI-NEXT:    s_mov_b32 s7, 0xf000
8504; SI-NEXT:    s_mov_b32 s4, s6
8505; SI-NEXT:    s_mov_b32 s5, s6
8506; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 glc
8507; SI-NEXT:    s_waitcnt vmcnt(0)
8508; SI-NEXT:    buffer_wbinvl1
8509; SI-NEXT:    v_mov_b32_e32 v0, v2
8510; SI-NEXT:    s_waitcnt expcnt(0)
8511; SI-NEXT:    s_setpc_b64 s[30:31]
8512;
8513; VI-LABEL: global_atomic_uinc_wrap_i32_ret:
8514; VI:       ; %bb.0:
8515; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8516; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
8517; VI-NEXT:    s_waitcnt vmcnt(0)
8518; VI-NEXT:    buffer_wbinvl1_vol
8519; VI-NEXT:    s_setpc_b64 s[30:31]
8520;
8521; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret:
8522; GFX9:       ; %bb.0:
8523; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8524; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v2, off glc
8525; GFX9-NEXT:    s_waitcnt vmcnt(0)
8526; GFX9-NEXT:    buffer_wbinvl1_vol
8527; GFX9-NEXT:    s_setpc_b64 s[30:31]
8528  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8529  ret i32 %result
8530}
8531
; Returning seq_cst `atomicrmw uinc_wrap` at %out + 4 x i32 (16 bytes); the
; old value is the function result. The checks expect a single increment
; atomic with `glc` and no retry loop. SI and GFX9 encode the displacement
; as `offset:16`; VI adds 16 to the address with v_add_u32/v_addc_u32 first.
; SI copies the result from v2 to v0; VI and GFX9 receive it in v0 directly.
8532define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
8533; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
8534; SI:       ; %bb.0:
8535; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8536; SI-NEXT:    s_mov_b32 s6, 0
8537; SI-NEXT:    s_mov_b32 s7, 0xf000
8538; SI-NEXT:    s_mov_b32 s4, s6
8539; SI-NEXT:    s_mov_b32 s5, s6
8540; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
8541; SI-NEXT:    s_waitcnt vmcnt(0)
8542; SI-NEXT:    buffer_wbinvl1
8543; SI-NEXT:    v_mov_b32_e32 v0, v2
8544; SI-NEXT:    s_waitcnt expcnt(0)
8545; SI-NEXT:    s_setpc_b64 s[30:31]
8546;
8547; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
8548; VI:       ; %bb.0:
8549; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8550; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
8551; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8552; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
8553; VI-NEXT:    s_waitcnt vmcnt(0)
8554; VI-NEXT:    buffer_wbinvl1_vol
8555; VI-NEXT:    s_setpc_b64 s[30:31]
8556;
8557; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
8558; GFX9:       ; %bb.0:
8559; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8560; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v2, off offset:16 glc
8561; GFX9-NEXT:    s_waitcnt vmcnt(0)
8562; GFX9-NEXT:    buffer_wbinvl1_vol
8563; GFX9-NEXT:    s_setpc_b64 s[30:31]
8564  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8565  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8566  ret i32 %result
8567}
8568
; Non-returning seq_cst `atomicrmw uinc_wrap` with `inreg` arguments under
; the amdgpu_gfx calling convention; in the checks the pointer arrives in
; s[4:5] and %in in s6.
; - SI: s6/s7 are overwritten to complete the s[4:7] operand of
;   buffer_atomic_inc, so their incoming values are first saved into lanes
;   0/1 of v1 (v1 itself is spilled to the stack around this) and restored
;   with v_readlane before returning; %in is moved through s34 into v0.
; - VI: pointer and value are copied to VGPRs for flat_atomic_inc.
; - GFX9: global_atomic_inc uses s[4:5] as the scalar base with v0 = 0.
8569define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
8570; SI-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
8571; SI:       ; %bb.0:
8572; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8573; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8574; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8575; SI-NEXT:    s_mov_b64 exec, s[34:35]
8576; SI-NEXT:    s_waitcnt expcnt(0)
8577; SI-NEXT:    v_writelane_b32 v1, s6, 0
8578; SI-NEXT:    v_writelane_b32 v1, s7, 1
8579; SI-NEXT:    s_mov_b32 s34, s6
8580; SI-NEXT:    s_mov_b32 s7, 0xf000
8581; SI-NEXT:    s_mov_b32 s6, -1
8582; SI-NEXT:    v_mov_b32_e32 v0, s34
8583; SI-NEXT:    s_waitcnt vmcnt(0)
8584; SI-NEXT:    buffer_atomic_inc v0, off, s[4:7], 0
8585; SI-NEXT:    s_waitcnt vmcnt(0)
8586; SI-NEXT:    buffer_wbinvl1
8587; SI-NEXT:    v_readlane_b32 s7, v1, 1
8588; SI-NEXT:    v_readlane_b32 s6, v1, 0
8589; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8590; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8591; SI-NEXT:    s_mov_b64 exec, s[34:35]
8592; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
8593; SI-NEXT:    s_setpc_b64 s[30:31]
8594;
8595; VI-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
8596; VI:       ; %bb.0:
8597; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8598; VI-NEXT:    v_mov_b32_e32 v0, s4
8599; VI-NEXT:    v_mov_b32_e32 v1, s5
8600; VI-NEXT:    v_mov_b32_e32 v2, s6
8601; VI-NEXT:    flat_atomic_inc v[0:1], v2
8602; VI-NEXT:    s_waitcnt vmcnt(0)
8603; VI-NEXT:    buffer_wbinvl1_vol
8604; VI-NEXT:    s_setpc_b64 s[30:31]
8605;
8606; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
8607; GFX9:       ; %bb.0:
8608; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8609; GFX9-NEXT:    v_mov_b32_e32 v0, 0
8610; GFX9-NEXT:    v_mov_b32_e32 v1, s6
8611; GFX9-NEXT:    global_atomic_inc v0, v1, s[4:5]
8612; GFX9-NEXT:    s_waitcnt vmcnt(0)
8613; GFX9-NEXT:    buffer_wbinvl1_vol
8614; GFX9-NEXT:    s_setpc_b64 s[30:31]
8615  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8616  ret void
8617}
8618
; Non-returning seq_cst `atomicrmw uinc_wrap` at %out + 4 x i32 (16 bytes)
; with `inreg` arguments under the amdgpu_gfx calling convention; in the
; checks the pointer arrives in s[4:5] and %in in s6.
; - SI: same s6/s7 save/restore through lanes of v1 as needed to build the
;   s[4:7] operand; the displacement is `offset:16` on buffer_atomic_inc.
; - VI: the +16 is done in scalar registers (s_add_u32/s_addc_u32 into
;   s[34:35]) before copying the address to VGPRs for flat_atomic_inc.
; - GFX9: global_atomic_inc uses s[4:5] as the scalar base with `offset:16`.
8619define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
8620; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
8621; SI:       ; %bb.0:
8622; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8623; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8624; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8625; SI-NEXT:    s_mov_b64 exec, s[34:35]
8626; SI-NEXT:    s_waitcnt expcnt(0)
8627; SI-NEXT:    v_writelane_b32 v1, s6, 0
8628; SI-NEXT:    v_writelane_b32 v1, s7, 1
8629; SI-NEXT:    s_mov_b32 s34, s6
8630; SI-NEXT:    s_mov_b32 s7, 0xf000
8631; SI-NEXT:    s_mov_b32 s6, -1
8632; SI-NEXT:    v_mov_b32_e32 v0, s34
8633; SI-NEXT:    s_waitcnt vmcnt(0)
8634; SI-NEXT:    buffer_atomic_inc v0, off, s[4:7], 0 offset:16
8635; SI-NEXT:    s_waitcnt vmcnt(0)
8636; SI-NEXT:    buffer_wbinvl1
8637; SI-NEXT:    v_readlane_b32 s7, v1, 1
8638; SI-NEXT:    v_readlane_b32 s6, v1, 0
8639; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8640; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8641; SI-NEXT:    s_mov_b64 exec, s[34:35]
8642; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
8643; SI-NEXT:    s_setpc_b64 s[30:31]
8644;
8645; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
8646; VI:       ; %bb.0:
8647; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8648; VI-NEXT:    s_add_u32 s34, s4, 16
8649; VI-NEXT:    s_addc_u32 s35, s5, 0
8650; VI-NEXT:    v_mov_b32_e32 v0, s34
8651; VI-NEXT:    v_mov_b32_e32 v1, s35
8652; VI-NEXT:    v_mov_b32_e32 v2, s6
8653; VI-NEXT:    flat_atomic_inc v[0:1], v2
8654; VI-NEXT:    s_waitcnt vmcnt(0)
8655; VI-NEXT:    buffer_wbinvl1_vol
8656; VI-NEXT:    s_setpc_b64 s[30:31]
8657;
8658; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
8659; GFX9:       ; %bb.0:
8660; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8661; GFX9-NEXT:    v_mov_b32_e32 v0, 0
8662; GFX9-NEXT:    v_mov_b32_e32 v1, s6
8663; GFX9-NEXT:    global_atomic_inc v0, v1, s[4:5] offset:16
8664; GFX9-NEXT:    s_waitcnt vmcnt(0)
8665; GFX9-NEXT:    buffer_wbinvl1_vol
8666; GFX9-NEXT:    s_setpc_b64 s[30:31]
8667  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8668  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8669  ret void
8670}
8671
; Returning seq_cst `atomicrmw uinc_wrap` with `inreg` arguments under the
; amdgpu_gfx calling convention; in the checks the pointer arrives in
; s[4:5], %in in s6, and the old value is produced in v0.
; - SI: incoming s6/s7 are saved into lanes of v1 and restored afterwards
;   because s[4:7] is rebuilt as the buffer_atomic_inc operand; the atomic
;   carries `glc` and uses v0 as its data register.
; - VI: pointer and value are copied to VGPRs; flat_atomic_inc ... glc
;   writes v0.
; - GFX9: global_atomic_inc ... glc uses s[4:5] as the scalar base, with v0
;   serving as both the zero offset and the destination.
8672define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
8673; SI-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
8674; SI:       ; %bb.0:
8675; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8676; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8677; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8678; SI-NEXT:    s_mov_b64 exec, s[34:35]
8679; SI-NEXT:    s_waitcnt expcnt(0)
8680; SI-NEXT:    v_writelane_b32 v1, s6, 0
8681; SI-NEXT:    v_writelane_b32 v1, s7, 1
8682; SI-NEXT:    s_mov_b32 s34, s6
8683; SI-NEXT:    s_mov_b32 s7, 0xf000
8684; SI-NEXT:    s_mov_b32 s6, -1
8685; SI-NEXT:    v_mov_b32_e32 v0, s34
8686; SI-NEXT:    s_waitcnt vmcnt(0)
8687; SI-NEXT:    buffer_atomic_inc v0, off, s[4:7], 0 glc
8688; SI-NEXT:    s_waitcnt vmcnt(0)
8689; SI-NEXT:    buffer_wbinvl1
8690; SI-NEXT:    v_readlane_b32 s7, v1, 1
8691; SI-NEXT:    v_readlane_b32 s6, v1, 0
8692; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8693; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8694; SI-NEXT:    s_mov_b64 exec, s[34:35]
8695; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
8696; SI-NEXT:    s_setpc_b64 s[30:31]
8697;
8698; VI-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
8699; VI:       ; %bb.0:
8700; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8701; VI-NEXT:    v_mov_b32_e32 v0, s4
8702; VI-NEXT:    v_mov_b32_e32 v1, s5
8703; VI-NEXT:    v_mov_b32_e32 v2, s6
8704; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
8705; VI-NEXT:    s_waitcnt vmcnt(0)
8706; VI-NEXT:    buffer_wbinvl1_vol
8707; VI-NEXT:    s_setpc_b64 s[30:31]
8708;
8709; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
8710; GFX9:       ; %bb.0:
8711; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8712; GFX9-NEXT:    v_mov_b32_e32 v0, 0
8713; GFX9-NEXT:    v_mov_b32_e32 v1, s6
8714; GFX9-NEXT:    global_atomic_inc v0, v0, v1, s[4:5] glc
8715; GFX9-NEXT:    s_waitcnt vmcnt(0)
8716; GFX9-NEXT:    buffer_wbinvl1_vol
8717; GFX9-NEXT:    s_setpc_b64 s[30:31]
8718  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8719  ret i32 %result
8720}
8721
; Returning seq_cst `atomicrmw uinc_wrap` at %out + 4 x i32 (16 bytes) with
; `inreg` arguments under the amdgpu_gfx calling convention; in the checks
; the pointer arrives in s[4:5], %in in s6, and the old value is produced
; in v0.
; - SI: s6/s7 are saved/restored through lanes of v1 around rebuilding
;   s[4:7]; buffer_atomic_inc carries `offset:16 glc`.
; - VI: the +16 is added in scalar registers (s[34:35]) and the address is
;   then copied to VGPRs for flat_atomic_inc ... glc.
; - GFX9: global_atomic_inc uses s[4:5] as the scalar base with
;   `offset:16 glc`.
8722define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
8723; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
8724; SI:       ; %bb.0:
8725; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8726; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8727; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8728; SI-NEXT:    s_mov_b64 exec, s[34:35]
8729; SI-NEXT:    s_waitcnt expcnt(0)
8730; SI-NEXT:    v_writelane_b32 v1, s6, 0
8731; SI-NEXT:    v_writelane_b32 v1, s7, 1
8732; SI-NEXT:    s_mov_b32 s34, s6
8733; SI-NEXT:    s_mov_b32 s7, 0xf000
8734; SI-NEXT:    s_mov_b32 s6, -1
8735; SI-NEXT:    v_mov_b32_e32 v0, s34
8736; SI-NEXT:    s_waitcnt vmcnt(0)
8737; SI-NEXT:    buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc
8738; SI-NEXT:    s_waitcnt vmcnt(0)
8739; SI-NEXT:    buffer_wbinvl1
8740; SI-NEXT:    v_readlane_b32 s7, v1, 1
8741; SI-NEXT:    v_readlane_b32 s6, v1, 0
8742; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8743; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8744; SI-NEXT:    s_mov_b64 exec, s[34:35]
8745; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
8746; SI-NEXT:    s_setpc_b64 s[30:31]
8747;
8748; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
8749; VI:       ; %bb.0:
8750; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8751; VI-NEXT:    s_add_u32 s34, s4, 16
8752; VI-NEXT:    s_addc_u32 s35, s5, 0
8753; VI-NEXT:    v_mov_b32_e32 v0, s34
8754; VI-NEXT:    v_mov_b32_e32 v1, s35
8755; VI-NEXT:    v_mov_b32_e32 v2, s6
8756; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
8757; VI-NEXT:    s_waitcnt vmcnt(0)
8758; VI-NEXT:    buffer_wbinvl1_vol
8759; VI-NEXT:    s_setpc_b64 s[30:31]
8760;
8761; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
8762; GFX9:       ; %bb.0:
8763; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8764; GFX9-NEXT:    v_mov_b32_e32 v0, 0
8765; GFX9-NEXT:    v_mov_b32_e32 v1, s6
8766; GFX9-NEXT:    global_atomic_inc v0, v0, v1, s[4:5] offset:16 glc
8767; GFX9-NEXT:    s_waitcnt vmcnt(0)
8768; GFX9-NEXT:    buffer_wbinvl1_vol
8769; GFX9-NEXT:    s_setpc_b64 s[30:31]
8770  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8771  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8772  ret i32 %result
8773}
8774
; Non-returning seq_cst `atomicrmw uinc_wrap` at %out + 4 x i32 (16 bytes),
; tagged with !amdgpu.no.remote.memory (!0 is defined outside this part of
; the file). The checks expect a single increment atomic with no retry
; loop: `offset:16` on SI and GFX9, and an explicit v_add_u32/v_addc_u32
; address computation followed by flat_atomic_inc on VI.
8775define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
8776; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
8777; SI:       ; %bb.0:
8778; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8779; SI-NEXT:    s_mov_b32 s6, 0
8780; SI-NEXT:    s_mov_b32 s7, 0xf000
8781; SI-NEXT:    s_mov_b32 s4, s6
8782; SI-NEXT:    s_mov_b32 s5, s6
8783; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16
8784; SI-NEXT:    s_waitcnt vmcnt(0)
8785; SI-NEXT:    buffer_wbinvl1
8786; SI-NEXT:    s_waitcnt expcnt(0)
8787; SI-NEXT:    s_setpc_b64 s[30:31]
8788;
8789; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
8790; VI:       ; %bb.0:
8791; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8792; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
8793; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8794; VI-NEXT:    flat_atomic_inc v[0:1], v2
8795; VI-NEXT:    s_waitcnt vmcnt(0)
8796; VI-NEXT:    buffer_wbinvl1_vol
8797; VI-NEXT:    s_setpc_b64 s[30:31]
8798;
8799; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
8800; GFX9:       ; %bb.0:
8801; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8802; GFX9-NEXT:    global_atomic_inc v[0:1], v2, off offset:16
8803; GFX9-NEXT:    s_waitcnt vmcnt(0)
8804; GFX9-NEXT:    buffer_wbinvl1_vol
8805; GFX9-NEXT:    s_setpc_b64 s[30:31]
8806  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
8807  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
8808  ret void
8809}
8810
; Returning seq_cst `atomicrmw uinc_wrap` at %out + 4 x i32 (16 bytes),
; tagged with !amdgpu.no.remote.memory (!0 is defined outside this part of
; the file); the old value is the function result. The checks expect a
; single increment atomic with `glc` and no retry loop: `offset:16` on SI
; and GFX9, explicit address arithmetic on VI. SI copies the result from v2
; to v0; VI and GFX9 receive it in v0 directly.
8811define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
8812; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
8813; SI:       ; %bb.0:
8814; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8815; SI-NEXT:    s_mov_b32 s6, 0
8816; SI-NEXT:    s_mov_b32 s7, 0xf000
8817; SI-NEXT:    s_mov_b32 s4, s6
8818; SI-NEXT:    s_mov_b32 s5, s6
8819; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
8820; SI-NEXT:    s_waitcnt vmcnt(0)
8821; SI-NEXT:    buffer_wbinvl1
8822; SI-NEXT:    v_mov_b32_e32 v0, v2
8823; SI-NEXT:    s_waitcnt expcnt(0)
8824; SI-NEXT:    s_setpc_b64 s[30:31]
8825;
8826; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
8827; VI:       ; %bb.0:
8828; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8829; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
8830; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8831; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
8832; VI-NEXT:    s_waitcnt vmcnt(0)
8833; VI-NEXT:    buffer_wbinvl1_vol
8834; VI-NEXT:    s_setpc_b64 s[30:31]
8835;
8836; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
8837; GFX9:       ; %bb.0:
8838; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8839; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v2, off offset:16 glc
8840; GFX9-NEXT:    s_waitcnt vmcnt(0)
8841; GFX9-NEXT:    buffer_wbinvl1_vol
8842; GFX9-NEXT:    s_setpc_b64 s[30:31]
8843  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
8844  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
8845  ret i32 %result
8846}
8847
8848; ---------------------------------------------------------------------
8849; atomicrmw udec_wrap
8850; ---------------------------------------------------------------------
8851
; Non-returning seq_cst `atomicrmw udec_wrap` directly on %ptr (VGPR
; arguments, no offset). The checks expect a single hardware decrement
; atomic with no retry loop: buffer_atomic_dec (SI, addr64),
; flat_atomic_dec (VI), global_atomic_dec (GFX9), each followed by
; s_waitcnt vmcnt(0) and a buffer_wbinvl1 / buffer_wbinvl1_vol.
8852define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
8853; SI-LABEL: global_atomic_udec_wrap_i32_noret:
8854; SI:       ; %bb.0:
8855; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8856; SI-NEXT:    s_mov_b32 s6, 0
8857; SI-NEXT:    s_mov_b32 s7, 0xf000
8858; SI-NEXT:    s_mov_b32 s4, s6
8859; SI-NEXT:    s_mov_b32 s5, s6
8860; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64
8861; SI-NEXT:    s_waitcnt vmcnt(0)
8862; SI-NEXT:    buffer_wbinvl1
8863; SI-NEXT:    s_waitcnt expcnt(0)
8864; SI-NEXT:    s_setpc_b64 s[30:31]
8865;
8866; VI-LABEL: global_atomic_udec_wrap_i32_noret:
8867; VI:       ; %bb.0:
8868; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8869; VI-NEXT:    flat_atomic_dec v[0:1], v2
8870; VI-NEXT:    s_waitcnt vmcnt(0)
8871; VI-NEXT:    buffer_wbinvl1_vol
8872; VI-NEXT:    s_setpc_b64 s[30:31]
8873;
8874; GFX9-LABEL: global_atomic_udec_wrap_i32_noret:
8875; GFX9:       ; %bb.0:
8876; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8877; GFX9-NEXT:    global_atomic_dec v[0:1], v2, off
8878; GFX9-NEXT:    s_waitcnt vmcnt(0)
8879; GFX9-NEXT:    buffer_wbinvl1_vol
8880; GFX9-NEXT:    s_setpc_b64 s[30:31]
8881  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8882  ret void
8883}
8884
; Non-returning seq_cst `atomicrmw udec_wrap` at %out + 4 x i32 (16 bytes).
; The checks expect a single decrement atomic with no retry loop. SI and
; GFX9 encode the displacement as `offset:16` on the atomic; VI first adds
; 16 to the 64-bit address with v_add_u32/v_addc_u32 and then issues
; flat_atomic_dec without an offset.
8885define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
8886; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset:
8887; SI:       ; %bb.0:
8888; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8889; SI-NEXT:    s_mov_b32 s6, 0
8890; SI-NEXT:    s_mov_b32 s7, 0xf000
8891; SI-NEXT:    s_mov_b32 s4, s6
8892; SI-NEXT:    s_mov_b32 s5, s6
8893; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16
8894; SI-NEXT:    s_waitcnt vmcnt(0)
8895; SI-NEXT:    buffer_wbinvl1
8896; SI-NEXT:    s_waitcnt expcnt(0)
8897; SI-NEXT:    s_setpc_b64 s[30:31]
8898;
8899; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset:
8900; VI:       ; %bb.0:
8901; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8902; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
8903; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8904; VI-NEXT:    flat_atomic_dec v[0:1], v2
8905; VI-NEXT:    s_waitcnt vmcnt(0)
8906; VI-NEXT:    buffer_wbinvl1_vol
8907; VI-NEXT:    s_setpc_b64 s[30:31]
8908;
8909; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset:
8910; GFX9:       ; %bb.0:
8911; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8912; GFX9-NEXT:    global_atomic_dec v[0:1], v2, off offset:16
8913; GFX9-NEXT:    s_waitcnt vmcnt(0)
8914; GFX9-NEXT:    buffer_wbinvl1_vol
8915; GFX9-NEXT:    s_setpc_b64 s[30:31]
8916  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8917  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8918  ret void
8919}
8920
; udec_wrap whose previous value is returned, on a global pointer with no
; offset. Every expected atomic instruction carries glc. The VI and GFX9 checks
; write the result straight into v0; the SI checks have the result come back in
; v2 and then copy it to v0 with v_mov_b32.
8921define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
8922; SI-LABEL: global_atomic_udec_wrap_i32_ret:
8923; SI:       ; %bb.0:
8924; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8925; SI-NEXT:    s_mov_b32 s6, 0
8926; SI-NEXT:    s_mov_b32 s7, 0xf000
8927; SI-NEXT:    s_mov_b32 s4, s6
8928; SI-NEXT:    s_mov_b32 s5, s6
8929; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 glc
8930; SI-NEXT:    s_waitcnt vmcnt(0)
8931; SI-NEXT:    buffer_wbinvl1
8932; SI-NEXT:    v_mov_b32_e32 v0, v2
8933; SI-NEXT:    s_waitcnt expcnt(0)
8934; SI-NEXT:    s_setpc_b64 s[30:31]
8935;
8936; VI-LABEL: global_atomic_udec_wrap_i32_ret:
8937; VI:       ; %bb.0:
8938; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8939; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
8940; VI-NEXT:    s_waitcnt vmcnt(0)
8941; VI-NEXT:    buffer_wbinvl1_vol
8942; VI-NEXT:    s_setpc_b64 s[30:31]
8943;
8944; GFX9-LABEL: global_atomic_udec_wrap_i32_ret:
8945; GFX9:       ; %bb.0:
8946; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8947; GFX9-NEXT:    global_atomic_dec v0, v[0:1], v2, off glc
8948; GFX9-NEXT:    s_waitcnt vmcnt(0)
8949; GFX9-NEXT:    buffer_wbinvl1_vol
8950; GFX9-NEXT:    s_setpc_b64 s[30:31]
8951  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8952  ret i32 %result
8953}
8954
; udec_wrap whose previous value is returned, applied 4 i32 elements (16 bytes)
; past %out. The checks expect "offset:16 glc" on the SI and GFX9 atomics; the
; VI checks add 16 to the address in v[0:1] first and then use flat_atomic_dec
; with glc only. On SI the result lands in v2 and is copied to v0.
8955define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
8956; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
8957; SI:       ; %bb.0:
8958; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8959; SI-NEXT:    s_mov_b32 s6, 0
8960; SI-NEXT:    s_mov_b32 s7, 0xf000
8961; SI-NEXT:    s_mov_b32 s4, s6
8962; SI-NEXT:    s_mov_b32 s5, s6
8963; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
8964; SI-NEXT:    s_waitcnt vmcnt(0)
8965; SI-NEXT:    buffer_wbinvl1
8966; SI-NEXT:    v_mov_b32_e32 v0, v2
8967; SI-NEXT:    s_waitcnt expcnt(0)
8968; SI-NEXT:    s_setpc_b64 s[30:31]
8969;
8970; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
8971; VI:       ; %bb.0:
8972; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8973; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
8974; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8975; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
8976; VI-NEXT:    s_waitcnt vmcnt(0)
8977; VI-NEXT:    buffer_wbinvl1_vol
8978; VI-NEXT:    s_setpc_b64 s[30:31]
8979;
8980; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset:
8981; GFX9:       ; %bb.0:
8982; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8983; GFX9-NEXT:    global_atomic_dec v0, v[0:1], v2, off offset:16 glc
8984; GFX9-NEXT:    s_waitcnt vmcnt(0)
8985; GFX9-NEXT:    buffer_wbinvl1_vol
8986; GFX9-NEXT:    s_setpc_b64 s[30:31]
8987  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8988  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8989  ret i32 %result
8990}
8991
; Scalar-argument form of the no-return test: amdgpu_gfx calling convention
; with both arguments marked inreg. In the checks the pointer is in s[4:5] and
; the operand in s6.
; - The SI checks overwrite s6/s7 to complete the s[4:7] operand of
;   buffer_atomic_dec, so they first stash the incoming s6/s7 in lanes of v1
;   (itself spilled to the stack) and restore them before returning.
; - The VI checks copy the three scalars into v0..v2 and use flat_atomic_dec.
; - The GFX9 checks use global_atomic_dec with s[4:5] as the base and a zeroed
;   v0 as the address operand.
8992define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
8993; SI-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
8994; SI:       ; %bb.0:
8995; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8996; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8997; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8998; SI-NEXT:    s_mov_b64 exec, s[34:35]
8999; SI-NEXT:    s_waitcnt expcnt(0)
9000; SI-NEXT:    v_writelane_b32 v1, s6, 0
9001; SI-NEXT:    v_writelane_b32 v1, s7, 1
9002; SI-NEXT:    s_mov_b32 s34, s6
9003; SI-NEXT:    s_mov_b32 s7, 0xf000
9004; SI-NEXT:    s_mov_b32 s6, -1
9005; SI-NEXT:    v_mov_b32_e32 v0, s34
9006; SI-NEXT:    s_waitcnt vmcnt(0)
9007; SI-NEXT:    buffer_atomic_dec v0, off, s[4:7], 0
9008; SI-NEXT:    s_waitcnt vmcnt(0)
9009; SI-NEXT:    buffer_wbinvl1
9010; SI-NEXT:    v_readlane_b32 s7, v1, 1
9011; SI-NEXT:    v_readlane_b32 s6, v1, 0
9012; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9013; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
9014; SI-NEXT:    s_mov_b64 exec, s[34:35]
9015; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9016; SI-NEXT:    s_setpc_b64 s[30:31]
9017;
9018; VI-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
9019; VI:       ; %bb.0:
9020; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9021; VI-NEXT:    v_mov_b32_e32 v0, s4
9022; VI-NEXT:    v_mov_b32_e32 v1, s5
9023; VI-NEXT:    v_mov_b32_e32 v2, s6
9024; VI-NEXT:    flat_atomic_dec v[0:1], v2
9025; VI-NEXT:    s_waitcnt vmcnt(0)
9026; VI-NEXT:    buffer_wbinvl1_vol
9027; VI-NEXT:    s_setpc_b64 s[30:31]
9028;
9029; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
9030; GFX9:       ; %bb.0:
9031; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9032; GFX9-NEXT:    v_mov_b32_e32 v0, 0
9033; GFX9-NEXT:    v_mov_b32_e32 v1, s6
9034; GFX9-NEXT:    global_atomic_dec v0, v1, s[4:5]
9035; GFX9-NEXT:    s_waitcnt vmcnt(0)
9036; GFX9-NEXT:    buffer_wbinvl1_vol
9037; GFX9-NEXT:    s_setpc_b64 s[30:31]
9038  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
9039  ret void
9040}
9041
; Scalar-argument (amdgpu_gfx, inreg) no-return test with the atomic applied
; 4 i32 elements (16 bytes) past %out.
; - The SI and GFX9 checks expect the 16 as an immediate "offset:16" on the
;   atomic instruction.
; - The VI checks compute the address with a scalar 64-bit add
;   (s_add_u32 / s_addc_u32 into s[34:35]) and then copy it to VGPRs for
;   flat_atomic_dec.
; - As in the other scalar SI checks, s6/s7 are saved in lanes of v1 and
;   restored around the buffer atomic.
9042define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
9043; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
9044; SI:       ; %bb.0:
9045; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9046; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9047; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
9048; SI-NEXT:    s_mov_b64 exec, s[34:35]
9049; SI-NEXT:    s_waitcnt expcnt(0)
9050; SI-NEXT:    v_writelane_b32 v1, s6, 0
9051; SI-NEXT:    v_writelane_b32 v1, s7, 1
9052; SI-NEXT:    s_mov_b32 s34, s6
9053; SI-NEXT:    s_mov_b32 s7, 0xf000
9054; SI-NEXT:    s_mov_b32 s6, -1
9055; SI-NEXT:    v_mov_b32_e32 v0, s34
9056; SI-NEXT:    s_waitcnt vmcnt(0)
9057; SI-NEXT:    buffer_atomic_dec v0, off, s[4:7], 0 offset:16
9058; SI-NEXT:    s_waitcnt vmcnt(0)
9059; SI-NEXT:    buffer_wbinvl1
9060; SI-NEXT:    v_readlane_b32 s7, v1, 1
9061; SI-NEXT:    v_readlane_b32 s6, v1, 0
9062; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9063; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
9064; SI-NEXT:    s_mov_b64 exec, s[34:35]
9065; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9066; SI-NEXT:    s_setpc_b64 s[30:31]
9067;
9068; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
9069; VI:       ; %bb.0:
9070; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9071; VI-NEXT:    s_add_u32 s34, s4, 16
9072; VI-NEXT:    s_addc_u32 s35, s5, 0
9073; VI-NEXT:    v_mov_b32_e32 v0, s34
9074; VI-NEXT:    v_mov_b32_e32 v1, s35
9075; VI-NEXT:    v_mov_b32_e32 v2, s6
9076; VI-NEXT:    flat_atomic_dec v[0:1], v2
9077; VI-NEXT:    s_waitcnt vmcnt(0)
9078; VI-NEXT:    buffer_wbinvl1_vol
9079; VI-NEXT:    s_setpc_b64 s[30:31]
9080;
9081; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
9082; GFX9:       ; %bb.0:
9083; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9084; GFX9-NEXT:    v_mov_b32_e32 v0, 0
9085; GFX9-NEXT:    v_mov_b32_e32 v1, s6
9086; GFX9-NEXT:    global_atomic_dec v0, v1, s[4:5] offset:16
9087; GFX9-NEXT:    s_waitcnt vmcnt(0)
9088; GFX9-NEXT:    buffer_wbinvl1_vol
9089; GFX9-NEXT:    s_setpc_b64 s[30:31]
9090  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
9091  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
9092  ret void
9093}
9094
; Scalar-argument (amdgpu_gfx, inreg) test that returns the previous value,
; with no offset. All expected atomics carry glc and write the result to v0.
; - The SI checks save s6/s7 in lanes of v1 and restore them, because they
;   overwrite s6/s7 to complete the s[4:7] operand of buffer_atomic_dec.
; - The VI checks copy s4, s5 and s6 into v0..v2 for flat_atomic_dec.
; - The GFX9 checks use s[4:5] as the base with a zeroed v0 address operand.
9095define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
9096; SI-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
9097; SI:       ; %bb.0:
9098; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9099; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9100; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
9101; SI-NEXT:    s_mov_b64 exec, s[34:35]
9102; SI-NEXT:    s_waitcnt expcnt(0)
9103; SI-NEXT:    v_writelane_b32 v1, s6, 0
9104; SI-NEXT:    v_writelane_b32 v1, s7, 1
9105; SI-NEXT:    s_mov_b32 s34, s6
9106; SI-NEXT:    s_mov_b32 s7, 0xf000
9107; SI-NEXT:    s_mov_b32 s6, -1
9108; SI-NEXT:    v_mov_b32_e32 v0, s34
9109; SI-NEXT:    s_waitcnt vmcnt(0)
9110; SI-NEXT:    buffer_atomic_dec v0, off, s[4:7], 0 glc
9111; SI-NEXT:    s_waitcnt vmcnt(0)
9112; SI-NEXT:    buffer_wbinvl1
9113; SI-NEXT:    v_readlane_b32 s7, v1, 1
9114; SI-NEXT:    v_readlane_b32 s6, v1, 0
9115; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9116; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
9117; SI-NEXT:    s_mov_b64 exec, s[34:35]
9118; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9119; SI-NEXT:    s_setpc_b64 s[30:31]
9120;
9121; VI-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
9122; VI:       ; %bb.0:
9123; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9124; VI-NEXT:    v_mov_b32_e32 v0, s4
9125; VI-NEXT:    v_mov_b32_e32 v1, s5
9126; VI-NEXT:    v_mov_b32_e32 v2, s6
9127; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
9128; VI-NEXT:    s_waitcnt vmcnt(0)
9129; VI-NEXT:    buffer_wbinvl1_vol
9130; VI-NEXT:    s_setpc_b64 s[30:31]
9131;
9132; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
9133; GFX9:       ; %bb.0:
9134; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9135; GFX9-NEXT:    v_mov_b32_e32 v0, 0
9136; GFX9-NEXT:    v_mov_b32_e32 v1, s6
9137; GFX9-NEXT:    global_atomic_dec v0, v0, v1, s[4:5] glc
9138; GFX9-NEXT:    s_waitcnt vmcnt(0)
9139; GFX9-NEXT:    buffer_wbinvl1_vol
9140; GFX9-NEXT:    s_setpc_b64 s[30:31]
9141  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
9142  ret i32 %result
9143}
9144
; Scalar-argument (amdgpu_gfx, inreg) test that returns the previous value,
; with the atomic applied 4 i32 elements (16 bytes) past %out. Every expected
; atomic writes v0 and carries glc.
; - The SI and GFX9 checks expect "offset:16 glc" on the atomic.
; - The VI checks form the address with s_add_u32 / s_addc_u32 into s[34:35]
;   before copying it to VGPRs for flat_atomic_dec.
; - The SI checks save and restore s6/s7 through lanes of v1 around the atomic.
9145define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
9146; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
9147; SI:       ; %bb.0:
9148; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9149; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9150; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
9151; SI-NEXT:    s_mov_b64 exec, s[34:35]
9152; SI-NEXT:    s_waitcnt expcnt(0)
9153; SI-NEXT:    v_writelane_b32 v1, s6, 0
9154; SI-NEXT:    v_writelane_b32 v1, s7, 1
9155; SI-NEXT:    s_mov_b32 s34, s6
9156; SI-NEXT:    s_mov_b32 s7, 0xf000
9157; SI-NEXT:    s_mov_b32 s6, -1
9158; SI-NEXT:    v_mov_b32_e32 v0, s34
9159; SI-NEXT:    s_waitcnt vmcnt(0)
9160; SI-NEXT:    buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc
9161; SI-NEXT:    s_waitcnt vmcnt(0)
9162; SI-NEXT:    buffer_wbinvl1
9163; SI-NEXT:    v_readlane_b32 s7, v1, 1
9164; SI-NEXT:    v_readlane_b32 s6, v1, 0
9165; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9166; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
9167; SI-NEXT:    s_mov_b64 exec, s[34:35]
9168; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9169; SI-NEXT:    s_setpc_b64 s[30:31]
9170;
9171; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
9172; VI:       ; %bb.0:
9173; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9174; VI-NEXT:    s_add_u32 s34, s4, 16
9175; VI-NEXT:    s_addc_u32 s35, s5, 0
9176; VI-NEXT:    v_mov_b32_e32 v0, s34
9177; VI-NEXT:    v_mov_b32_e32 v1, s35
9178; VI-NEXT:    v_mov_b32_e32 v2, s6
9179; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
9180; VI-NEXT:    s_waitcnt vmcnt(0)
9181; VI-NEXT:    buffer_wbinvl1_vol
9182; VI-NEXT:    s_setpc_b64 s[30:31]
9183;
9184; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
9185; GFX9:       ; %bb.0:
9186; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9187; GFX9-NEXT:    v_mov_b32_e32 v0, 0
9188; GFX9-NEXT:    v_mov_b32_e32 v1, s6
9189; GFX9-NEXT:    global_atomic_dec v0, v0, v1, s[4:5] offset:16 glc
9190; GFX9-NEXT:    s_waitcnt vmcnt(0)
9191; GFX9-NEXT:    buffer_wbinvl1_vol
9192; GFX9-NEXT:    s_setpc_b64 s[30:31]
9193  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
9194  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
9195  ret i32 %result
9196}
9197
; No-return udec_wrap at a 16-byte offset (4 x i32, indexed here with an i64)
; where the atomicrmw additionally carries !amdgpu.no.remote.memory metadata.
; The expected code in the checks below is the same instruction sequence as
; for @global_atomic_udec_wrap_i32_noret_offset, i.e. the metadata does not
; change the selected instructions on any of the three targets.
9198define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
9199; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
9200; SI:       ; %bb.0:
9201; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9202; SI-NEXT:    s_mov_b32 s6, 0
9203; SI-NEXT:    s_mov_b32 s7, 0xf000
9204; SI-NEXT:    s_mov_b32 s4, s6
9205; SI-NEXT:    s_mov_b32 s5, s6
9206; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16
9207; SI-NEXT:    s_waitcnt vmcnt(0)
9208; SI-NEXT:    buffer_wbinvl1
9209; SI-NEXT:    s_waitcnt expcnt(0)
9210; SI-NEXT:    s_setpc_b64 s[30:31]
9211;
9212; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
9213; VI:       ; %bb.0:
9214; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9215; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
9216; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9217; VI-NEXT:    flat_atomic_dec v[0:1], v2
9218; VI-NEXT:    s_waitcnt vmcnt(0)
9219; VI-NEXT:    buffer_wbinvl1_vol
9220; VI-NEXT:    s_setpc_b64 s[30:31]
9221;
9222; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
9223; GFX9:       ; %bb.0:
9224; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9225; GFX9-NEXT:    global_atomic_dec v[0:1], v2, off offset:16
9226; GFX9-NEXT:    s_waitcnt vmcnt(0)
9227; GFX9-NEXT:    buffer_wbinvl1_vol
9228; GFX9-NEXT:    s_setpc_b64 s[30:31]
9229  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
9230  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
9231  ret void
9232}
9233
; Value-returning udec_wrap at a 16-byte offset (4 x i32, indexed here with an
; i64) where the atomicrmw additionally carries !amdgpu.no.remote.memory
; metadata. The expected code in the checks below is the same instruction
; sequence as for @global_atomic_udec_wrap_i32_ret_offset: glc on every
; atomic, and on SI a copy of the result from v2 to v0.
9234define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
9235; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
9236; SI:       ; %bb.0:
9237; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9238; SI-NEXT:    s_mov_b32 s6, 0
9239; SI-NEXT:    s_mov_b32 s7, 0xf000
9240; SI-NEXT:    s_mov_b32 s4, s6
9241; SI-NEXT:    s_mov_b32 s5, s6
9242; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
9243; SI-NEXT:    s_waitcnt vmcnt(0)
9244; SI-NEXT:    buffer_wbinvl1
9245; SI-NEXT:    v_mov_b32_e32 v0, v2
9246; SI-NEXT:    s_waitcnt expcnt(0)
9247; SI-NEXT:    s_setpc_b64 s[30:31]
9248;
9249; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
9250; VI:       ; %bb.0:
9251; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9252; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
9253; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9254; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
9255; VI-NEXT:    s_waitcnt vmcnt(0)
9256; VI-NEXT:    buffer_wbinvl1_vol
9257; VI-NEXT:    s_setpc_b64 s[30:31]
9258;
9259; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
9260; GFX9:       ; %bb.0:
9261; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9262; GFX9-NEXT:    global_atomic_dec v0, v[0:1], v2, off offset:16 glc
9263; GFX9-NEXT:    s_waitcnt vmcnt(0)
9264; GFX9-NEXT:    buffer_wbinvl1_vol
9265; GFX9-NEXT:    s_setpc_b64 s[30:31]
9266  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
9267  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
9268  ret i32 %result
9269}
9270
; Empty metadata node; it is the operand of the !amdgpu.no.remote.memory
; attachments on the atomicrmw instructions in this file.
9271!0 = !{}
9272