xref: /llvm-project/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll (revision eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5
6; ---------------------------------------------------------------------
7; atomicrmw xchg
8; ---------------------------------------------------------------------
9
; Non-returning i64 atomicrmw xchg (seq_cst, no explicit syncscope) on a
; global (addrspace 1) pointer; %tmp0 is never used.
; Expected instruction per run line: buffer_atomic_swap_x2 with addr64 for the
; plain amdgcn run, flat_atomic_swap_x2 for tonga, global_atomic_swap_x2 for
; gfx900. None of them carries glc, and each is followed by s_waitcnt vmcnt(0)
; and a buffer_wbinvl1 / buffer_wbinvl1_vol.
10define void @global_atomic_xchg_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
11; SI-LABEL: global_atomic_xchg_i64_noret:
12; SI:       ; %bb.0:
13; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14; SI-NEXT:    s_mov_b32 s6, 0
15; SI-NEXT:    s_mov_b32 s7, 0xf000
16; SI-NEXT:    s_mov_b32 s4, s6
17; SI-NEXT:    s_mov_b32 s5, s6
18; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64
19; SI-NEXT:    s_waitcnt vmcnt(0)
20; SI-NEXT:    buffer_wbinvl1
21; SI-NEXT:    s_waitcnt expcnt(0)
22; SI-NEXT:    s_setpc_b64 s[30:31]
23;
24; VI-LABEL: global_atomic_xchg_i64_noret:
25; VI:       ; %bb.0:
26; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
28; VI-NEXT:    s_waitcnt vmcnt(0)
29; VI-NEXT:    buffer_wbinvl1_vol
30; VI-NEXT:    s_setpc_b64 s[30:31]
31;
32; GFX9-LABEL: global_atomic_xchg_i64_noret:
33; GFX9:       ; %bb.0:
34; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off
36; GFX9-NEXT:    s_waitcnt vmcnt(0)
37; GFX9-NEXT:    buffer_wbinvl1_vol
38; GFX9-NEXT:    s_setpc_b64 s[30:31]
39  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
40  ret void
41}
42
; Non-returning i64 atomicrmw xchg (seq_cst) at %out + 4 i64 elements, i.e. a
; 32-byte offset from the incoming global pointer.
; The plain amdgcn and gfx900 checks expect the offset folded into the atomic
; as offset:32; the tonga checks expect an explicit 64-bit add
; (v_add_u32 / v_addc_u32) ahead of an offset-less flat_atomic_swap_x2.
43define void @global_atomic_xchg_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
44; SI-LABEL: global_atomic_xchg_i64_noret_offset:
45; SI:       ; %bb.0:
46; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47; SI-NEXT:    s_mov_b32 s6, 0
48; SI-NEXT:    s_mov_b32 s7, 0xf000
49; SI-NEXT:    s_mov_b32 s4, s6
50; SI-NEXT:    s_mov_b32 s5, s6
51; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
52; SI-NEXT:    s_waitcnt vmcnt(0)
53; SI-NEXT:    buffer_wbinvl1
54; SI-NEXT:    s_waitcnt expcnt(0)
55; SI-NEXT:    s_setpc_b64 s[30:31]
56;
57; VI-LABEL: global_atomic_xchg_i64_noret_offset:
58; VI:       ; %bb.0:
59; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
61; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
62; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
63; VI-NEXT:    s_waitcnt vmcnt(0)
64; VI-NEXT:    buffer_wbinvl1_vol
65; VI-NEXT:    s_setpc_b64 s[30:31]
66;
67; GFX9-LABEL: global_atomic_xchg_i64_noret_offset:
68; GFX9:       ; %bb.0:
69; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off offset:32
71; GFX9-NEXT:    s_waitcnt vmcnt(0)
72; GFX9-NEXT:    buffer_wbinvl1_vol
73; GFX9-NEXT:    s_setpc_b64 s[30:31]
74  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
75  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
76  ret void
77}
78
; Returning i64 atomicrmw xchg (seq_cst) on a global pointer; the previous
; memory value is the function result.
; Every expected atomic carries glc. The plain amdgcn checks receive the old
; value in v[2:3] and copy it to v[0:1] with two v_mov_b32; the tonga and
; gfx900 checks write the result straight into v[0:1].
79define i64 @global_atomic_xchg_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
80; SI-LABEL: global_atomic_xchg_i64_ret:
81; SI:       ; %bb.0:
82; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; SI-NEXT:    s_mov_b32 s6, 0
84; SI-NEXT:    s_mov_b32 s7, 0xf000
85; SI-NEXT:    s_mov_b32 s4, s6
86; SI-NEXT:    s_mov_b32 s5, s6
87; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
88; SI-NEXT:    s_waitcnt vmcnt(0)
89; SI-NEXT:    buffer_wbinvl1
90; SI-NEXT:    v_mov_b32_e32 v0, v2
91; SI-NEXT:    v_mov_b32_e32 v1, v3
92; SI-NEXT:    s_waitcnt expcnt(0)
93; SI-NEXT:    s_setpc_b64 s[30:31]
94;
95; VI-LABEL: global_atomic_xchg_i64_ret:
96; VI:       ; %bb.0:
97; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
99; VI-NEXT:    s_waitcnt vmcnt(0)
100; VI-NEXT:    buffer_wbinvl1_vol
101; VI-NEXT:    s_setpc_b64 s[30:31]
102;
103; GFX9-LABEL: global_atomic_xchg_i64_ret:
104; GFX9:       ; %bb.0:
105; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
107; GFX9-NEXT:    s_waitcnt vmcnt(0)
108; GFX9-NEXT:    buffer_wbinvl1_vol
109; GFX9-NEXT:    s_setpc_b64 s[30:31]
110  %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
111  ret i64 %result
112}
113
; Returning i64 atomicrmw xchg (seq_cst) at %out + 4 i64 elements (32 bytes);
; the previous memory value is returned.
; Combines the two earlier patterns: glc on every expected atomic, offset:32
; folded for the plain amdgcn and gfx900 runs, and an explicit
; v_add_u32 / v_addc_u32 address computation for tonga.
114define i64 @global_atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
115; SI-LABEL: global_atomic_xchg_i64_ret_offset:
116; SI:       ; %bb.0:
117; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118; SI-NEXT:    s_mov_b32 s6, 0
119; SI-NEXT:    s_mov_b32 s7, 0xf000
120; SI-NEXT:    s_mov_b32 s4, s6
121; SI-NEXT:    s_mov_b32 s5, s6
122; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
123; SI-NEXT:    s_waitcnt vmcnt(0)
124; SI-NEXT:    buffer_wbinvl1
125; SI-NEXT:    v_mov_b32_e32 v0, v2
126; SI-NEXT:    v_mov_b32_e32 v1, v3
127; SI-NEXT:    s_waitcnt expcnt(0)
128; SI-NEXT:    s_setpc_b64 s[30:31]
129;
130; VI-LABEL: global_atomic_xchg_i64_ret_offset:
131; VI:       ; %bb.0:
132; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
133; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
134; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
135; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
136; VI-NEXT:    s_waitcnt vmcnt(0)
137; VI-NEXT:    buffer_wbinvl1_vol
138; VI-NEXT:    s_setpc_b64 s[30:31]
139;
140; GFX9-LABEL: global_atomic_xchg_i64_ret_offset:
141; GFX9:       ; %bb.0:
142; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
144; GFX9-NEXT:    s_waitcnt vmcnt(0)
145; GFX9-NEXT:    buffer_wbinvl1_vol
146; GFX9-NEXT:    s_setpc_b64 s[30:31]
147  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
148  %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
149  ret i64 %result
150}
151
; Non-returning i64 atomicrmw xchg (seq_cst) with both arguments marked inreg
; under the amdgpu_gfx calling convention; the checks show the pointer in
; s[4:5] and the value in s[6:7].
; Plain amdgcn checks: s[4:7] is used as the buffer resource, so s6/s7 are
; saved into lanes of v2 (itself spilled to the stack), overwritten with
; -1 / 0xf000, and restored with v_readlane_b32 afterwards.
; tonga checks: pointer and value are copied into VGPRs for flat_atomic_swap_x2.
; gfx900 checks: s[4:5] is used directly as the scalar base with a zero VGPR
; offset (v2 = 0).
152define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
153; SI-LABEL: global_atomic_xchg_i64_noret_scalar:
154; SI:       ; %bb.0:
155; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
157; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
158; SI-NEXT:    s_mov_b64 exec, s[34:35]
159; SI-NEXT:    s_waitcnt expcnt(0)
160; SI-NEXT:    v_writelane_b32 v2, s6, 0
161; SI-NEXT:    v_writelane_b32 v2, s7, 1
162; SI-NEXT:    s_mov_b32 s34, s7
163; SI-NEXT:    s_mov_b32 s35, s6
164; SI-NEXT:    s_mov_b32 s7, 0xf000
165; SI-NEXT:    s_mov_b32 s6, -1
166; SI-NEXT:    v_mov_b32_e32 v0, s35
167; SI-NEXT:    v_mov_b32_e32 v1, s34
168; SI-NEXT:    s_waitcnt vmcnt(0)
169; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
170; SI-NEXT:    s_waitcnt vmcnt(0)
171; SI-NEXT:    buffer_wbinvl1
172; SI-NEXT:    v_readlane_b32 s7, v2, 1
173; SI-NEXT:    v_readlane_b32 s6, v2, 0
174; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
175; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
176; SI-NEXT:    s_mov_b64 exec, s[34:35]
177; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
178; SI-NEXT:    s_setpc_b64 s[30:31]
179;
180; VI-LABEL: global_atomic_xchg_i64_noret_scalar:
181; VI:       ; %bb.0:
182; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183; VI-NEXT:    v_mov_b32_e32 v0, s6
184; VI-NEXT:    v_mov_b32_e32 v1, s7
185; VI-NEXT:    v_mov_b32_e32 v2, s4
186; VI-NEXT:    v_mov_b32_e32 v3, s5
187; VI-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
188; VI-NEXT:    s_waitcnt vmcnt(0)
189; VI-NEXT:    buffer_wbinvl1_vol
190; VI-NEXT:    s_setpc_b64 s[30:31]
191;
192; GFX9-LABEL: global_atomic_xchg_i64_noret_scalar:
193; GFX9:       ; %bb.0:
194; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195; GFX9-NEXT:    v_mov_b32_e32 v0, s6
196; GFX9-NEXT:    v_mov_b32_e32 v1, s7
197; GFX9-NEXT:    v_mov_b32_e32 v2, 0
198; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5]
199; GFX9-NEXT:    s_waitcnt vmcnt(0)
200; GFX9-NEXT:    buffer_wbinvl1_vol
201; GFX9-NEXT:    s_setpc_b64 s[30:31]
202  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
203  ret void
204}
205
; Non-returning i64 atomicrmw xchg (seq_cst) at %out + 4 i64 elements
; (32 bytes), with inreg arguments under the amdgpu_gfx calling convention.
; Plain amdgcn checks: same s6/s7 save-and-restore through v2 as the
; offset-less scalar case, with offset:32 on the buffer atomic.
; tonga checks: the 32-byte offset is added on the scalar unit
; (s_add_u32 / s_addc_u32 into s[34:35]) before the address moves to VGPRs.
; gfx900 checks: offset:32 is folded into the s[4:5]-based global atomic.
206define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
207; SI-LABEL: global_atomic_xchg_i64_noret_offset_scalar:
208; SI:       ; %bb.0:
209; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
211; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
212; SI-NEXT:    s_mov_b64 exec, s[34:35]
213; SI-NEXT:    s_waitcnt expcnt(0)
214; SI-NEXT:    v_writelane_b32 v2, s6, 0
215; SI-NEXT:    v_writelane_b32 v2, s7, 1
216; SI-NEXT:    v_mov_b32_e32 v0, s6
217; SI-NEXT:    v_mov_b32_e32 v1, s7
218; SI-NEXT:    s_mov_b32 s7, 0xf000
219; SI-NEXT:    s_mov_b32 s6, -1
220; SI-NEXT:    s_waitcnt vmcnt(0)
221; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
222; SI-NEXT:    s_waitcnt vmcnt(0)
223; SI-NEXT:    buffer_wbinvl1
224; SI-NEXT:    v_readlane_b32 s7, v2, 1
225; SI-NEXT:    v_readlane_b32 s6, v2, 0
226; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
227; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
228; SI-NEXT:    s_mov_b64 exec, s[34:35]
229; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
230; SI-NEXT:    s_setpc_b64 s[30:31]
231;
232; VI-LABEL: global_atomic_xchg_i64_noret_offset_scalar:
233; VI:       ; %bb.0:
234; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235; VI-NEXT:    s_add_u32 s34, s4, 32
236; VI-NEXT:    s_addc_u32 s35, s5, 0
237; VI-NEXT:    v_mov_b32_e32 v2, s34
238; VI-NEXT:    v_mov_b32_e32 v0, s6
239; VI-NEXT:    v_mov_b32_e32 v1, s7
240; VI-NEXT:    v_mov_b32_e32 v3, s35
241; VI-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
242; VI-NEXT:    s_waitcnt vmcnt(0)
243; VI-NEXT:    buffer_wbinvl1_vol
244; VI-NEXT:    s_setpc_b64 s[30:31]
245;
246; GFX9-LABEL: global_atomic_xchg_i64_noret_offset_scalar:
247; GFX9:       ; %bb.0:
248; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249; GFX9-NEXT:    v_mov_b32_e32 v0, s6
250; GFX9-NEXT:    v_mov_b32_e32 v1, s7
251; GFX9-NEXT:    v_mov_b32_e32 v2, 0
252; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
253; GFX9-NEXT:    s_waitcnt vmcnt(0)
254; GFX9-NEXT:    buffer_wbinvl1_vol
255; GFX9-NEXT:    s_setpc_b64 s[30:31]
256  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
257  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
258  ret void
259}
260
; Returning i64 atomicrmw xchg (seq_cst) with inreg arguments under the
; amdgpu_gfx calling convention; the previous memory value is returned.
; Expected code matches the non-returning scalar case except that every atomic
; carries glc and the result is produced in v[0:1] (the same registers that
; held the swapped-in value), so no extra copy is expected on any target.
261define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
262; SI-LABEL: global_atomic_xchg_i64_ret_scalar:
263; SI:       ; %bb.0:
264; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
266; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
267; SI-NEXT:    s_mov_b64 exec, s[34:35]
268; SI-NEXT:    s_waitcnt expcnt(0)
269; SI-NEXT:    v_writelane_b32 v2, s6, 0
270; SI-NEXT:    v_writelane_b32 v2, s7, 1
271; SI-NEXT:    s_mov_b32 s34, s7
272; SI-NEXT:    s_mov_b32 s35, s6
273; SI-NEXT:    s_mov_b32 s7, 0xf000
274; SI-NEXT:    s_mov_b32 s6, -1
275; SI-NEXT:    v_mov_b32_e32 v0, s35
276; SI-NEXT:    v_mov_b32_e32 v1, s34
277; SI-NEXT:    s_waitcnt vmcnt(0)
278; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 glc
279; SI-NEXT:    s_waitcnt vmcnt(0)
280; SI-NEXT:    buffer_wbinvl1
281; SI-NEXT:    v_readlane_b32 s7, v2, 1
282; SI-NEXT:    v_readlane_b32 s6, v2, 0
283; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
284; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
285; SI-NEXT:    s_mov_b64 exec, s[34:35]
286; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
287; SI-NEXT:    s_setpc_b64 s[30:31]
288;
289; VI-LABEL: global_atomic_xchg_i64_ret_scalar:
290; VI:       ; %bb.0:
291; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292; VI-NEXT:    v_mov_b32_e32 v0, s6
293; VI-NEXT:    v_mov_b32_e32 v1, s7
294; VI-NEXT:    v_mov_b32_e32 v2, s4
295; VI-NEXT:    v_mov_b32_e32 v3, s5
296; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
297; VI-NEXT:    s_waitcnt vmcnt(0)
298; VI-NEXT:    buffer_wbinvl1_vol
299; VI-NEXT:    s_setpc_b64 s[30:31]
300;
301; GFX9-LABEL: global_atomic_xchg_i64_ret_scalar:
302; GFX9:       ; %bb.0:
303; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304; GFX9-NEXT:    v_mov_b32_e32 v0, s6
305; GFX9-NEXT:    v_mov_b32_e32 v1, s7
306; GFX9-NEXT:    v_mov_b32_e32 v2, 0
307; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc
308; GFX9-NEXT:    s_waitcnt vmcnt(0)
309; GFX9-NEXT:    buffer_wbinvl1_vol
310; GFX9-NEXT:    s_setpc_b64 s[30:31]
311  %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
312  ret i64 %result
313}
314
; Returning i64 atomicrmw xchg (seq_cst) at %out + 4 i64 elements (32 bytes),
; with inreg arguments under the amdgpu_gfx calling convention.
; Expected code is the non-returning offset scalar pattern plus glc on each
; atomic, with the old value delivered in v[0:1]. The tonga checks still
; compute the address with s_add_u32 / s_addc_u32 rather than folding it.
315define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
316; SI-LABEL: global_atomic_xchg_i64_ret_offset_scalar:
317; SI:       ; %bb.0:
318; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
320; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
321; SI-NEXT:    s_mov_b64 exec, s[34:35]
322; SI-NEXT:    s_waitcnt expcnt(0)
323; SI-NEXT:    v_writelane_b32 v2, s6, 0
324; SI-NEXT:    v_writelane_b32 v2, s7, 1
325; SI-NEXT:    v_mov_b32_e32 v0, s6
326; SI-NEXT:    v_mov_b32_e32 v1, s7
327; SI-NEXT:    s_mov_b32 s7, 0xf000
328; SI-NEXT:    s_mov_b32 s6, -1
329; SI-NEXT:    s_waitcnt vmcnt(0)
330; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc
331; SI-NEXT:    s_waitcnt vmcnt(0)
332; SI-NEXT:    buffer_wbinvl1
333; SI-NEXT:    v_readlane_b32 s7, v2, 1
334; SI-NEXT:    v_readlane_b32 s6, v2, 0
335; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
336; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
337; SI-NEXT:    s_mov_b64 exec, s[34:35]
338; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
339; SI-NEXT:    s_setpc_b64 s[30:31]
340;
341; VI-LABEL: global_atomic_xchg_i64_ret_offset_scalar:
342; VI:       ; %bb.0:
343; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344; VI-NEXT:    s_add_u32 s34, s4, 32
345; VI-NEXT:    s_addc_u32 s35, s5, 0
346; VI-NEXT:    v_mov_b32_e32 v2, s34
347; VI-NEXT:    v_mov_b32_e32 v0, s6
348; VI-NEXT:    v_mov_b32_e32 v1, s7
349; VI-NEXT:    v_mov_b32_e32 v3, s35
350; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
351; VI-NEXT:    s_waitcnt vmcnt(0)
352; VI-NEXT:    buffer_wbinvl1_vol
353; VI-NEXT:    s_setpc_b64 s[30:31]
354;
355; GFX9-LABEL: global_atomic_xchg_i64_ret_offset_scalar:
356; GFX9:       ; %bb.0:
357; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
358; GFX9-NEXT:    v_mov_b32_e32 v0, s6
359; GFX9-NEXT:    v_mov_b32_e32 v1, s7
360; GFX9-NEXT:    v_mov_b32_e32 v2, 0
361; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
362; GFX9-NEXT:    s_waitcnt vmcnt(0)
363; GFX9-NEXT:    buffer_wbinvl1_vol
364; GFX9-NEXT:    s_setpc_b64 s[30:31]
365  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
366  %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
367  ret i64 %result
368}
369
; Non-returning i64 atomicrmw xchg (seq_cst) at a 32-byte offset, with
; !amdgpu.no.remote.memory !0 attached to the atomic.
; The expected instructions are the same as for the offset test without the
; metadata: the annotation does not change the xchg lowering on any of the
; three run lines.
; NOTE(review): the !0 metadata node is defined outside this excerpt; confirm
; it is still present at the end of the file.
370define void @global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
371; SI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
372; SI:       ; %bb.0:
373; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
374; SI-NEXT:    s_mov_b32 s6, 0
375; SI-NEXT:    s_mov_b32 s7, 0xf000
376; SI-NEXT:    s_mov_b32 s4, s6
377; SI-NEXT:    s_mov_b32 s5, s6
378; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
379; SI-NEXT:    s_waitcnt vmcnt(0)
380; SI-NEXT:    buffer_wbinvl1
381; SI-NEXT:    s_waitcnt expcnt(0)
382; SI-NEXT:    s_setpc_b64 s[30:31]
383;
384; VI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
385; VI:       ; %bb.0:
386; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
388; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
389; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
390; VI-NEXT:    s_waitcnt vmcnt(0)
391; VI-NEXT:    buffer_wbinvl1_vol
392; VI-NEXT:    s_setpc_b64 s[30:31]
393;
394; GFX9-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
395; GFX9:       ; %bb.0:
396; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
397; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off offset:32
398; GFX9-NEXT:    s_waitcnt vmcnt(0)
399; GFX9-NEXT:    buffer_wbinvl1_vol
400; GFX9-NEXT:    s_setpc_b64 s[30:31]
401  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
402  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
403  ret void
404}
405
; Returning i64 atomicrmw xchg (seq_cst) at a 32-byte offset, with
; !amdgpu.no.remote.memory !0 attached to the atomic.
; The expected instructions are the same as for the returning offset test
; without the metadata (glc set, old value returned in v[0:1]).
; NOTE(review): the !0 metadata node is defined outside this excerpt; confirm
; it is still present at the end of the file.
406define i64 @global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
407; SI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
408; SI:       ; %bb.0:
409; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410; SI-NEXT:    s_mov_b32 s6, 0
411; SI-NEXT:    s_mov_b32 s7, 0xf000
412; SI-NEXT:    s_mov_b32 s4, s6
413; SI-NEXT:    s_mov_b32 s5, s6
414; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
415; SI-NEXT:    s_waitcnt vmcnt(0)
416; SI-NEXT:    buffer_wbinvl1
417; SI-NEXT:    v_mov_b32_e32 v0, v2
418; SI-NEXT:    v_mov_b32_e32 v1, v3
419; SI-NEXT:    s_waitcnt expcnt(0)
420; SI-NEXT:    s_setpc_b64 s[30:31]
421;
422; VI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
423; VI:       ; %bb.0:
424; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
426; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
427; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
428; VI-NEXT:    s_waitcnt vmcnt(0)
429; VI-NEXT:    buffer_wbinvl1_vol
430; VI-NEXT:    s_setpc_b64 s[30:31]
431;
432; GFX9-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
433; GFX9:       ; %bb.0:
434; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
435; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
436; GFX9-NEXT:    s_waitcnt vmcnt(0)
437; GFX9-NEXT:    buffer_wbinvl1_vol
438; GFX9-NEXT:    s_setpc_b64 s[30:31]
439  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
440  %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
441  ret i64 %result
442}
443
444; ---------------------------------------------------------------------
445; atomicrmw xchg f64
446; ---------------------------------------------------------------------
447
; Non-returning atomicrmw xchg (seq_cst) with a double operand on a global
; pointer; %tmp0 is never used.
; The expected instructions are the same 64-bit swap forms as for the i64
; test: buffer_atomic_swap_x2 (plain amdgcn), flat_atomic_swap_x2 (tonga),
; global_atomic_swap_x2 (gfx900), all without glc.
448define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) {
449; SI-LABEL: global_atomic_xchg_f64_noret:
450; SI:       ; %bb.0:
451; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
452; SI-NEXT:    s_mov_b32 s6, 0
453; SI-NEXT:    s_mov_b32 s7, 0xf000
454; SI-NEXT:    s_mov_b32 s4, s6
455; SI-NEXT:    s_mov_b32 s5, s6
456; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64
457; SI-NEXT:    s_waitcnt vmcnt(0)
458; SI-NEXT:    buffer_wbinvl1
459; SI-NEXT:    s_waitcnt expcnt(0)
460; SI-NEXT:    s_setpc_b64 s[30:31]
461;
462; VI-LABEL: global_atomic_xchg_f64_noret:
463; VI:       ; %bb.0:
464; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
466; VI-NEXT:    s_waitcnt vmcnt(0)
467; VI-NEXT:    buffer_wbinvl1_vol
468; VI-NEXT:    s_setpc_b64 s[30:31]
469;
470; GFX9-LABEL: global_atomic_xchg_f64_noret:
471; GFX9:       ; %bb.0:
472; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off
474; GFX9-NEXT:    s_waitcnt vmcnt(0)
475; GFX9-NEXT:    buffer_wbinvl1_vol
476; GFX9-NEXT:    s_setpc_b64 s[30:31]
477  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
478  ret void
479}
480
; Non-returning atomicrmw xchg (seq_cst) with a double operand at
; %out + 4 double elements (32 bytes). The GEP index here is i32, whereas the
; i64 tests use an i64 index.
; Expected output: offset:32 folded for the plain amdgcn and gfx900 runs, and
; an explicit v_add_u32 / v_addc_u32 before the flat atomic for tonga.
481define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %in) {
482; SI-LABEL: global_atomic_xchg_f64_noret_offset:
483; SI:       ; %bb.0:
484; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
485; SI-NEXT:    s_mov_b32 s6, 0
486; SI-NEXT:    s_mov_b32 s7, 0xf000
487; SI-NEXT:    s_mov_b32 s4, s6
488; SI-NEXT:    s_mov_b32 s5, s6
489; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
490; SI-NEXT:    s_waitcnt vmcnt(0)
491; SI-NEXT:    buffer_wbinvl1
492; SI-NEXT:    s_waitcnt expcnt(0)
493; SI-NEXT:    s_setpc_b64 s[30:31]
494;
495; VI-LABEL: global_atomic_xchg_f64_noret_offset:
496; VI:       ; %bb.0:
497; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
499; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
500; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
501; VI-NEXT:    s_waitcnt vmcnt(0)
502; VI-NEXT:    buffer_wbinvl1_vol
503; VI-NEXT:    s_setpc_b64 s[30:31]
504;
505; GFX9-LABEL: global_atomic_xchg_f64_noret_offset:
506; GFX9:       ; %bb.0:
507; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
508; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off offset:32
509; GFX9-NEXT:    s_waitcnt vmcnt(0)
510; GFX9-NEXT:    buffer_wbinvl1_vol
511; GFX9-NEXT:    s_setpc_b64 s[30:31]
512  %gep = getelementptr double, ptr addrspace(1) %out, i32 4
513  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
514  ret void
515}
516
; Returning atomicrmw xchg (seq_cst) with a double operand on a global
; pointer; the previous memory value is returned as a double.
; Every expected atomic carries glc. The plain amdgcn checks copy the result
; from v[2:3] to v[0:1]; the tonga and gfx900 checks write it directly to
; v[0:1].
517define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) {
518; SI-LABEL: global_atomic_xchg_f64_ret:
519; SI:       ; %bb.0:
520; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
521; SI-NEXT:    s_mov_b32 s6, 0
522; SI-NEXT:    s_mov_b32 s7, 0xf000
523; SI-NEXT:    s_mov_b32 s4, s6
524; SI-NEXT:    s_mov_b32 s5, s6
525; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
526; SI-NEXT:    s_waitcnt vmcnt(0)
527; SI-NEXT:    buffer_wbinvl1
528; SI-NEXT:    v_mov_b32_e32 v0, v2
529; SI-NEXT:    v_mov_b32_e32 v1, v3
530; SI-NEXT:    s_waitcnt expcnt(0)
531; SI-NEXT:    s_setpc_b64 s[30:31]
532;
533; VI-LABEL: global_atomic_xchg_f64_ret:
534; VI:       ; %bb.0:
535; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
536; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
537; VI-NEXT:    s_waitcnt vmcnt(0)
538; VI-NEXT:    buffer_wbinvl1_vol
539; VI-NEXT:    s_setpc_b64 s[30:31]
540;
541; GFX9-LABEL: global_atomic_xchg_f64_ret:
542; GFX9:       ; %bb.0:
543; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
544; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
545; GFX9-NEXT:    s_waitcnt vmcnt(0)
546; GFX9-NEXT:    buffer_wbinvl1_vol
547; GFX9-NEXT:    s_setpc_b64 s[30:31]
548  %result = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
549  ret double %result
550}
551
; Returning atomicrmw xchg (seq_cst) with a double operand at
; %out + 4 double elements (32 bytes, i32 GEP index); the previous memory
; value is returned.
; Expected output: glc on every atomic, offset:32 folded for the plain amdgcn
; and gfx900 runs, and an explicit v_add_u32 / v_addc_u32 for tonga.
552define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %in) {
553; SI-LABEL: global_atomic_xchg_f64_ret_offset:
554; SI:       ; %bb.0:
555; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556; SI-NEXT:    s_mov_b32 s6, 0
557; SI-NEXT:    s_mov_b32 s7, 0xf000
558; SI-NEXT:    s_mov_b32 s4, s6
559; SI-NEXT:    s_mov_b32 s5, s6
560; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
561; SI-NEXT:    s_waitcnt vmcnt(0)
562; SI-NEXT:    buffer_wbinvl1
563; SI-NEXT:    v_mov_b32_e32 v0, v2
564; SI-NEXT:    v_mov_b32_e32 v1, v3
565; SI-NEXT:    s_waitcnt expcnt(0)
566; SI-NEXT:    s_setpc_b64 s[30:31]
567;
568; VI-LABEL: global_atomic_xchg_f64_ret_offset:
569; VI:       ; %bb.0:
570; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
571; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
572; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
573; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
574; VI-NEXT:    s_waitcnt vmcnt(0)
575; VI-NEXT:    buffer_wbinvl1_vol
576; VI-NEXT:    s_setpc_b64 s[30:31]
577;
578; GFX9-LABEL: global_atomic_xchg_f64_ret_offset:
579; GFX9:       ; %bb.0:
580; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
581; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
582; GFX9-NEXT:    s_waitcnt vmcnt(0)
583; GFX9-NEXT:    buffer_wbinvl1_vol
584; GFX9-NEXT:    s_setpc_b64 s[30:31]
585  %gep = getelementptr double, ptr addrspace(1) %out, i32 4
586  %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
587  ret double %result
588}
589
; Non-returning atomicrmw xchg (seq_cst) with a double operand and inreg
; arguments under the amdgpu_gfx calling convention; the checks show the
; pointer in s[4:5] and the value in s[6:7].
; Plain amdgcn checks: s6/s7 are saved into lanes of v2 (spilled to the stack),
; reused as the upper half of the s[4:7] buffer resource, then restored.
; tonga checks: everything is copied to VGPRs for flat_atomic_swap_x2.
; gfx900 checks: s[4:5] is the scalar base with a zero VGPR offset.
590define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) {
591; SI-LABEL: global_atomic_xchg_f64_noret_scalar:
592; SI:       ; %bb.0:
593; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
594; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
595; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
596; SI-NEXT:    s_mov_b64 exec, s[34:35]
597; SI-NEXT:    s_waitcnt expcnt(0)
598; SI-NEXT:    v_writelane_b32 v2, s6, 0
599; SI-NEXT:    v_writelane_b32 v2, s7, 1
600; SI-NEXT:    s_mov_b32 s34, s7
601; SI-NEXT:    s_mov_b32 s35, s6
602; SI-NEXT:    s_mov_b32 s7, 0xf000
603; SI-NEXT:    s_mov_b32 s6, -1
604; SI-NEXT:    v_mov_b32_e32 v0, s35
605; SI-NEXT:    v_mov_b32_e32 v1, s34
606; SI-NEXT:    s_waitcnt vmcnt(0)
607; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
608; SI-NEXT:    s_waitcnt vmcnt(0)
609; SI-NEXT:    buffer_wbinvl1
610; SI-NEXT:    v_readlane_b32 s7, v2, 1
611; SI-NEXT:    v_readlane_b32 s6, v2, 0
612; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
613; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
614; SI-NEXT:    s_mov_b64 exec, s[34:35]
615; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
616; SI-NEXT:    s_setpc_b64 s[30:31]
617;
618; VI-LABEL: global_atomic_xchg_f64_noret_scalar:
619; VI:       ; %bb.0:
620; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
621; VI-NEXT:    v_mov_b32_e32 v0, s6
622; VI-NEXT:    v_mov_b32_e32 v1, s7
623; VI-NEXT:    v_mov_b32_e32 v2, s4
624; VI-NEXT:    v_mov_b32_e32 v3, s5
625; VI-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
626; VI-NEXT:    s_waitcnt vmcnt(0)
627; VI-NEXT:    buffer_wbinvl1_vol
628; VI-NEXT:    s_setpc_b64 s[30:31]
629;
630; GFX9-LABEL: global_atomic_xchg_f64_noret_scalar:
631; GFX9:       ; %bb.0:
632; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
633; GFX9-NEXT:    v_mov_b32_e32 v0, s6
634; GFX9-NEXT:    v_mov_b32_e32 v1, s7
635; GFX9-NEXT:    v_mov_b32_e32 v2, 0
636; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5]
637; GFX9-NEXT:    s_waitcnt vmcnt(0)
638; GFX9-NEXT:    buffer_wbinvl1_vol
639; GFX9-NEXT:    s_setpc_b64 s[30:31]
640  %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
641  ret void
642}
643
; Non-returning atomicrmw xchg (seq_cst) with a double operand at
; %out + 4 double elements (32 bytes, i32 GEP index), with inreg arguments
; under the amdgpu_gfx calling convention.
; Plain amdgcn checks: s6/s7 save-and-restore through v2 plus offset:32 on the
; buffer atomic. tonga checks: the offset is added with s_add_u32 /
; s_addc_u32 into s[34:35]. gfx900 checks: offset:32 is folded into the
; s[4:5]-based global atomic.
644define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) {
645; SI-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
646; SI:       ; %bb.0:
647; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
648; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
649; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
650; SI-NEXT:    s_mov_b64 exec, s[34:35]
651; SI-NEXT:    s_waitcnt expcnt(0)
652; SI-NEXT:    v_writelane_b32 v2, s6, 0
653; SI-NEXT:    v_writelane_b32 v2, s7, 1
654; SI-NEXT:    v_mov_b32_e32 v0, s6
655; SI-NEXT:    v_mov_b32_e32 v1, s7
656; SI-NEXT:    s_mov_b32 s7, 0xf000
657; SI-NEXT:    s_mov_b32 s6, -1
658; SI-NEXT:    s_waitcnt vmcnt(0)
659; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
660; SI-NEXT:    s_waitcnt vmcnt(0)
661; SI-NEXT:    buffer_wbinvl1
662; SI-NEXT:    v_readlane_b32 s7, v2, 1
663; SI-NEXT:    v_readlane_b32 s6, v2, 0
664; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
665; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
666; SI-NEXT:    s_mov_b64 exec, s[34:35]
667; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
668; SI-NEXT:    s_setpc_b64 s[30:31]
669;
670; VI-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
671; VI:       ; %bb.0:
672; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
673; VI-NEXT:    s_add_u32 s34, s4, 32
674; VI-NEXT:    s_addc_u32 s35, s5, 0
675; VI-NEXT:    v_mov_b32_e32 v2, s34
676; VI-NEXT:    v_mov_b32_e32 v0, s6
677; VI-NEXT:    v_mov_b32_e32 v1, s7
678; VI-NEXT:    v_mov_b32_e32 v3, s35
679; VI-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
680; VI-NEXT:    s_waitcnt vmcnt(0)
681; VI-NEXT:    buffer_wbinvl1_vol
682; VI-NEXT:    s_setpc_b64 s[30:31]
683;
684; GFX9-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
685; GFX9:       ; %bb.0:
686; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
687; GFX9-NEXT:    v_mov_b32_e32 v0, s6
688; GFX9-NEXT:    v_mov_b32_e32 v1, s7
689; GFX9-NEXT:    v_mov_b32_e32 v2, 0
690; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
691; GFX9-NEXT:    s_waitcnt vmcnt(0)
692; GFX9-NEXT:    buffer_wbinvl1_vol
693; GFX9-NEXT:    s_setpc_b64 s[30:31]
694  %gep = getelementptr double, ptr addrspace(1) %out, i32 4
695  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
696  ret void
697}
698
; Returning atomicrmw xchg (seq_cst) with a double operand and inreg arguments
; under the amdgpu_gfx calling convention; the previous memory value is
; returned as a double.
; Expected code matches the non-returning f64 scalar case except that every
; atomic carries glc and the result is produced in v[0:1], so no extra copy is
; expected on any target.
699define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) {
700; SI-LABEL: global_atomic_xchg_f64_ret_scalar:
701; SI:       ; %bb.0:
702; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
704; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
705; SI-NEXT:    s_mov_b64 exec, s[34:35]
706; SI-NEXT:    s_waitcnt expcnt(0)
707; SI-NEXT:    v_writelane_b32 v2, s6, 0
708; SI-NEXT:    v_writelane_b32 v2, s7, 1
709; SI-NEXT:    s_mov_b32 s34, s7
710; SI-NEXT:    s_mov_b32 s35, s6
711; SI-NEXT:    s_mov_b32 s7, 0xf000
712; SI-NEXT:    s_mov_b32 s6, -1
713; SI-NEXT:    v_mov_b32_e32 v0, s35
714; SI-NEXT:    v_mov_b32_e32 v1, s34
715; SI-NEXT:    s_waitcnt vmcnt(0)
716; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 glc
717; SI-NEXT:    s_waitcnt vmcnt(0)
718; SI-NEXT:    buffer_wbinvl1
719; SI-NEXT:    v_readlane_b32 s7, v2, 1
720; SI-NEXT:    v_readlane_b32 s6, v2, 0
721; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
722; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
723; SI-NEXT:    s_mov_b64 exec, s[34:35]
724; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
725; SI-NEXT:    s_setpc_b64 s[30:31]
726;
727; VI-LABEL: global_atomic_xchg_f64_ret_scalar:
728; VI:       ; %bb.0:
729; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730; VI-NEXT:    v_mov_b32_e32 v0, s6
731; VI-NEXT:    v_mov_b32_e32 v1, s7
732; VI-NEXT:    v_mov_b32_e32 v2, s4
733; VI-NEXT:    v_mov_b32_e32 v3, s5
734; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
735; VI-NEXT:    s_waitcnt vmcnt(0)
736; VI-NEXT:    buffer_wbinvl1_vol
737; VI-NEXT:    s_setpc_b64 s[30:31]
738;
739; GFX9-LABEL: global_atomic_xchg_f64_ret_scalar:
740; GFX9:       ; %bb.0:
741; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742; GFX9-NEXT:    v_mov_b32_e32 v0, s6
743; GFX9-NEXT:    v_mov_b32_e32 v1, s7
744; GFX9-NEXT:    v_mov_b32_e32 v2, 0
745; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc
746; GFX9-NEXT:    s_waitcnt vmcnt(0)
747; GFX9-NEXT:    buffer_wbinvl1_vol
748; GFX9-NEXT:    s_setpc_b64 s[30:31]
749  %result = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
750  ret double %result
751}
752
; amdgpu_gfx variant with %out and %in passed inreg: seq_cst xchg of a double at
; %out + 4 doubles (offset:32 in the SI/GFX9 checks, s_add_u32 ..., 32 on VI),
; returning the previous value. In the SI checks s6/s7 are saved into lanes of v2
; (v2 itself is spilled around the body) before being overwritten to fill in the
; s[4:7] operand of the buffer atomic, and are restored afterwards.
753define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) {
754; SI-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
755; SI:       ; %bb.0:
756; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
757; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
758; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
759; SI-NEXT:    s_mov_b64 exec, s[34:35]
760; SI-NEXT:    s_waitcnt expcnt(0)
761; SI-NEXT:    v_writelane_b32 v2, s6, 0
762; SI-NEXT:    v_writelane_b32 v2, s7, 1
763; SI-NEXT:    v_mov_b32_e32 v0, s6
764; SI-NEXT:    v_mov_b32_e32 v1, s7
765; SI-NEXT:    s_mov_b32 s7, 0xf000
766; SI-NEXT:    s_mov_b32 s6, -1
767; SI-NEXT:    s_waitcnt vmcnt(0)
768; SI-NEXT:    buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc
769; SI-NEXT:    s_waitcnt vmcnt(0)
770; SI-NEXT:    buffer_wbinvl1
771; SI-NEXT:    v_readlane_b32 s7, v2, 1
772; SI-NEXT:    v_readlane_b32 s6, v2, 0
773; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
774; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
775; SI-NEXT:    s_mov_b64 exec, s[34:35]
776; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
777; SI-NEXT:    s_setpc_b64 s[30:31]
778;
779; VI-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
780; VI:       ; %bb.0:
781; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
782; VI-NEXT:    s_add_u32 s34, s4, 32
783; VI-NEXT:    s_addc_u32 s35, s5, 0
784; VI-NEXT:    v_mov_b32_e32 v2, s34
785; VI-NEXT:    v_mov_b32_e32 v0, s6
786; VI-NEXT:    v_mov_b32_e32 v1, s7
787; VI-NEXT:    v_mov_b32_e32 v3, s35
788; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
789; VI-NEXT:    s_waitcnt vmcnt(0)
790; VI-NEXT:    buffer_wbinvl1_vol
791; VI-NEXT:    s_setpc_b64 s[30:31]
792;
793; GFX9-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
794; GFX9:       ; %bb.0:
795; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
796; GFX9-NEXT:    v_mov_b32_e32 v0, s6
797; GFX9-NEXT:    v_mov_b32_e32 v1, s7
798; GFX9-NEXT:    v_mov_b32_e32 v2, 0
799; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
800; GFX9-NEXT:    s_waitcnt vmcnt(0)
801; GFX9-NEXT:    buffer_wbinvl1_vol
802; GFX9-NEXT:    s_setpc_b64 s[30:31]
803  %gep = getelementptr double, ptr addrspace(1) %out, i32 4
804  %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
805  ret double %result
806}
807
; seq_cst xchg of a double tagged with !amdgpu.no.remote.memory !0. The result is
; unused and the checked swap instructions carry no glc modifier.
; NOTE(review): the GEP steps by i32, so the access is 16 bytes past %out
; (offset:16 / v_add_u32 ..., 16 in the checks) rather than the 32 bytes used by
; the neighbouring *_offset tests - confirm this is intended.
808define void @global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, double %in) {
809; SI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
810; SI:       ; %bb.0:
811; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
812; SI-NEXT:    s_mov_b32 s6, 0
813; SI-NEXT:    s_mov_b32 s7, 0xf000
814; SI-NEXT:    s_mov_b32 s4, s6
815; SI-NEXT:    s_mov_b32 s5, s6
816; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:16
817; SI-NEXT:    s_waitcnt vmcnt(0)
818; SI-NEXT:    buffer_wbinvl1
819; SI-NEXT:    s_waitcnt expcnt(0)
820; SI-NEXT:    s_setpc_b64 s[30:31]
821;
822; VI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
823; VI:       ; %bb.0:
824; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
825; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
826; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
827; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
828; VI-NEXT:    s_waitcnt vmcnt(0)
829; VI-NEXT:    buffer_wbinvl1_vol
830; VI-NEXT:    s_setpc_b64 s[30:31]
831;
832; GFX9-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
833; GFX9:       ; %bb.0:
834; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
835; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off offset:16
836; GFX9-NEXT:    s_waitcnt vmcnt(0)
837; GFX9-NEXT:    buffer_wbinvl1_vol
838; GFX9-NEXT:    s_setpc_b64 s[30:31]
839  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
840  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory !0
841  ret void
842}
843
; Returning form of the !amdgpu.no.remote.memory xchg test: the previous double is
; returned, and the checked swap instructions carry glc. On SI the old value
; arrives in v[2:3] and is copied to v[0:1] before returning.
; NOTE(review): the GEP steps by i32, so the access is 16 bytes past %out
; (offset:16 / v_add_u32 ..., 16 in the checks) rather than the 32 bytes used by
; the neighbouring *_offset tests - confirm this is intended.
844define double @global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, double %in) {
845; SI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
846; SI:       ; %bb.0:
847; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
848; SI-NEXT:    s_mov_b32 s6, 0
849; SI-NEXT:    s_mov_b32 s7, 0xf000
850; SI-NEXT:    s_mov_b32 s4, s6
851; SI-NEXT:    s_mov_b32 s5, s6
852; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:16 glc
853; SI-NEXT:    s_waitcnt vmcnt(0)
854; SI-NEXT:    buffer_wbinvl1
855; SI-NEXT:    v_mov_b32_e32 v0, v2
856; SI-NEXT:    v_mov_b32_e32 v1, v3
857; SI-NEXT:    s_waitcnt expcnt(0)
858; SI-NEXT:    s_setpc_b64 s[30:31]
859;
860; VI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
861; VI:       ; %bb.0:
862; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
863; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
864; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
865; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
866; VI-NEXT:    s_waitcnt vmcnt(0)
867; VI-NEXT:    buffer_wbinvl1_vol
868; VI-NEXT:    s_setpc_b64 s[30:31]
869;
870; GFX9-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
871; GFX9:       ; %bb.0:
872; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
873; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:16 glc
874; GFX9-NEXT:    s_waitcnt vmcnt(0)
875; GFX9-NEXT:    buffer_wbinvl1_vol
876; GFX9-NEXT:    s_setpc_b64 s[30:31]
877  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
878  %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory !0
879  ret double %result
880}
881
882; ---------------------------------------------------------------------
883; atomicrmw add
884; ---------------------------------------------------------------------
885
; Baseline no-return case: seq_cst atomicrmw add of an i64 directly at %ptr, with
; the result unused. Expected instruction per target: buffer_atomic_add_x2
; (addr64) on SI, flat_atomic_add_x2 on VI, global_atomic_add_x2 on GFX9, each
; followed by s_waitcnt vmcnt(0) and buffer_wbinvl1 / buffer_wbinvl1_vol.
886define void @global_atomic_add_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
887; SI-LABEL: global_atomic_add_i64_noret:
888; SI:       ; %bb.0:
889; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
890; SI-NEXT:    s_mov_b32 s6, 0
891; SI-NEXT:    s_mov_b32 s7, 0xf000
892; SI-NEXT:    s_mov_b32 s4, s6
893; SI-NEXT:    s_mov_b32 s5, s6
894; SI-NEXT:    buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64
895; SI-NEXT:    s_waitcnt vmcnt(0)
896; SI-NEXT:    buffer_wbinvl1
897; SI-NEXT:    s_waitcnt expcnt(0)
898; SI-NEXT:    s_setpc_b64 s[30:31]
899;
900; VI-LABEL: global_atomic_add_i64_noret:
901; VI:       ; %bb.0:
902; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
903; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
904; VI-NEXT:    s_waitcnt vmcnt(0)
905; VI-NEXT:    buffer_wbinvl1_vol
906; VI-NEXT:    s_setpc_b64 s[30:31]
907;
908; GFX9-LABEL: global_atomic_add_i64_noret:
909; GFX9:       ; %bb.0:
910; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
911; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[2:3], off
912; GFX9-NEXT:    s_waitcnt vmcnt(0)
913; GFX9-NEXT:    buffer_wbinvl1_vol
914; GFX9-NEXT:    s_setpc_b64 s[30:31]
915  %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
916  ret void
917}
918
; No-return seq_cst add of an i64 at %out + 4 x i64 (32 bytes). The SI and GFX9
; checks carry the offset on the instruction (offset:32); the VI checks add 32 to
; the address with v_add_u32/v_addc_u32 before the flat atomic.
919define void @global_atomic_add_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
920; SI-LABEL: global_atomic_add_i64_noret_offset:
921; SI:       ; %bb.0:
922; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
923; SI-NEXT:    s_mov_b32 s6, 0
924; SI-NEXT:    s_mov_b32 s7, 0xf000
925; SI-NEXT:    s_mov_b32 s4, s6
926; SI-NEXT:    s_mov_b32 s5, s6
927; SI-NEXT:    buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
928; SI-NEXT:    s_waitcnt vmcnt(0)
929; SI-NEXT:    buffer_wbinvl1
930; SI-NEXT:    s_waitcnt expcnt(0)
931; SI-NEXT:    s_setpc_b64 s[30:31]
932;
933; VI-LABEL: global_atomic_add_i64_noret_offset:
934; VI:       ; %bb.0:
935; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
936; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
937; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
938; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
939; VI-NEXT:    s_waitcnt vmcnt(0)
940; VI-NEXT:    buffer_wbinvl1_vol
941; VI-NEXT:    s_setpc_b64 s[30:31]
942;
943; GFX9-LABEL: global_atomic_add_i64_noret_offset:
944; GFX9:       ; %bb.0:
945; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
946; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[2:3], off offset:32
947; GFX9-NEXT:    s_waitcnt vmcnt(0)
948; GFX9-NEXT:    buffer_wbinvl1_vol
949; GFX9-NEXT:    s_setpc_b64 s[30:31]
950  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
951  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
952  ret void
953}
954
; Returning seq_cst add of an i64 directly at %ptr: the previous value is the
; function result, and every checked atomic carries glc. On SI the old value
; arrives in v[2:3] and is copied to v[0:1]; VI and GFX9 write v[0:1] directly.
955define i64 @global_atomic_add_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
956; SI-LABEL: global_atomic_add_i64_ret:
957; SI:       ; %bb.0:
958; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
959; SI-NEXT:    s_mov_b32 s6, 0
960; SI-NEXT:    s_mov_b32 s7, 0xf000
961; SI-NEXT:    s_mov_b32 s4, s6
962; SI-NEXT:    s_mov_b32 s5, s6
963; SI-NEXT:    buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
964; SI-NEXT:    s_waitcnt vmcnt(0)
965; SI-NEXT:    buffer_wbinvl1
966; SI-NEXT:    v_mov_b32_e32 v0, v2
967; SI-NEXT:    v_mov_b32_e32 v1, v3
968; SI-NEXT:    s_waitcnt expcnt(0)
969; SI-NEXT:    s_setpc_b64 s[30:31]
970;
971; VI-LABEL: global_atomic_add_i64_ret:
972; VI:       ; %bb.0:
973; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
974; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
975; VI-NEXT:    s_waitcnt vmcnt(0)
976; VI-NEXT:    buffer_wbinvl1_vol
977; VI-NEXT:    s_setpc_b64 s[30:31]
978;
979; GFX9-LABEL: global_atomic_add_i64_ret:
980; GFX9:       ; %bb.0:
981; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
982; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off glc
983; GFX9-NEXT:    s_waitcnt vmcnt(0)
984; GFX9-NEXT:    buffer_wbinvl1_vol
985; GFX9-NEXT:    s_setpc_b64 s[30:31]
986  %result = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
987  ret i64 %result
988}
989
; Returning seq_cst add of an i64 at %out + 4 x i64 (32 bytes). Combines the
; offset handling (offset:32 on SI/GFX9, explicit 64-bit address add on VI) with
; the glc-tagged returning form of the atomic.
990define i64 @global_atomic_add_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
991; SI-LABEL: global_atomic_add_i64_ret_offset:
992; SI:       ; %bb.0:
993; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
994; SI-NEXT:    s_mov_b32 s6, 0
995; SI-NEXT:    s_mov_b32 s7, 0xf000
996; SI-NEXT:    s_mov_b32 s4, s6
997; SI-NEXT:    s_mov_b32 s5, s6
998; SI-NEXT:    buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
999; SI-NEXT:    s_waitcnt vmcnt(0)
1000; SI-NEXT:    buffer_wbinvl1
1001; SI-NEXT:    v_mov_b32_e32 v0, v2
1002; SI-NEXT:    v_mov_b32_e32 v1, v3
1003; SI-NEXT:    s_waitcnt expcnt(0)
1004; SI-NEXT:    s_setpc_b64 s[30:31]
1005;
1006; VI-LABEL: global_atomic_add_i64_ret_offset:
1007; VI:       ; %bb.0:
1008; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1009; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1010; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1011; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
1012; VI-NEXT:    s_waitcnt vmcnt(0)
1013; VI-NEXT:    buffer_wbinvl1_vol
1014; VI-NEXT:    s_setpc_b64 s[30:31]
1015;
1016; GFX9-LABEL: global_atomic_add_i64_ret_offset:
1017; GFX9:       ; %bb.0:
1018; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1019; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
1020; GFX9-NEXT:    s_waitcnt vmcnt(0)
1021; GFX9-NEXT:    buffer_wbinvl1_vol
1022; GFX9-NEXT:    s_setpc_b64 s[30:31]
1023  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1024  %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
1025  ret i64 %result
1026}
1027
; amdgpu_gfx variant with %ptr and %in passed inreg; no-return seq_cst add of an
; i64 at %ptr. The checks read the pointer from s[4:5] and the value from s[6:7].
; In the SI checks s6/s7 are saved into lanes of v2 (v2 itself is spilled around
; the body) before being overwritten to fill in the s[4:7] operand of the buffer
; atomic, and are restored afterwards. VI copies the pointer into v[2:3] for the
; flat atomic; GFX9 uses s[4:5] as the base with a zero VGPR offset (v2 = 0).
1028define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
1029; SI-LABEL: global_atomic_add_i64_noret_scalar:
1030; SI:       ; %bb.0:
1031; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1032; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1033; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1034; SI-NEXT:    s_mov_b64 exec, s[34:35]
1035; SI-NEXT:    s_waitcnt expcnt(0)
1036; SI-NEXT:    v_writelane_b32 v2, s6, 0
1037; SI-NEXT:    v_writelane_b32 v2, s7, 1
1038; SI-NEXT:    s_mov_b32 s34, s7
1039; SI-NEXT:    s_mov_b32 s35, s6
1040; SI-NEXT:    s_mov_b32 s7, 0xf000
1041; SI-NEXT:    s_mov_b32 s6, -1
1042; SI-NEXT:    v_mov_b32_e32 v0, s35
1043; SI-NEXT:    v_mov_b32_e32 v1, s34
1044; SI-NEXT:    s_waitcnt vmcnt(0)
1045; SI-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[4:7], 0
1046; SI-NEXT:    s_waitcnt vmcnt(0)
1047; SI-NEXT:    buffer_wbinvl1
1048; SI-NEXT:    v_readlane_b32 s7, v2, 1
1049; SI-NEXT:    v_readlane_b32 s6, v2, 0
1050; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1051; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1052; SI-NEXT:    s_mov_b64 exec, s[34:35]
1053; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1054; SI-NEXT:    s_setpc_b64 s[30:31]
1055;
1056; VI-LABEL: global_atomic_add_i64_noret_scalar:
1057; VI:       ; %bb.0:
1058; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1059; VI-NEXT:    v_mov_b32_e32 v0, s6
1060; VI-NEXT:    v_mov_b32_e32 v1, s7
1061; VI-NEXT:    v_mov_b32_e32 v2, s4
1062; VI-NEXT:    v_mov_b32_e32 v3, s5
1063; VI-NEXT:    flat_atomic_add_x2 v[2:3], v[0:1]
1064; VI-NEXT:    s_waitcnt vmcnt(0)
1065; VI-NEXT:    buffer_wbinvl1_vol
1066; VI-NEXT:    s_setpc_b64 s[30:31]
1067;
1068; GFX9-LABEL: global_atomic_add_i64_noret_scalar:
1069; GFX9:       ; %bb.0:
1070; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1071; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1072; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1073; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1074; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[4:5]
1075; GFX9-NEXT:    s_waitcnt vmcnt(0)
1076; GFX9-NEXT:    buffer_wbinvl1_vol
1077; GFX9-NEXT:    s_setpc_b64 s[30:31]
1078  %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
1079  ret void
1080}
1081
; amdgpu_gfx/inreg variant of the no-return add at %out + 4 x i64 (32 bytes).
; SI and GFX9 carry the offset on the instruction (offset:32); VI adds 32 to the
; pointer with s_add_u32/s_addc_u32 into s[34:35] before moving it to v[2:3].
; In the SI checks s6/s7 are saved into lanes of v2 (v2 itself is spilled around
; the body) before being overwritten to fill in the s[4:7] operand of the buffer
; atomic, and are restored afterwards.
1082define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
1083; SI-LABEL: global_atomic_add_i64_noret_offset_scalar:
1084; SI:       ; %bb.0:
1085; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1086; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1087; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1088; SI-NEXT:    s_mov_b64 exec, s[34:35]
1089; SI-NEXT:    s_waitcnt expcnt(0)
1090; SI-NEXT:    v_writelane_b32 v2, s6, 0
1091; SI-NEXT:    v_writelane_b32 v2, s7, 1
1092; SI-NEXT:    v_mov_b32_e32 v0, s6
1093; SI-NEXT:    v_mov_b32_e32 v1, s7
1094; SI-NEXT:    s_mov_b32 s7, 0xf000
1095; SI-NEXT:    s_mov_b32 s6, -1
1096; SI-NEXT:    s_waitcnt vmcnt(0)
1097; SI-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32
1098; SI-NEXT:    s_waitcnt vmcnt(0)
1099; SI-NEXT:    buffer_wbinvl1
1100; SI-NEXT:    v_readlane_b32 s7, v2, 1
1101; SI-NEXT:    v_readlane_b32 s6, v2, 0
1102; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1103; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1104; SI-NEXT:    s_mov_b64 exec, s[34:35]
1105; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1106; SI-NEXT:    s_setpc_b64 s[30:31]
1107;
1108; VI-LABEL: global_atomic_add_i64_noret_offset_scalar:
1109; VI:       ; %bb.0:
1110; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1111; VI-NEXT:    s_add_u32 s34, s4, 32
1112; VI-NEXT:    s_addc_u32 s35, s5, 0
1113; VI-NEXT:    v_mov_b32_e32 v2, s34
1114; VI-NEXT:    v_mov_b32_e32 v0, s6
1115; VI-NEXT:    v_mov_b32_e32 v1, s7
1116; VI-NEXT:    v_mov_b32_e32 v3, s35
1117; VI-NEXT:    flat_atomic_add_x2 v[2:3], v[0:1]
1118; VI-NEXT:    s_waitcnt vmcnt(0)
1119; VI-NEXT:    buffer_wbinvl1_vol
1120; VI-NEXT:    s_setpc_b64 s[30:31]
1121;
1122; GFX9-LABEL: global_atomic_add_i64_noret_offset_scalar:
1123; GFX9:       ; %bb.0:
1124; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1125; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1126; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1127; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1128; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32
1129; GFX9-NEXT:    s_waitcnt vmcnt(0)
1130; GFX9-NEXT:    buffer_wbinvl1_vol
1131; GFX9-NEXT:    s_setpc_b64 s[30:31]
1132  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1133  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
1134  ret void
1135}
1136
; amdgpu_gfx/inreg variant of the returning add at %ptr: the previous i64 is the
; function result and every checked atomic carries glc, writing the old value
; into v[0:1]. In the SI checks s6/s7 are saved into lanes of v2 (v2 itself is
; spilled around the body) before being overwritten to fill in the s[4:7] operand
; of the buffer atomic, and are restored afterwards.
1137define amdgpu_gfx i64 @global_atomic_add_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
1138; SI-LABEL: global_atomic_add_i64_ret_scalar:
1139; SI:       ; %bb.0:
1140; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1141; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1142; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1143; SI-NEXT:    s_mov_b64 exec, s[34:35]
1144; SI-NEXT:    s_waitcnt expcnt(0)
1145; SI-NEXT:    v_writelane_b32 v2, s6, 0
1146; SI-NEXT:    v_writelane_b32 v2, s7, 1
1147; SI-NEXT:    s_mov_b32 s34, s7
1148; SI-NEXT:    s_mov_b32 s35, s6
1149; SI-NEXT:    s_mov_b32 s7, 0xf000
1150; SI-NEXT:    s_mov_b32 s6, -1
1151; SI-NEXT:    v_mov_b32_e32 v0, s35
1152; SI-NEXT:    v_mov_b32_e32 v1, s34
1153; SI-NEXT:    s_waitcnt vmcnt(0)
1154; SI-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc
1155; SI-NEXT:    s_waitcnt vmcnt(0)
1156; SI-NEXT:    buffer_wbinvl1
1157; SI-NEXT:    v_readlane_b32 s7, v2, 1
1158; SI-NEXT:    v_readlane_b32 s6, v2, 0
1159; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1160; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1161; SI-NEXT:    s_mov_b64 exec, s[34:35]
1162; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1163; SI-NEXT:    s_setpc_b64 s[30:31]
1164;
1165; VI-LABEL: global_atomic_add_i64_ret_scalar:
1166; VI:       ; %bb.0:
1167; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1168; VI-NEXT:    v_mov_b32_e32 v0, s6
1169; VI-NEXT:    v_mov_b32_e32 v1, s7
1170; VI-NEXT:    v_mov_b32_e32 v2, s4
1171; VI-NEXT:    v_mov_b32_e32 v3, s5
1172; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
1173; VI-NEXT:    s_waitcnt vmcnt(0)
1174; VI-NEXT:    buffer_wbinvl1_vol
1175; VI-NEXT:    s_setpc_b64 s[30:31]
1176;
1177; GFX9-LABEL: global_atomic_add_i64_ret_scalar:
1178; GFX9:       ; %bb.0:
1179; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1180; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1181; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1182; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1183; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] glc
1184; GFX9-NEXT:    s_waitcnt vmcnt(0)
1185; GFX9-NEXT:    buffer_wbinvl1_vol
1186; GFX9-NEXT:    s_setpc_b64 s[30:31]
1187  %result = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
1188  ret i64 %result
1189}
1190
; amdgpu_gfx/inreg variant of the returning add at %out + 4 x i64 (32 bytes).
; Checked atomics carry glc and write the old value into v[0:1]; SI and GFX9 use
; offset:32 on the instruction while VI adds 32 to the pointer in s[34:35] first.
; In the SI checks s6/s7 are saved into lanes of v2 (v2 itself is spilled around
; the body) before being overwritten to fill in the s[4:7] operand of the buffer
; atomic, and are restored afterwards.
1191define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
1192; SI-LABEL: global_atomic_add_i64_ret_offset_scalar:
1193; SI:       ; %bb.0:
1194; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1195; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1196; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1197; SI-NEXT:    s_mov_b64 exec, s[34:35]
1198; SI-NEXT:    s_waitcnt expcnt(0)
1199; SI-NEXT:    v_writelane_b32 v2, s6, 0
1200; SI-NEXT:    v_writelane_b32 v2, s7, 1
1201; SI-NEXT:    v_mov_b32_e32 v0, s6
1202; SI-NEXT:    v_mov_b32_e32 v1, s7
1203; SI-NEXT:    s_mov_b32 s7, 0xf000
1204; SI-NEXT:    s_mov_b32 s6, -1
1205; SI-NEXT:    s_waitcnt vmcnt(0)
1206; SI-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc
1207; SI-NEXT:    s_waitcnt vmcnt(0)
1208; SI-NEXT:    buffer_wbinvl1
1209; SI-NEXT:    v_readlane_b32 s7, v2, 1
1210; SI-NEXT:    v_readlane_b32 s6, v2, 0
1211; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1212; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1213; SI-NEXT:    s_mov_b64 exec, s[34:35]
1214; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1215; SI-NEXT:    s_setpc_b64 s[30:31]
1216;
1217; VI-LABEL: global_atomic_add_i64_ret_offset_scalar:
1218; VI:       ; %bb.0:
1219; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1220; VI-NEXT:    s_add_u32 s34, s4, 32
1221; VI-NEXT:    s_addc_u32 s35, s5, 0
1222; VI-NEXT:    v_mov_b32_e32 v2, s34
1223; VI-NEXT:    v_mov_b32_e32 v0, s6
1224; VI-NEXT:    v_mov_b32_e32 v1, s7
1225; VI-NEXT:    v_mov_b32_e32 v3, s35
1226; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
1227; VI-NEXT:    s_waitcnt vmcnt(0)
1228; VI-NEXT:    buffer_wbinvl1_vol
1229; VI-NEXT:    s_setpc_b64 s[30:31]
1230;
1231; GFX9-LABEL: global_atomic_add_i64_ret_offset_scalar:
1232; GFX9:       ; %bb.0:
1233; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1234; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1235; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1236; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1237; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
1238; GFX9-NEXT:    s_waitcnt vmcnt(0)
1239; GFX9-NEXT:    buffer_wbinvl1_vol
1240; GFX9-NEXT:    s_setpc_b64 s[30:31]
1241  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1242  %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
1243  ret i64 %result
1244}
1245
; No-return seq_cst add at %out + 4 x i64 (32 bytes) with the atomicrmw tagged
; !amdgpu.no.remote.memory !0. The checked instruction sequences are the same as
; those of global_atomic_add_i64_noret_offset, which has no such metadata.
1246define void @global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
1247; SI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
1248; SI:       ; %bb.0:
1249; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1250; SI-NEXT:    s_mov_b32 s6, 0
1251; SI-NEXT:    s_mov_b32 s7, 0xf000
1252; SI-NEXT:    s_mov_b32 s4, s6
1253; SI-NEXT:    s_mov_b32 s5, s6
1254; SI-NEXT:    buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
1255; SI-NEXT:    s_waitcnt vmcnt(0)
1256; SI-NEXT:    buffer_wbinvl1
1257; SI-NEXT:    s_waitcnt expcnt(0)
1258; SI-NEXT:    s_setpc_b64 s[30:31]
1259;
1260; VI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
1261; VI:       ; %bb.0:
1262; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1263; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1264; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1265; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
1266; VI-NEXT:    s_waitcnt vmcnt(0)
1267; VI-NEXT:    buffer_wbinvl1_vol
1268; VI-NEXT:    s_setpc_b64 s[30:31]
1269;
1270; GFX9-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
1271; GFX9:       ; %bb.0:
1272; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1273; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[2:3], off offset:32
1274; GFX9-NEXT:    s_waitcnt vmcnt(0)
1275; GFX9-NEXT:    buffer_wbinvl1_vol
1276; GFX9-NEXT:    s_setpc_b64 s[30:31]
1277  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1278  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
1279  ret void
1280}
1281
; Returning seq_cst add at %out + 4 x i64 (32 bytes) with the atomicrmw tagged
; !amdgpu.no.remote.memory !0. The checked instruction sequences are the same as
; those of global_atomic_add_i64_ret_offset, which has no such metadata.
1282define i64 @global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
1283; SI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
1284; SI:       ; %bb.0:
1285; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1286; SI-NEXT:    s_mov_b32 s6, 0
1287; SI-NEXT:    s_mov_b32 s7, 0xf000
1288; SI-NEXT:    s_mov_b32 s4, s6
1289; SI-NEXT:    s_mov_b32 s5, s6
1290; SI-NEXT:    buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
1291; SI-NEXT:    s_waitcnt vmcnt(0)
1292; SI-NEXT:    buffer_wbinvl1
1293; SI-NEXT:    v_mov_b32_e32 v0, v2
1294; SI-NEXT:    v_mov_b32_e32 v1, v3
1295; SI-NEXT:    s_waitcnt expcnt(0)
1296; SI-NEXT:    s_setpc_b64 s[30:31]
1297;
1298; VI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
1299; VI:       ; %bb.0:
1300; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1301; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1302; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1303; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
1304; VI-NEXT:    s_waitcnt vmcnt(0)
1305; VI-NEXT:    buffer_wbinvl1_vol
1306; VI-NEXT:    s_setpc_b64 s[30:31]
1307;
1308; GFX9-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
1309; GFX9:       ; %bb.0:
1310; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1311; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
1312; GFX9-NEXT:    s_waitcnt vmcnt(0)
1313; GFX9-NEXT:    buffer_wbinvl1_vol
1314; GFX9-NEXT:    s_setpc_b64 s[30:31]
1315  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1316  %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
1317  ret i64 %result
1318}
1319
1320; ---------------------------------------------------------------------
1321; atomicrmw sub
1322; ---------------------------------------------------------------------
1323
; Baseline no-return case for sub: seq_cst atomicrmw sub of an i64 directly at
; %ptr, with the result unused. Expected instruction per target:
; buffer_atomic_sub_x2 (addr64) on SI, flat_atomic_sub_x2 on VI,
; global_atomic_sub_x2 on GFX9, each followed by s_waitcnt vmcnt(0) and
; buffer_wbinvl1 / buffer_wbinvl1_vol.
1324define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
1325; SI-LABEL: global_atomic_sub_i64_noret:
1326; SI:       ; %bb.0:
1327; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1328; SI-NEXT:    s_mov_b32 s6, 0
1329; SI-NEXT:    s_mov_b32 s7, 0xf000
1330; SI-NEXT:    s_mov_b32 s4, s6
1331; SI-NEXT:    s_mov_b32 s5, s6
1332; SI-NEXT:    buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64
1333; SI-NEXT:    s_waitcnt vmcnt(0)
1334; SI-NEXT:    buffer_wbinvl1
1335; SI-NEXT:    s_waitcnt expcnt(0)
1336; SI-NEXT:    s_setpc_b64 s[30:31]
1337;
1338; VI-LABEL: global_atomic_sub_i64_noret:
1339; VI:       ; %bb.0:
1340; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1341; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
1342; VI-NEXT:    s_waitcnt vmcnt(0)
1343; VI-NEXT:    buffer_wbinvl1_vol
1344; VI-NEXT:    s_setpc_b64 s[30:31]
1345;
1346; GFX9-LABEL: global_atomic_sub_i64_noret:
1347; GFX9:       ; %bb.0:
1348; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1349; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[2:3], off
1350; GFX9-NEXT:    s_waitcnt vmcnt(0)
1351; GFX9-NEXT:    buffer_wbinvl1_vol
1352; GFX9-NEXT:    s_setpc_b64 s[30:31]
1353  %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
1354  ret void
1355}
1356
; No-return seq_cst sub of an i64 at %out + 4 x i64 (32 bytes). The SI and GFX9
; checks carry the offset on the instruction (offset:32); the VI checks add 32 to
; the address with v_add_u32/v_addc_u32 before the flat atomic.
1357define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
1358; SI-LABEL: global_atomic_sub_i64_noret_offset:
1359; SI:       ; %bb.0:
1360; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1361; SI-NEXT:    s_mov_b32 s6, 0
1362; SI-NEXT:    s_mov_b32 s7, 0xf000
1363; SI-NEXT:    s_mov_b32 s4, s6
1364; SI-NEXT:    s_mov_b32 s5, s6
1365; SI-NEXT:    buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
1366; SI-NEXT:    s_waitcnt vmcnt(0)
1367; SI-NEXT:    buffer_wbinvl1
1368; SI-NEXT:    s_waitcnt expcnt(0)
1369; SI-NEXT:    s_setpc_b64 s[30:31]
1370;
1371; VI-LABEL: global_atomic_sub_i64_noret_offset:
1372; VI:       ; %bb.0:
1373; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1374; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1375; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1376; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
1377; VI-NEXT:    s_waitcnt vmcnt(0)
1378; VI-NEXT:    buffer_wbinvl1_vol
1379; VI-NEXT:    s_setpc_b64 s[30:31]
1380;
1381; GFX9-LABEL: global_atomic_sub_i64_noret_offset:
1382; GFX9:       ; %bb.0:
1383; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1384; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[2:3], off offset:32
1385; GFX9-NEXT:    s_waitcnt vmcnt(0)
1386; GFX9-NEXT:    buffer_wbinvl1_vol
1387; GFX9-NEXT:    s_setpc_b64 s[30:31]
1388  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1389  %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
1390  ret void
1391}
1392
; Returning seq_cst sub of an i64 directly at %ptr: the previous value is the
; function result, and every checked atomic carries glc. On SI the old value
; arrives in v[2:3] and is copied to v[0:1]; VI and GFX9 write v[0:1] directly.
1393define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
1394; SI-LABEL: global_atomic_sub_i64_ret:
1395; SI:       ; %bb.0:
1396; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1397; SI-NEXT:    s_mov_b32 s6, 0
1398; SI-NEXT:    s_mov_b32 s7, 0xf000
1399; SI-NEXT:    s_mov_b32 s4, s6
1400; SI-NEXT:    s_mov_b32 s5, s6
1401; SI-NEXT:    buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
1402; SI-NEXT:    s_waitcnt vmcnt(0)
1403; SI-NEXT:    buffer_wbinvl1
1404; SI-NEXT:    v_mov_b32_e32 v0, v2
1405; SI-NEXT:    v_mov_b32_e32 v1, v3
1406; SI-NEXT:    s_waitcnt expcnt(0)
1407; SI-NEXT:    s_setpc_b64 s[30:31]
1408;
1409; VI-LABEL: global_atomic_sub_i64_ret:
1410; VI:       ; %bb.0:
1411; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1412; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
1413; VI-NEXT:    s_waitcnt vmcnt(0)
1414; VI-NEXT:    buffer_wbinvl1_vol
1415; VI-NEXT:    s_setpc_b64 s[30:31]
1416;
1417; GFX9-LABEL: global_atomic_sub_i64_ret:
1418; GFX9:       ; %bb.0:
1419; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1420; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off glc
1421; GFX9-NEXT:    s_waitcnt vmcnt(0)
1422; GFX9-NEXT:    buffer_wbinvl1_vol
1423; GFX9-NEXT:    s_setpc_b64 s[30:31]
1424  %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
1425  ret i64 %result
1426}
1427
; Returning seq_cst sub of an i64 at %out + 4 x i64 (32 bytes). Combines the
; offset handling (offset:32 on SI/GFX9, explicit 64-bit address add on VI) with
; the glc-tagged returning form of the atomic.
1428define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
1429; SI-LABEL: global_atomic_sub_i64_ret_offset:
1430; SI:       ; %bb.0:
1431; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1432; SI-NEXT:    s_mov_b32 s6, 0
1433; SI-NEXT:    s_mov_b32 s7, 0xf000
1434; SI-NEXT:    s_mov_b32 s4, s6
1435; SI-NEXT:    s_mov_b32 s5, s6
1436; SI-NEXT:    buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
1437; SI-NEXT:    s_waitcnt vmcnt(0)
1438; SI-NEXT:    buffer_wbinvl1
1439; SI-NEXT:    v_mov_b32_e32 v0, v2
1440; SI-NEXT:    v_mov_b32_e32 v1, v3
1441; SI-NEXT:    s_waitcnt expcnt(0)
1442; SI-NEXT:    s_setpc_b64 s[30:31]
1443;
1444; VI-LABEL: global_atomic_sub_i64_ret_offset:
1445; VI:       ; %bb.0:
1446; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1447; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1448; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1449; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
1450; VI-NEXT:    s_waitcnt vmcnt(0)
1451; VI-NEXT:    buffer_wbinvl1_vol
1452; VI-NEXT:    s_setpc_b64 s[30:31]
1453;
1454; GFX9-LABEL: global_atomic_sub_i64_ret_offset:
1455; GFX9:       ; %bb.0:
1456; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1457; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
1458; GFX9-NEXT:    s_waitcnt vmcnt(0)
1459; GFX9-NEXT:    buffer_wbinvl1_vol
1460; GFX9-NEXT:    s_setpc_b64 s[30:31]
1461  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1462  %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
1463  ret i64 %result
1464}
1465
; amdgpu_gfx variant with %ptr and %in passed inreg; no-return seq_cst sub of an
; i64 at %ptr. The checks read the pointer from s[4:5] and the value from s[6:7].
; In the SI checks s6/s7 are saved into lanes of v2 (v2 itself is spilled around
; the body) before being overwritten to fill in the s[4:7] operand of the buffer
; atomic, and are restored afterwards. VI copies the pointer into v[2:3] for the
; flat atomic; GFX9 uses s[4:5] as the base with a zero VGPR offset (v2 = 0).
1466define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
1467; SI-LABEL: global_atomic_sub_i64_noret_scalar:
1468; SI:       ; %bb.0:
1469; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1470; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1471; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1472; SI-NEXT:    s_mov_b64 exec, s[34:35]
1473; SI-NEXT:    s_waitcnt expcnt(0)
1474; SI-NEXT:    v_writelane_b32 v2, s6, 0
1475; SI-NEXT:    v_writelane_b32 v2, s7, 1
1476; SI-NEXT:    s_mov_b32 s34, s7
1477; SI-NEXT:    s_mov_b32 s35, s6
1478; SI-NEXT:    s_mov_b32 s7, 0xf000
1479; SI-NEXT:    s_mov_b32 s6, -1
1480; SI-NEXT:    v_mov_b32_e32 v0, s35
1481; SI-NEXT:    v_mov_b32_e32 v1, s34
1482; SI-NEXT:    s_waitcnt vmcnt(0)
1483; SI-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0
1484; SI-NEXT:    s_waitcnt vmcnt(0)
1485; SI-NEXT:    buffer_wbinvl1
1486; SI-NEXT:    v_readlane_b32 s7, v2, 1
1487; SI-NEXT:    v_readlane_b32 s6, v2, 0
1488; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1489; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1490; SI-NEXT:    s_mov_b64 exec, s[34:35]
1491; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1492; SI-NEXT:    s_setpc_b64 s[30:31]
1493;
1494; VI-LABEL: global_atomic_sub_i64_noret_scalar:
1495; VI:       ; %bb.0:
1496; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1497; VI-NEXT:    v_mov_b32_e32 v0, s6
1498; VI-NEXT:    v_mov_b32_e32 v1, s7
1499; VI-NEXT:    v_mov_b32_e32 v2, s4
1500; VI-NEXT:    v_mov_b32_e32 v3, s5
1501; VI-NEXT:    flat_atomic_sub_x2 v[2:3], v[0:1]
1502; VI-NEXT:    s_waitcnt vmcnt(0)
1503; VI-NEXT:    buffer_wbinvl1_vol
1504; VI-NEXT:    s_setpc_b64 s[30:31]
1505;
1506; GFX9-LABEL: global_atomic_sub_i64_noret_scalar:
1507; GFX9:       ; %bb.0:
1508; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1509; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1510; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1511; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1512; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[4:5]
1513; GFX9-NEXT:    s_waitcnt vmcnt(0)
1514; GFX9-NEXT:    buffer_wbinvl1_vol
1515; GFX9-NEXT:    s_setpc_b64 s[30:31]
1516  %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
1517  ret void
1518}
1519
; Result-unused seq_cst 'atomicrmw sub' on the i64 located 4 elements (32 bytes)
; past %out. Pointer and operand are both 'inreg' under the amdgpu_gfx calling
; convention, so the expected code copies the operand from s[6:7] into VGPRs.
; The displacement is folded as 'offset:32' on the SI and GFX9 runs, while the
; VI run forms the address with s_add_u32/s_addc_u32 before the flat atomic.
; The SI run additionally spills s6/s7 into lanes of v2 because it overwrites
; them to build the s[4:7] buffer resource descriptor.
1520define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
1521; SI-LABEL: global_atomic_sub_i64_noret_offset_scalar:
1522; SI:       ; %bb.0:
1523; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1524; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1525; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1526; SI-NEXT:    s_mov_b64 exec, s[34:35]
1527; SI-NEXT:    s_waitcnt expcnt(0)
1528; SI-NEXT:    v_writelane_b32 v2, s6, 0
1529; SI-NEXT:    v_writelane_b32 v2, s7, 1
1530; SI-NEXT:    v_mov_b32_e32 v0, s6
1531; SI-NEXT:    v_mov_b32_e32 v1, s7
1532; SI-NEXT:    s_mov_b32 s7, 0xf000
1533; SI-NEXT:    s_mov_b32 s6, -1
1534; SI-NEXT:    s_waitcnt vmcnt(0)
1535; SI-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32
1536; SI-NEXT:    s_waitcnt vmcnt(0)
1537; SI-NEXT:    buffer_wbinvl1
1538; SI-NEXT:    v_readlane_b32 s7, v2, 1
1539; SI-NEXT:    v_readlane_b32 s6, v2, 0
1540; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1541; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1542; SI-NEXT:    s_mov_b64 exec, s[34:35]
1543; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1544; SI-NEXT:    s_setpc_b64 s[30:31]
1545;
1546; VI-LABEL: global_atomic_sub_i64_noret_offset_scalar:
1547; VI:       ; %bb.0:
1548; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1549; VI-NEXT:    s_add_u32 s34, s4, 32
1550; VI-NEXT:    s_addc_u32 s35, s5, 0
1551; VI-NEXT:    v_mov_b32_e32 v2, s34
1552; VI-NEXT:    v_mov_b32_e32 v0, s6
1553; VI-NEXT:    v_mov_b32_e32 v1, s7
1554; VI-NEXT:    v_mov_b32_e32 v3, s35
1555; VI-NEXT:    flat_atomic_sub_x2 v[2:3], v[0:1]
1556; VI-NEXT:    s_waitcnt vmcnt(0)
1557; VI-NEXT:    buffer_wbinvl1_vol
1558; VI-NEXT:    s_setpc_b64 s[30:31]
1559;
1560; GFX9-LABEL: global_atomic_sub_i64_noret_offset_scalar:
1561; GFX9:       ; %bb.0:
1562; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1563; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1564; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1565; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1566; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32
1567; GFX9-NEXT:    s_waitcnt vmcnt(0)
1568; GFX9-NEXT:    buffer_wbinvl1_vol
1569; GFX9-NEXT:    s_setpc_b64 s[30:31]
1570  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1571  %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
1572  ret void
1573}
1574
; Value-returning seq_cst 'atomicrmw sub' on the i64 at %ptr, with 'inreg'
; pointer and operand under the amdgpu_gfx calling convention. Every run checks
; the returning form of the atomic (note the 'glc' modifier) writing the old
; value to v[0:1], which is the register pair left live at the return.
; The SI run saves s6/s7 in lanes of v2 around its use of s[4:7] as the buffer
; resource descriptor.
1575define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
1576; SI-LABEL: global_atomic_sub_i64_ret_scalar:
1577; SI:       ; %bb.0:
1578; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1579; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1580; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1581; SI-NEXT:    s_mov_b64 exec, s[34:35]
1582; SI-NEXT:    s_waitcnt expcnt(0)
1583; SI-NEXT:    v_writelane_b32 v2, s6, 0
1584; SI-NEXT:    v_writelane_b32 v2, s7, 1
1585; SI-NEXT:    s_mov_b32 s34, s7
1586; SI-NEXT:    s_mov_b32 s35, s6
1587; SI-NEXT:    s_mov_b32 s7, 0xf000
1588; SI-NEXT:    s_mov_b32 s6, -1
1589; SI-NEXT:    v_mov_b32_e32 v0, s35
1590; SI-NEXT:    v_mov_b32_e32 v1, s34
1591; SI-NEXT:    s_waitcnt vmcnt(0)
1592; SI-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc
1593; SI-NEXT:    s_waitcnt vmcnt(0)
1594; SI-NEXT:    buffer_wbinvl1
1595; SI-NEXT:    v_readlane_b32 s7, v2, 1
1596; SI-NEXT:    v_readlane_b32 s6, v2, 0
1597; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1598; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1599; SI-NEXT:    s_mov_b64 exec, s[34:35]
1600; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1601; SI-NEXT:    s_setpc_b64 s[30:31]
1602;
1603; VI-LABEL: global_atomic_sub_i64_ret_scalar:
1604; VI:       ; %bb.0:
1605; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1606; VI-NEXT:    v_mov_b32_e32 v0, s6
1607; VI-NEXT:    v_mov_b32_e32 v1, s7
1608; VI-NEXT:    v_mov_b32_e32 v2, s4
1609; VI-NEXT:    v_mov_b32_e32 v3, s5
1610; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
1611; VI-NEXT:    s_waitcnt vmcnt(0)
1612; VI-NEXT:    buffer_wbinvl1_vol
1613; VI-NEXT:    s_setpc_b64 s[30:31]
1614;
1615; GFX9-LABEL: global_atomic_sub_i64_ret_scalar:
1616; GFX9:       ; %bb.0:
1617; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1618; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1619; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1620; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1621; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc
1622; GFX9-NEXT:    s_waitcnt vmcnt(0)
1623; GFX9-NEXT:    buffer_wbinvl1_vol
1624; GFX9-NEXT:    s_setpc_b64 s[30:31]
1625  %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
1626  ret i64 %result
1627}
1628
; Value-returning seq_cst 'atomicrmw sub' on the i64 located 4 elements
; (32 bytes) past %out, with 'inreg' pointer and operand under amdgpu_gfx.
; Expected output is the returning ('glc') atomic with the old value in v[0:1];
; the displacement appears as 'offset:32' on the SI and GFX9 runs and as an
; explicit s_add_u32/s_addc_u32 pair on the VI run.
1629define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
1630; SI-LABEL: global_atomic_sub_i64_ret_offset_scalar:
1631; SI:       ; %bb.0:
1632; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1633; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1634; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1635; SI-NEXT:    s_mov_b64 exec, s[34:35]
1636; SI-NEXT:    s_waitcnt expcnt(0)
1637; SI-NEXT:    v_writelane_b32 v2, s6, 0
1638; SI-NEXT:    v_writelane_b32 v2, s7, 1
1639; SI-NEXT:    v_mov_b32_e32 v0, s6
1640; SI-NEXT:    v_mov_b32_e32 v1, s7
1641; SI-NEXT:    s_mov_b32 s7, 0xf000
1642; SI-NEXT:    s_mov_b32 s6, -1
1643; SI-NEXT:    s_waitcnt vmcnt(0)
1644; SI-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc
1645; SI-NEXT:    s_waitcnt vmcnt(0)
1646; SI-NEXT:    buffer_wbinvl1
1647; SI-NEXT:    v_readlane_b32 s7, v2, 1
1648; SI-NEXT:    v_readlane_b32 s6, v2, 0
1649; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1650; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1651; SI-NEXT:    s_mov_b64 exec, s[34:35]
1652; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1653; SI-NEXT:    s_setpc_b64 s[30:31]
1654;
1655; VI-LABEL: global_atomic_sub_i64_ret_offset_scalar:
1656; VI:       ; %bb.0:
1657; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1658; VI-NEXT:    s_add_u32 s34, s4, 32
1659; VI-NEXT:    s_addc_u32 s35, s5, 0
1660; VI-NEXT:    v_mov_b32_e32 v2, s34
1661; VI-NEXT:    v_mov_b32_e32 v0, s6
1662; VI-NEXT:    v_mov_b32_e32 v1, s7
1663; VI-NEXT:    v_mov_b32_e32 v3, s35
1664; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
1665; VI-NEXT:    s_waitcnt vmcnt(0)
1666; VI-NEXT:    buffer_wbinvl1_vol
1667; VI-NEXT:    s_setpc_b64 s[30:31]
1668;
1669; GFX9-LABEL: global_atomic_sub_i64_ret_offset_scalar:
1670; GFX9:       ; %bb.0:
1671; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1672; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1673; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1674; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1675; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
1676; GFX9-NEXT:    s_waitcnt vmcnt(0)
1677; GFX9-NEXT:    buffer_wbinvl1_vol
1678; GFX9-NEXT:    s_setpc_b64 s[30:31]
1679  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1680  %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
1681  ret i64 %result
1682}
1683
; Result-unused seq_cst 'atomicrmw sub' on the i64 located 4 elements (32 bytes)
; past %out, with the atomicrmw tagged !amdgpu.no.remote.memory (node !0 is
; defined outside this excerpt). Arguments arrive in VGPRs (default calling
; convention); every run expects a single native sub atomic rather than an
; expansion.
1684define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
1685; SI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
1686; SI:       ; %bb.0:
1687; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1688; SI-NEXT:    s_mov_b32 s6, 0
1689; SI-NEXT:    s_mov_b32 s7, 0xf000
1690; SI-NEXT:    s_mov_b32 s4, s6
1691; SI-NEXT:    s_mov_b32 s5, s6
1692; SI-NEXT:    buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
1693; SI-NEXT:    s_waitcnt vmcnt(0)
1694; SI-NEXT:    buffer_wbinvl1
1695; SI-NEXT:    s_waitcnt expcnt(0)
1696; SI-NEXT:    s_setpc_b64 s[30:31]
1697;
1698; VI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
1699; VI:       ; %bb.0:
1700; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1701; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1702; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1703; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
1704; VI-NEXT:    s_waitcnt vmcnt(0)
1705; VI-NEXT:    buffer_wbinvl1_vol
1706; VI-NEXT:    s_setpc_b64 s[30:31]
1707;
1708; GFX9-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
1709; GFX9:       ; %bb.0:
1710; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1711; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[2:3], off offset:32
1712; GFX9-NEXT:    s_waitcnt vmcnt(0)
1713; GFX9-NEXT:    buffer_wbinvl1_vol
1714; GFX9-NEXT:    s_setpc_b64 s[30:31]
1715  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1716  %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
1717  ret void
1718}
1719
; Value-returning seq_cst 'atomicrmw sub' on the i64 located 4 elements
; (32 bytes) past %out, tagged !amdgpu.no.remote.memory (node !0 is defined
; outside this excerpt). Every run expects one returning ('glc') native atomic.
; On the SI run the old value comes back in v[2:3] and is then copied to v[0:1];
; the VI and GFX9 runs write v[0:1] directly.
1720define i64 @global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
1721; SI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
1722; SI:       ; %bb.0:
1723; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1724; SI-NEXT:    s_mov_b32 s6, 0
1725; SI-NEXT:    s_mov_b32 s7, 0xf000
1726; SI-NEXT:    s_mov_b32 s4, s6
1727; SI-NEXT:    s_mov_b32 s5, s6
1728; SI-NEXT:    buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
1729; SI-NEXT:    s_waitcnt vmcnt(0)
1730; SI-NEXT:    buffer_wbinvl1
1731; SI-NEXT:    v_mov_b32_e32 v0, v2
1732; SI-NEXT:    v_mov_b32_e32 v1, v3
1733; SI-NEXT:    s_waitcnt expcnt(0)
1734; SI-NEXT:    s_setpc_b64 s[30:31]
1735;
1736; VI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
1737; VI:       ; %bb.0:
1738; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1739; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1740; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1741; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
1742; VI-NEXT:    s_waitcnt vmcnt(0)
1743; VI-NEXT:    buffer_wbinvl1_vol
1744; VI-NEXT:    s_setpc_b64 s[30:31]
1745;
1746; GFX9-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
1747; GFX9:       ; %bb.0:
1748; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1749; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
1750; GFX9-NEXT:    s_waitcnt vmcnt(0)
1751; GFX9-NEXT:    buffer_wbinvl1_vol
1752; GFX9-NEXT:    s_setpc_b64 s[30:31]
1753  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1754  %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
1755  ret i64 %result
1756}
1757
1758; ---------------------------------------------------------------------
1759; atomicrmw and
1760; ---------------------------------------------------------------------
1761
; Baseline case for 'and': result-unused seq_cst 'atomicrmw and' on the i64 at
; %ptr, arguments in VGPRs (pointer v[0:1], operand v[2:3]). Each run expects a
; single non-returning and_x2 atomic followed by a vmcnt wait and an L1
; invalidate (buffer_wbinvl1 / buffer_wbinvl1_vol).
1762define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
1763; SI-LABEL: global_atomic_and_i64_noret:
1764; SI:       ; %bb.0:
1765; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1766; SI-NEXT:    s_mov_b32 s6, 0
1767; SI-NEXT:    s_mov_b32 s7, 0xf000
1768; SI-NEXT:    s_mov_b32 s4, s6
1769; SI-NEXT:    s_mov_b32 s5, s6
1770; SI-NEXT:    buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64
1771; SI-NEXT:    s_waitcnt vmcnt(0)
1772; SI-NEXT:    buffer_wbinvl1
1773; SI-NEXT:    s_waitcnt expcnt(0)
1774; SI-NEXT:    s_setpc_b64 s[30:31]
1775;
1776; VI-LABEL: global_atomic_and_i64_noret:
1777; VI:       ; %bb.0:
1778; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1779; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
1780; VI-NEXT:    s_waitcnt vmcnt(0)
1781; VI-NEXT:    buffer_wbinvl1_vol
1782; VI-NEXT:    s_setpc_b64 s[30:31]
1783;
1784; GFX9-LABEL: global_atomic_and_i64_noret:
1785; GFX9:       ; %bb.0:
1786; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1787; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[2:3], off
1788; GFX9-NEXT:    s_waitcnt vmcnt(0)
1789; GFX9-NEXT:    buffer_wbinvl1_vol
1790; GFX9-NEXT:    s_setpc_b64 s[30:31]
1791  %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
1792  ret void
1793}
1794
; Result-unused seq_cst 'atomicrmw and' on the i64 located 4 elements (32 bytes)
; past %out, arguments in VGPRs. The SI and GFX9 runs fold the displacement into
; the instruction as 'offset:32'; the VI run adds 32 to the 64-bit address with
; v_add_u32/v_addc_u32 before issuing the flat atomic.
1795define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
1796; SI-LABEL: global_atomic_and_i64_noret_offset:
1797; SI:       ; %bb.0:
1798; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1799; SI-NEXT:    s_mov_b32 s6, 0
1800; SI-NEXT:    s_mov_b32 s7, 0xf000
1801; SI-NEXT:    s_mov_b32 s4, s6
1802; SI-NEXT:    s_mov_b32 s5, s6
1803; SI-NEXT:    buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
1804; SI-NEXT:    s_waitcnt vmcnt(0)
1805; SI-NEXT:    buffer_wbinvl1
1806; SI-NEXT:    s_waitcnt expcnt(0)
1807; SI-NEXT:    s_setpc_b64 s[30:31]
1808;
1809; VI-LABEL: global_atomic_and_i64_noret_offset:
1810; VI:       ; %bb.0:
1811; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1812; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1813; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1814; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
1815; VI-NEXT:    s_waitcnt vmcnt(0)
1816; VI-NEXT:    buffer_wbinvl1_vol
1817; VI-NEXT:    s_setpc_b64 s[30:31]
1818;
1819; GFX9-LABEL: global_atomic_and_i64_noret_offset:
1820; GFX9:       ; %bb.0:
1821; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1822; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[2:3], off offset:32
1823; GFX9-NEXT:    s_waitcnt vmcnt(0)
1824; GFX9-NEXT:    buffer_wbinvl1_vol
1825; GFX9-NEXT:    s_setpc_b64 s[30:31]
1826  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1827  %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
1828  ret void
1829}
1830
; Value-returning seq_cst 'atomicrmw and' on the i64 at %ptr, arguments in
; VGPRs. Every run expects the returning ('glc') and_x2 atomic. The SI run gets
; the old value in v[2:3] and copies it to v[0:1]; the VI and GFX9 runs write
; v[0:1] directly.
1831define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
1832; SI-LABEL: global_atomic_and_i64_ret:
1833; SI:       ; %bb.0:
1834; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1835; SI-NEXT:    s_mov_b32 s6, 0
1836; SI-NEXT:    s_mov_b32 s7, 0xf000
1837; SI-NEXT:    s_mov_b32 s4, s6
1838; SI-NEXT:    s_mov_b32 s5, s6
1839; SI-NEXT:    buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
1840; SI-NEXT:    s_waitcnt vmcnt(0)
1841; SI-NEXT:    buffer_wbinvl1
1842; SI-NEXT:    v_mov_b32_e32 v0, v2
1843; SI-NEXT:    v_mov_b32_e32 v1, v3
1844; SI-NEXT:    s_waitcnt expcnt(0)
1845; SI-NEXT:    s_setpc_b64 s[30:31]
1846;
1847; VI-LABEL: global_atomic_and_i64_ret:
1848; VI:       ; %bb.0:
1849; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1850; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
1851; VI-NEXT:    s_waitcnt vmcnt(0)
1852; VI-NEXT:    buffer_wbinvl1_vol
1853; VI-NEXT:    s_setpc_b64 s[30:31]
1854;
1855; GFX9-LABEL: global_atomic_and_i64_ret:
1856; GFX9:       ; %bb.0:
1857; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1858; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off glc
1859; GFX9-NEXT:    s_waitcnt vmcnt(0)
1860; GFX9-NEXT:    buffer_wbinvl1_vol
1861; GFX9-NEXT:    s_setpc_b64 s[30:31]
1862  %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
1863  ret i64 %result
1864}
1865
; Value-returning seq_cst 'atomicrmw and' on the i64 located 4 elements
; (32 bytes) past %out, arguments in VGPRs. Expected output is the returning
; ('glc') and_x2 atomic with the displacement as 'offset:32' on the SI and GFX9
; runs, and as a v_add_u32/v_addc_u32 address computation on the VI run.
1866define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
1867; SI-LABEL: global_atomic_and_i64_ret_offset:
1868; SI:       ; %bb.0:
1869; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1870; SI-NEXT:    s_mov_b32 s6, 0
1871; SI-NEXT:    s_mov_b32 s7, 0xf000
1872; SI-NEXT:    s_mov_b32 s4, s6
1873; SI-NEXT:    s_mov_b32 s5, s6
1874; SI-NEXT:    buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
1875; SI-NEXT:    s_waitcnt vmcnt(0)
1876; SI-NEXT:    buffer_wbinvl1
1877; SI-NEXT:    v_mov_b32_e32 v0, v2
1878; SI-NEXT:    v_mov_b32_e32 v1, v3
1879; SI-NEXT:    s_waitcnt expcnt(0)
1880; SI-NEXT:    s_setpc_b64 s[30:31]
1881;
1882; VI-LABEL: global_atomic_and_i64_ret_offset:
1883; VI:       ; %bb.0:
1884; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1885; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1886; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1887; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
1888; VI-NEXT:    s_waitcnt vmcnt(0)
1889; VI-NEXT:    buffer_wbinvl1_vol
1890; VI-NEXT:    s_setpc_b64 s[30:31]
1891;
1892; GFX9-LABEL: global_atomic_and_i64_ret_offset:
1893; GFX9:       ; %bb.0:
1894; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1895; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
1896; GFX9-NEXT:    s_waitcnt vmcnt(0)
1897; GFX9-NEXT:    buffer_wbinvl1_vol
1898; GFX9-NEXT:    s_setpc_b64 s[30:31]
1899  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1900  %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
1901  ret i64 %result
1902}
1903
; Result-unused seq_cst 'atomicrmw and' on the i64 at %ptr, with 'inreg'
; pointer (s[4:5]) and operand (s[6:7]) under the amdgpu_gfx calling convention.
; The operand is copied into VGPRs before the atomic on every run. The SI run
; reuses s[4:7] as the buffer resource descriptor, so it first saves s6/s7 in
; lanes of v2 (itself spilled to the stack) and restores them before returning.
1904define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
1905; SI-LABEL: global_atomic_and_i64_noret_scalar:
1906; SI:       ; %bb.0:
1907; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1908; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1909; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1910; SI-NEXT:    s_mov_b64 exec, s[34:35]
1911; SI-NEXT:    s_waitcnt expcnt(0)
1912; SI-NEXT:    v_writelane_b32 v2, s6, 0
1913; SI-NEXT:    v_writelane_b32 v2, s7, 1
1914; SI-NEXT:    s_mov_b32 s34, s7
1915; SI-NEXT:    s_mov_b32 s35, s6
1916; SI-NEXT:    s_mov_b32 s7, 0xf000
1917; SI-NEXT:    s_mov_b32 s6, -1
1918; SI-NEXT:    v_mov_b32_e32 v0, s35
1919; SI-NEXT:    v_mov_b32_e32 v1, s34
1920; SI-NEXT:    s_waitcnt vmcnt(0)
1921; SI-NEXT:    buffer_atomic_and_x2 v[0:1], off, s[4:7], 0
1922; SI-NEXT:    s_waitcnt vmcnt(0)
1923; SI-NEXT:    buffer_wbinvl1
1924; SI-NEXT:    v_readlane_b32 s7, v2, 1
1925; SI-NEXT:    v_readlane_b32 s6, v2, 0
1926; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1927; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1928; SI-NEXT:    s_mov_b64 exec, s[34:35]
1929; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1930; SI-NEXT:    s_setpc_b64 s[30:31]
1931;
1932; VI-LABEL: global_atomic_and_i64_noret_scalar:
1933; VI:       ; %bb.0:
1934; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1935; VI-NEXT:    v_mov_b32_e32 v0, s6
1936; VI-NEXT:    v_mov_b32_e32 v1, s7
1937; VI-NEXT:    v_mov_b32_e32 v2, s4
1938; VI-NEXT:    v_mov_b32_e32 v3, s5
1939; VI-NEXT:    flat_atomic_and_x2 v[2:3], v[0:1]
1940; VI-NEXT:    s_waitcnt vmcnt(0)
1941; VI-NEXT:    buffer_wbinvl1_vol
1942; VI-NEXT:    s_setpc_b64 s[30:31]
1943;
1944; GFX9-LABEL: global_atomic_and_i64_noret_scalar:
1945; GFX9:       ; %bb.0:
1946; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1947; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1948; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1949; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1950; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[4:5]
1951; GFX9-NEXT:    s_waitcnt vmcnt(0)
1952; GFX9-NEXT:    buffer_wbinvl1_vol
1953; GFX9-NEXT:    s_setpc_b64 s[30:31]
1954  %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
1955  ret void
1956}
1957
; Result-unused seq_cst 'atomicrmw and' on the i64 located 4 elements (32 bytes)
; past %out, with 'inreg' pointer and operand under amdgpu_gfx. The SI and GFX9
; runs encode the displacement as 'offset:32' against the scalar base in
; s[4:5]/s[4:7]; the VI run computes base+32 with s_add_u32/s_addc_u32 into
; s[34:35] and moves it to VGPRs for the flat atomic.
1958define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
1959; SI-LABEL: global_atomic_and_i64_noret_offset_scalar:
1960; SI:       ; %bb.0:
1961; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1962; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1963; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1964; SI-NEXT:    s_mov_b64 exec, s[34:35]
1965; SI-NEXT:    s_waitcnt expcnt(0)
1966; SI-NEXT:    v_writelane_b32 v2, s6, 0
1967; SI-NEXT:    v_writelane_b32 v2, s7, 1
1968; SI-NEXT:    v_mov_b32_e32 v0, s6
1969; SI-NEXT:    v_mov_b32_e32 v1, s7
1970; SI-NEXT:    s_mov_b32 s7, 0xf000
1971; SI-NEXT:    s_mov_b32 s6, -1
1972; SI-NEXT:    s_waitcnt vmcnt(0)
1973; SI-NEXT:    buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32
1974; SI-NEXT:    s_waitcnt vmcnt(0)
1975; SI-NEXT:    buffer_wbinvl1
1976; SI-NEXT:    v_readlane_b32 s7, v2, 1
1977; SI-NEXT:    v_readlane_b32 s6, v2, 0
1978; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
1979; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1980; SI-NEXT:    s_mov_b64 exec, s[34:35]
1981; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1982; SI-NEXT:    s_setpc_b64 s[30:31]
1983;
1984; VI-LABEL: global_atomic_and_i64_noret_offset_scalar:
1985; VI:       ; %bb.0:
1986; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1987; VI-NEXT:    s_add_u32 s34, s4, 32
1988; VI-NEXT:    s_addc_u32 s35, s5, 0
1989; VI-NEXT:    v_mov_b32_e32 v2, s34
1990; VI-NEXT:    v_mov_b32_e32 v0, s6
1991; VI-NEXT:    v_mov_b32_e32 v1, s7
1992; VI-NEXT:    v_mov_b32_e32 v3, s35
1993; VI-NEXT:    flat_atomic_and_x2 v[2:3], v[0:1]
1994; VI-NEXT:    s_waitcnt vmcnt(0)
1995; VI-NEXT:    buffer_wbinvl1_vol
1996; VI-NEXT:    s_setpc_b64 s[30:31]
1997;
1998; GFX9-LABEL: global_atomic_and_i64_noret_offset_scalar:
1999; GFX9:       ; %bb.0:
2000; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2001; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2002; GFX9-NEXT:    v_mov_b32_e32 v1, s7
2003; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2004; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32
2005; GFX9-NEXT:    s_waitcnt vmcnt(0)
2006; GFX9-NEXT:    buffer_wbinvl1_vol
2007; GFX9-NEXT:    s_setpc_b64 s[30:31]
2008  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2009  %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
2010  ret void
2011}
2012
; Value-returning seq_cst 'atomicrmw and' on the i64 at %ptr, with 'inreg'
; pointer and operand under the amdgpu_gfx calling convention. Every run expects
; the returning ('glc') and_x2 atomic with the old value placed in v[0:1]; the
; SI run also saves and restores s6/s7 through lanes of v2 around its use of
; s[4:7] as the buffer resource descriptor.
2013define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
2014; SI-LABEL: global_atomic_and_i64_ret_scalar:
2015; SI:       ; %bb.0:
2016; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2017; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2018; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
2019; SI-NEXT:    s_mov_b64 exec, s[34:35]
2020; SI-NEXT:    s_waitcnt expcnt(0)
2021; SI-NEXT:    v_writelane_b32 v2, s6, 0
2022; SI-NEXT:    v_writelane_b32 v2, s7, 1
2023; SI-NEXT:    s_mov_b32 s34, s7
2024; SI-NEXT:    s_mov_b32 s35, s6
2025; SI-NEXT:    s_mov_b32 s7, 0xf000
2026; SI-NEXT:    s_mov_b32 s6, -1
2027; SI-NEXT:    v_mov_b32_e32 v0, s35
2028; SI-NEXT:    v_mov_b32_e32 v1, s34
2029; SI-NEXT:    s_waitcnt vmcnt(0)
2030; SI-NEXT:    buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 glc
2031; SI-NEXT:    s_waitcnt vmcnt(0)
2032; SI-NEXT:    buffer_wbinvl1
2033; SI-NEXT:    v_readlane_b32 s7, v2, 1
2034; SI-NEXT:    v_readlane_b32 s6, v2, 0
2035; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2036; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
2037; SI-NEXT:    s_mov_b64 exec, s[34:35]
2038; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2039; SI-NEXT:    s_setpc_b64 s[30:31]
2040;
2041; VI-LABEL: global_atomic_and_i64_ret_scalar:
2042; VI:       ; %bb.0:
2043; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2044; VI-NEXT:    v_mov_b32_e32 v0, s6
2045; VI-NEXT:    v_mov_b32_e32 v1, s7
2046; VI-NEXT:    v_mov_b32_e32 v2, s4
2047; VI-NEXT:    v_mov_b32_e32 v3, s5
2048; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
2049; VI-NEXT:    s_waitcnt vmcnt(0)
2050; VI-NEXT:    buffer_wbinvl1_vol
2051; VI-NEXT:    s_setpc_b64 s[30:31]
2052;
2053; GFX9-LABEL: global_atomic_and_i64_ret_scalar:
2054; GFX9:       ; %bb.0:
2055; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2056; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2057; GFX9-NEXT:    v_mov_b32_e32 v1, s7
2058; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2059; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc
2060; GFX9-NEXT:    s_waitcnt vmcnt(0)
2061; GFX9-NEXT:    buffer_wbinvl1_vol
2062; GFX9-NEXT:    s_setpc_b64 s[30:31]
2063  %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
2064  ret i64 %result
2065}
2066
; Value-returning seq_cst 'atomicrmw and' on the i64 located 4 elements
; (32 bytes) past %out, with 'inreg' pointer and operand under amdgpu_gfx.
; Combines the two variations exercised separately elsewhere in this group: the
; returning ('glc') atomic with its result in v[0:1], and the 32-byte
; displacement ('offset:32' on the SI and GFX9 runs, scalar add on the VI run).
2067define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
2068; SI-LABEL: global_atomic_and_i64_ret_offset_scalar:
2069; SI:       ; %bb.0:
2070; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2071; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2072; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
2073; SI-NEXT:    s_mov_b64 exec, s[34:35]
2074; SI-NEXT:    s_waitcnt expcnt(0)
2075; SI-NEXT:    v_writelane_b32 v2, s6, 0
2076; SI-NEXT:    v_writelane_b32 v2, s7, 1
2077; SI-NEXT:    v_mov_b32_e32 v0, s6
2078; SI-NEXT:    v_mov_b32_e32 v1, s7
2079; SI-NEXT:    s_mov_b32 s7, 0xf000
2080; SI-NEXT:    s_mov_b32 s6, -1
2081; SI-NEXT:    s_waitcnt vmcnt(0)
2082; SI-NEXT:    buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc
2083; SI-NEXT:    s_waitcnt vmcnt(0)
2084; SI-NEXT:    buffer_wbinvl1
2085; SI-NEXT:    v_readlane_b32 s7, v2, 1
2086; SI-NEXT:    v_readlane_b32 s6, v2, 0
2087; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2088; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
2089; SI-NEXT:    s_mov_b64 exec, s[34:35]
2090; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2091; SI-NEXT:    s_setpc_b64 s[30:31]
2092;
2093; VI-LABEL: global_atomic_and_i64_ret_offset_scalar:
2094; VI:       ; %bb.0:
2095; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2096; VI-NEXT:    s_add_u32 s34, s4, 32
2097; VI-NEXT:    s_addc_u32 s35, s5, 0
2098; VI-NEXT:    v_mov_b32_e32 v2, s34
2099; VI-NEXT:    v_mov_b32_e32 v0, s6
2100; VI-NEXT:    v_mov_b32_e32 v1, s7
2101; VI-NEXT:    v_mov_b32_e32 v3, s35
2102; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
2103; VI-NEXT:    s_waitcnt vmcnt(0)
2104; VI-NEXT:    buffer_wbinvl1_vol
2105; VI-NEXT:    s_setpc_b64 s[30:31]
2106;
2107; GFX9-LABEL: global_atomic_and_i64_ret_offset_scalar:
2108; GFX9:       ; %bb.0:
2109; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2110; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2111; GFX9-NEXT:    v_mov_b32_e32 v1, s7
2112; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2113; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
2114; GFX9-NEXT:    s_waitcnt vmcnt(0)
2115; GFX9-NEXT:    buffer_wbinvl1_vol
2116; GFX9-NEXT:    s_setpc_b64 s[30:31]
2117  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2118  %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
2119  ret i64 %result
2120}
2121
; Result-unused seq_cst 'atomicrmw and' on the i64 located 4 elements (32 bytes)
; past %out, with the atomicrmw tagged !amdgpu.no.remote.memory (node !0 is
; defined outside this excerpt). The expected instruction sequence on all three
; runs matches the one checked for @global_atomic_and_i64_noret_offset, i.e. a
; single native and_x2 atomic.
2122define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
2123; SI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
2124; SI:       ; %bb.0:
2125; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2126; SI-NEXT:    s_mov_b32 s6, 0
2127; SI-NEXT:    s_mov_b32 s7, 0xf000
2128; SI-NEXT:    s_mov_b32 s4, s6
2129; SI-NEXT:    s_mov_b32 s5, s6
2130; SI-NEXT:    buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
2131; SI-NEXT:    s_waitcnt vmcnt(0)
2132; SI-NEXT:    buffer_wbinvl1
2133; SI-NEXT:    s_waitcnt expcnt(0)
2134; SI-NEXT:    s_setpc_b64 s[30:31]
2135;
2136; VI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
2137; VI:       ; %bb.0:
2138; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2139; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
2140; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2141; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
2142; VI-NEXT:    s_waitcnt vmcnt(0)
2143; VI-NEXT:    buffer_wbinvl1_vol
2144; VI-NEXT:    s_setpc_b64 s[30:31]
2145;
2146; GFX9-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
2147; GFX9:       ; %bb.0:
2148; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2149; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[2:3], off offset:32
2150; GFX9-NEXT:    s_waitcnt vmcnt(0)
2151; GFX9-NEXT:    buffer_wbinvl1_vol
2152; GFX9-NEXT:    s_setpc_b64 s[30:31]
2153  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2154  %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
2155  ret void
2156}
2157
; Value-returning seq_cst 'atomicrmw and' on the i64 located 4 elements
; (32 bytes) past %out, with the atomicrmw tagged !amdgpu.no.remote.memory
; (node !0 is defined outside this excerpt). The expected instruction sequence
; on all three runs matches the one checked for
; @global_atomic_and_i64_ret_offset, i.e. one returning ('glc') and_x2 atomic.
2158define i64 @global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
2159; SI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
2160; SI:       ; %bb.0:
2161; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2162; SI-NEXT:    s_mov_b32 s6, 0
2163; SI-NEXT:    s_mov_b32 s7, 0xf000
2164; SI-NEXT:    s_mov_b32 s4, s6
2165; SI-NEXT:    s_mov_b32 s5, s6
2166; SI-NEXT:    buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
2167; SI-NEXT:    s_waitcnt vmcnt(0)
2168; SI-NEXT:    buffer_wbinvl1
2169; SI-NEXT:    v_mov_b32_e32 v0, v2
2170; SI-NEXT:    v_mov_b32_e32 v1, v3
2171; SI-NEXT:    s_waitcnt expcnt(0)
2172; SI-NEXT:    s_setpc_b64 s[30:31]
2173;
2174; VI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
2175; VI:       ; %bb.0:
2176; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2177; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
2178; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2179; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
2180; VI-NEXT:    s_waitcnt vmcnt(0)
2181; VI-NEXT:    buffer_wbinvl1_vol
2182; VI-NEXT:    s_setpc_b64 s[30:31]
2183;
2184; GFX9-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
2185; GFX9:       ; %bb.0:
2186; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2187; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
2188; GFX9-NEXT:    s_waitcnt vmcnt(0)
2189; GFX9-NEXT:    buffer_wbinvl1_vol
2190; GFX9-NEXT:    s_setpc_b64 s[30:31]
2191  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2192  %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
2193  ret i64 %result
2194}
2195
2196; ---------------------------------------------------------------------
2197; atomicrmw nand
2198; ---------------------------------------------------------------------
2199
; Result-unused seq_cst 'atomicrmw nand' on the i64 at %ptr, arguments in VGPRs.
; No nand atomic instruction appears in the expected output; instead every run
; checks a compare-and-swap loop between the %atomicrmw.start and
; %atomicrmw.end blocks:
;   1. load the current 64-bit value once before the loop,
;   2. compute ~(loaded & %in) per 32-bit half with v_and_b32 + v_not_b32,
;   3. issue cmpswap_x2 ('glc') with the new and the expected value,
;   4. compare the returned value against the expected one, OR the matching
;      lanes into an SGPR-pair mask, and clear those lanes from exec,
;   5. branch back while any lane remains active (s_cbranch_execnz), carrying
;      the returned value forward as the next expected value.
; exec is restored from the accumulated mask after the loop.
2200define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
2201; SI-LABEL: global_atomic_nand_i64_noret:
2202; SI:       ; %bb.0:
2203; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2204; SI-NEXT:    s_mov_b32 s6, 0
2205; SI-NEXT:    s_mov_b32 s7, 0xf000
2206; SI-NEXT:    s_mov_b32 s4, s6
2207; SI-NEXT:    s_mov_b32 s5, s6
2208; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
2209; SI-NEXT:    s_mov_b64 s[8:9], 0
2210; SI-NEXT:  .LBB50_1: ; %atomicrmw.start
2211; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2212; SI-NEXT:    s_waitcnt vmcnt(0)
2213; SI-NEXT:    v_and_b32_e32 v4, v7, v3
2214; SI-NEXT:    s_waitcnt expcnt(0)
2215; SI-NEXT:    v_and_b32_e32 v8, v6, v2
2216; SI-NEXT:    v_not_b32_e32 v5, v4
2217; SI-NEXT:    v_not_b32_e32 v4, v8
2218; SI-NEXT:    v_mov_b32_e32 v11, v7
2219; SI-NEXT:    v_mov_b32_e32 v10, v6
2220; SI-NEXT:    v_mov_b32_e32 v9, v5
2221; SI-NEXT:    v_mov_b32_e32 v8, v4
2222; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
2223; SI-NEXT:    s_waitcnt vmcnt(0)
2224; SI-NEXT:    buffer_wbinvl1
2225; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
2226; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
2227; SI-NEXT:    v_mov_b32_e32 v6, v8
2228; SI-NEXT:    v_mov_b32_e32 v7, v9
2229; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
2230; SI-NEXT:    s_cbranch_execnz .LBB50_1
2231; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2232; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
2233; SI-NEXT:    s_waitcnt expcnt(0)
2234; SI-NEXT:    s_setpc_b64 s[30:31]
2235;
2236; VI-LABEL: global_atomic_nand_i64_noret:
2237; VI:       ; %bb.0:
2238; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2239; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
2240; VI-NEXT:    s_mov_b64 s[4:5], 0
2241; VI-NEXT:  .LBB50_1: ; %atomicrmw.start
2242; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2243; VI-NEXT:    s_waitcnt vmcnt(0)
2244; VI-NEXT:    v_and_b32_e32 v4, v7, v3
2245; VI-NEXT:    v_and_b32_e32 v8, v6, v2
2246; VI-NEXT:    v_not_b32_e32 v5, v4
2247; VI-NEXT:    v_not_b32_e32 v4, v8
2248; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
2249; VI-NEXT:    s_waitcnt vmcnt(0)
2250; VI-NEXT:    buffer_wbinvl1_vol
2251; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2252; VI-NEXT:    v_mov_b32_e32 v7, v5
2253; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2254; VI-NEXT:    v_mov_b32_e32 v6, v4
2255; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2256; VI-NEXT:    s_cbranch_execnz .LBB50_1
2257; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2258; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
2259; VI-NEXT:    s_setpc_b64 s[30:31]
2260;
2261; GFX9-LABEL: global_atomic_nand_i64_noret:
2262; GFX9:       ; %bb.0:
2263; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2264; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2265; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2266; GFX9-NEXT:  .LBB50_1: ; %atomicrmw.start
2267; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2268; GFX9-NEXT:    s_waitcnt vmcnt(0)
2269; GFX9-NEXT:    v_and_b32_e32 v4, v7, v3
2270; GFX9-NEXT:    v_and_b32_e32 v8, v6, v2
2271; GFX9-NEXT:    v_not_b32_e32 v5, v4
2272; GFX9-NEXT:    v_not_b32_e32 v4, v8
2273; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
2274; GFX9-NEXT:    s_waitcnt vmcnt(0)
2275; GFX9-NEXT:    buffer_wbinvl1_vol
2276; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2277; GFX9-NEXT:    v_mov_b32_e32 v7, v5
2278; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2279; GFX9-NEXT:    v_mov_b32_e32 v6, v4
2280; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2281; GFX9-NEXT:    s_cbranch_execnz .LBB50_1
2282; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2283; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2284; GFX9-NEXT:    s_setpc_b64 s[30:31]
2285  %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst
2286  ret void
2287}
2288
; Test case: seq_cst `atomicrmw nand` of an i64 in global memory (addrspace 1),
; applied to element 4 of %out (a 32-byte displacement); the result is unused.
; For every RUN line the expected code is a compare-and-swap retry loop: the
; NAND is formed with v_and + v_not, and cmpswap_x2 is retried until the value
; it returns equals the value the NAND was computed from.
; The first and last target fold the displacement into `offset:32`; the middle
; (tonga) target adds 32 to the address registers before the loop.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
2289define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
2290; SI-LABEL: global_atomic_nand_i64_noret_offset:
2291; SI:       ; %bb.0:
2292; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2293; SI-NEXT:    s_mov_b32 s6, 0
2294; SI-NEXT:    s_mov_b32 s7, 0xf000
2295; SI-NEXT:    s_mov_b32 s4, s6
2296; SI-NEXT:    s_mov_b32 s5, s6
2297; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
2298; SI-NEXT:    s_mov_b64 s[8:9], 0
2299; SI-NEXT:  .LBB51_1: ; %atomicrmw.start
2300; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2301; SI-NEXT:    s_waitcnt vmcnt(0)
2302; SI-NEXT:    v_and_b32_e32 v4, v7, v3
2303; SI-NEXT:    s_waitcnt expcnt(0)
2304; SI-NEXT:    v_and_b32_e32 v8, v6, v2
2305; SI-NEXT:    v_not_b32_e32 v5, v4
2306; SI-NEXT:    v_not_b32_e32 v4, v8
2307; SI-NEXT:    v_mov_b32_e32 v11, v7
2308; SI-NEXT:    v_mov_b32_e32 v10, v6
2309; SI-NEXT:    v_mov_b32_e32 v9, v5
2310; SI-NEXT:    v_mov_b32_e32 v8, v4
2311; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
2312; SI-NEXT:    s_waitcnt vmcnt(0)
2313; SI-NEXT:    buffer_wbinvl1
2314; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
2315; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
2316; SI-NEXT:    v_mov_b32_e32 v6, v8
2317; SI-NEXT:    v_mov_b32_e32 v7, v9
2318; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
2319; SI-NEXT:    s_cbranch_execnz .LBB51_1
2320; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2321; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
2322; SI-NEXT:    s_waitcnt expcnt(0)
2323; SI-NEXT:    s_setpc_b64 s[30:31]
2324;
2325; VI-LABEL: global_atomic_nand_i64_noret_offset:
2326; VI:       ; %bb.0:
2327; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2328; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
2329; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2330; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
2331; VI-NEXT:    s_mov_b64 s[4:5], 0
2332; VI-NEXT:  .LBB51_1: ; %atomicrmw.start
2333; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2334; VI-NEXT:    s_waitcnt vmcnt(0)
2335; VI-NEXT:    v_and_b32_e32 v4, v7, v3
2336; VI-NEXT:    v_and_b32_e32 v8, v6, v2
2337; VI-NEXT:    v_not_b32_e32 v5, v4
2338; VI-NEXT:    v_not_b32_e32 v4, v8
2339; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
2340; VI-NEXT:    s_waitcnt vmcnt(0)
2341; VI-NEXT:    buffer_wbinvl1_vol
2342; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2343; VI-NEXT:    v_mov_b32_e32 v7, v5
2344; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2345; VI-NEXT:    v_mov_b32_e32 v6, v4
2346; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2347; VI-NEXT:    s_cbranch_execnz .LBB51_1
2348; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2349; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
2350; VI-NEXT:    s_setpc_b64 s[30:31]
2351;
2352; GFX9-LABEL: global_atomic_nand_i64_noret_offset:
2353; GFX9:       ; %bb.0:
2354; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2355; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
2356; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2357; GFX9-NEXT:  .LBB51_1: ; %atomicrmw.start
2358; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2359; GFX9-NEXT:    s_waitcnt vmcnt(0)
2360; GFX9-NEXT:    v_and_b32_e32 v4, v7, v3
2361; GFX9-NEXT:    v_and_b32_e32 v8, v6, v2
2362; GFX9-NEXT:    v_not_b32_e32 v5, v4
2363; GFX9-NEXT:    v_not_b32_e32 v4, v8
2364; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
2365; GFX9-NEXT:    s_waitcnt vmcnt(0)
2366; GFX9-NEXT:    buffer_wbinvl1_vol
2367; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2368; GFX9-NEXT:    v_mov_b32_e32 v7, v5
2369; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2370; GFX9-NEXT:    v_mov_b32_e32 v6, v4
2371; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2372; GFX9-NEXT:    s_cbranch_execnz .LBB51_1
2373; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2374; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2375; GFX9-NEXT:    s_setpc_b64 s[30:31]
2376  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2377  %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst
2378  ret void
2379}
2380
; Test case: seq_cst `atomicrmw nand` of an i64 in global memory (addrspace 1)
; directly at %ptr, returning the value produced by the atomic operation.
; For every RUN line the expected code is a compare-and-swap retry loop
; (v_and + v_not to build the NAND, then cmpswap_x2 until the returned value
; matches the expected one). After the loop the cmpswap result is in v[0:1];
; two of the targets need a final pair of v_mov instructions to put it there.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
2381define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
2382; SI-LABEL: global_atomic_nand_i64_ret:
2383; SI:       ; %bb.0:
2384; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2385; SI-NEXT:    v_mov_b32_e32 v6, v3
2386; SI-NEXT:    v_mov_b32_e32 v7, v2
2387; SI-NEXT:    v_mov_b32_e32 v5, v1
2388; SI-NEXT:    v_mov_b32_e32 v4, v0
2389; SI-NEXT:    s_mov_b32 s6, 0
2390; SI-NEXT:    s_mov_b32 s7, 0xf000
2391; SI-NEXT:    s_mov_b32 s4, s6
2392; SI-NEXT:    s_mov_b32 s5, s6
2393; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
2394; SI-NEXT:    s_mov_b64 s[8:9], 0
2395; SI-NEXT:  .LBB52_1: ; %atomicrmw.start
2396; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2397; SI-NEXT:    s_waitcnt vmcnt(0)
2398; SI-NEXT:    v_mov_b32_e32 v11, v1
2399; SI-NEXT:    v_mov_b32_e32 v10, v0
2400; SI-NEXT:    s_waitcnt expcnt(0)
2401; SI-NEXT:    v_and_b32_e32 v0, v11, v6
2402; SI-NEXT:    v_and_b32_e32 v1, v10, v7
2403; SI-NEXT:    v_not_b32_e32 v9, v0
2404; SI-NEXT:    v_not_b32_e32 v8, v1
2405; SI-NEXT:    v_mov_b32_e32 v0, v8
2406; SI-NEXT:    v_mov_b32_e32 v1, v9
2407; SI-NEXT:    v_mov_b32_e32 v2, v10
2408; SI-NEXT:    v_mov_b32_e32 v3, v11
2409; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
2410; SI-NEXT:    s_waitcnt vmcnt(0)
2411; SI-NEXT:    buffer_wbinvl1
2412; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
2413; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
2414; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
2415; SI-NEXT:    s_cbranch_execnz .LBB52_1
2416; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2417; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
2418; SI-NEXT:    s_waitcnt expcnt(0)
2419; SI-NEXT:    s_setpc_b64 s[30:31]
2420;
2421; VI-LABEL: global_atomic_nand_i64_ret:
2422; VI:       ; %bb.0:
2423; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2424; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
2425; VI-NEXT:    s_mov_b64 s[4:5], 0
2426; VI-NEXT:  .LBB52_1: ; %atomicrmw.start
2427; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2428; VI-NEXT:    s_waitcnt vmcnt(0)
2429; VI-NEXT:    v_mov_b32_e32 v7, v5
2430; VI-NEXT:    v_mov_b32_e32 v6, v4
2431; VI-NEXT:    v_and_b32_e32 v4, v7, v3
2432; VI-NEXT:    v_and_b32_e32 v8, v6, v2
2433; VI-NEXT:    v_not_b32_e32 v5, v4
2434; VI-NEXT:    v_not_b32_e32 v4, v8
2435; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
2436; VI-NEXT:    s_waitcnt vmcnt(0)
2437; VI-NEXT:    buffer_wbinvl1_vol
2438; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2439; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2440; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2441; VI-NEXT:    s_cbranch_execnz .LBB52_1
2442; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2443; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
2444; VI-NEXT:    v_mov_b32_e32 v0, v4
2445; VI-NEXT:    v_mov_b32_e32 v1, v5
2446; VI-NEXT:    s_setpc_b64 s[30:31]
2447;
2448; GFX9-LABEL: global_atomic_nand_i64_ret:
2449; GFX9:       ; %bb.0:
2450; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2451; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
2452; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2453; GFX9-NEXT:  .LBB52_1: ; %atomicrmw.start
2454; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2455; GFX9-NEXT:    s_waitcnt vmcnt(0)
2456; GFX9-NEXT:    v_mov_b32_e32 v7, v5
2457; GFX9-NEXT:    v_mov_b32_e32 v6, v4
2458; GFX9-NEXT:    v_and_b32_e32 v4, v7, v3
2459; GFX9-NEXT:    v_and_b32_e32 v8, v6, v2
2460; GFX9-NEXT:    v_not_b32_e32 v5, v4
2461; GFX9-NEXT:    v_not_b32_e32 v4, v8
2462; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
2463; GFX9-NEXT:    s_waitcnt vmcnt(0)
2464; GFX9-NEXT:    buffer_wbinvl1_vol
2465; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2466; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2467; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2468; GFX9-NEXT:    s_cbranch_execnz .LBB52_1
2469; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2470; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2471; GFX9-NEXT:    v_mov_b32_e32 v0, v4
2472; GFX9-NEXT:    v_mov_b32_e32 v1, v5
2473; GFX9-NEXT:    s_setpc_b64 s[30:31]
2474  %result = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst
2475  ret i64 %result
2476}
2477
; Test case: seq_cst `atomicrmw nand` of an i64 in global memory (addrspace 1),
; applied to element 4 of %out (a 32-byte displacement), returning the value
; produced by the atomic operation.
; For every RUN line the expected code is a compare-and-swap retry loop
; (v_and + v_not to build the NAND, then cmpswap_x2 until the returned value
; matches the expected one), ending with the cmpswap result in v[0:1].
; The first and last target fold the displacement into `offset:32`; the middle
; (tonga) target computes the address into v[4:5] with an explicit 64-bit add.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
2478define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
2479; SI-LABEL: global_atomic_nand_i64_ret_offset:
2480; SI:       ; %bb.0:
2481; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2482; SI-NEXT:    v_mov_b32_e32 v6, v3
2483; SI-NEXT:    v_mov_b32_e32 v7, v2
2484; SI-NEXT:    v_mov_b32_e32 v5, v1
2485; SI-NEXT:    v_mov_b32_e32 v4, v0
2486; SI-NEXT:    s_mov_b32 s6, 0
2487; SI-NEXT:    s_mov_b32 s7, 0xf000
2488; SI-NEXT:    s_mov_b32 s4, s6
2489; SI-NEXT:    s_mov_b32 s5, s6
2490; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
2491; SI-NEXT:    s_mov_b64 s[8:9], 0
2492; SI-NEXT:  .LBB53_1: ; %atomicrmw.start
2493; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2494; SI-NEXT:    s_waitcnt vmcnt(0)
2495; SI-NEXT:    v_mov_b32_e32 v11, v1
2496; SI-NEXT:    v_mov_b32_e32 v10, v0
2497; SI-NEXT:    s_waitcnt expcnt(0)
2498; SI-NEXT:    v_and_b32_e32 v0, v11, v6
2499; SI-NEXT:    v_and_b32_e32 v1, v10, v7
2500; SI-NEXT:    v_not_b32_e32 v9, v0
2501; SI-NEXT:    v_not_b32_e32 v8, v1
2502; SI-NEXT:    v_mov_b32_e32 v0, v8
2503; SI-NEXT:    v_mov_b32_e32 v1, v9
2504; SI-NEXT:    v_mov_b32_e32 v2, v10
2505; SI-NEXT:    v_mov_b32_e32 v3, v11
2506; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
2507; SI-NEXT:    s_waitcnt vmcnt(0)
2508; SI-NEXT:    buffer_wbinvl1
2509; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
2510; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
2511; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
2512; SI-NEXT:    s_cbranch_execnz .LBB53_1
2513; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2514; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
2515; SI-NEXT:    s_waitcnt expcnt(0)
2516; SI-NEXT:    s_setpc_b64 s[30:31]
2517;
2518; VI-LABEL: global_atomic_nand_i64_ret_offset:
2519; VI:       ; %bb.0:
2520; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2521; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
2522; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2523; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
2524; VI-NEXT:    s_mov_b64 s[4:5], 0
2525; VI-NEXT:  .LBB53_1: ; %atomicrmw.start
2526; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2527; VI-NEXT:    s_waitcnt vmcnt(0)
2528; VI-NEXT:    v_mov_b32_e32 v9, v1
2529; VI-NEXT:    v_mov_b32_e32 v8, v0
2530; VI-NEXT:    v_and_b32_e32 v0, v9, v3
2531; VI-NEXT:    v_and_b32_e32 v1, v8, v2
2532; VI-NEXT:    v_not_b32_e32 v7, v0
2533; VI-NEXT:    v_not_b32_e32 v6, v1
2534; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
2535; VI-NEXT:    s_waitcnt vmcnt(0)
2536; VI-NEXT:    buffer_wbinvl1_vol
2537; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
2538; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2539; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2540; VI-NEXT:    s_cbranch_execnz .LBB53_1
2541; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2542; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
2543; VI-NEXT:    s_setpc_b64 s[30:31]
2544;
2545; GFX9-LABEL: global_atomic_nand_i64_ret_offset:
2546; GFX9:       ; %bb.0:
2547; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2548; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
2549; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2550; GFX9-NEXT:  .LBB53_1: ; %atomicrmw.start
2551; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2552; GFX9-NEXT:    s_waitcnt vmcnt(0)
2553; GFX9-NEXT:    v_mov_b32_e32 v7, v5
2554; GFX9-NEXT:    v_mov_b32_e32 v6, v4
2555; GFX9-NEXT:    v_and_b32_e32 v4, v7, v3
2556; GFX9-NEXT:    v_and_b32_e32 v8, v6, v2
2557; GFX9-NEXT:    v_not_b32_e32 v5, v4
2558; GFX9-NEXT:    v_not_b32_e32 v4, v8
2559; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
2560; GFX9-NEXT:    s_waitcnt vmcnt(0)
2561; GFX9-NEXT:    buffer_wbinvl1_vol
2562; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2563; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2564; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2565; GFX9-NEXT:    s_cbranch_execnz .LBB53_1
2566; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2567; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2568; GFX9-NEXT:    v_mov_b32_e32 v0, v4
2569; GFX9-NEXT:    v_mov_b32_e32 v1, v5
2570; GFX9-NEXT:    s_setpc_b64 s[30:31]
2571  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2572  %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst
2573  ret i64 %result
2574}
2575
; Test case: seq_cst `atomicrmw nand` of an i64 in global memory (addrspace 1)
; directly at %ptr, result unused, using the amdgpu_gfx calling convention
; with both arguments marked `inreg`. In the expected code the pointer is read
; from s[4:5] and the operand from s6/s7.
; For every RUN line the expected code is a compare-and-swap retry loop
; (v_and + v_not to build the NAND, then cmpswap_x2 until the returned value
; matches the expected one).
; In the first (default-CPU) variant s6/s7 are overwritten to complete the
; s[4:7] buffer operand, so the checks first copy the operand to s34/s35 and
; save s6/s7 in lanes of v8 (v_writelane), restoring them before returning.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
2576define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
2577; SI-LABEL: global_atomic_nand_i64_noret_scalar:
2578; SI:       ; %bb.0:
2579; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2580; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2581; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
2582; SI-NEXT:    s_mov_b64 exec, s[34:35]
2583; SI-NEXT:    s_waitcnt expcnt(0)
2584; SI-NEXT:    v_writelane_b32 v8, s6, 0
2585; SI-NEXT:    v_writelane_b32 v8, s7, 1
2586; SI-NEXT:    s_mov_b32 s34, s7
2587; SI-NEXT:    s_mov_b32 s35, s6
2588; SI-NEXT:    s_mov_b32 s7, 0xf000
2589; SI-NEXT:    s_mov_b32 s6, -1
2590; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
2591; SI-NEXT:    s_mov_b64 s[36:37], 0
2592; SI-NEXT:  .LBB54_1: ; %atomicrmw.start
2593; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2594; SI-NEXT:    s_waitcnt vmcnt(0)
2595; SI-NEXT:    v_and_b32_e32 v0, s34, v3
2596; SI-NEXT:    s_waitcnt expcnt(0)
2597; SI-NEXT:    v_and_b32_e32 v4, s35, v2
2598; SI-NEXT:    v_not_b32_e32 v1, v0
2599; SI-NEXT:    v_not_b32_e32 v0, v4
2600; SI-NEXT:    v_mov_b32_e32 v7, v3
2601; SI-NEXT:    v_mov_b32_e32 v6, v2
2602; SI-NEXT:    v_mov_b32_e32 v5, v1
2603; SI-NEXT:    v_mov_b32_e32 v4, v0
2604; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
2605; SI-NEXT:    s_waitcnt vmcnt(0)
2606; SI-NEXT:    buffer_wbinvl1
2607; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
2608; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
2609; SI-NEXT:    v_mov_b32_e32 v2, v4
2610; SI-NEXT:    v_mov_b32_e32 v3, v5
2611; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
2612; SI-NEXT:    s_cbranch_execnz .LBB54_1
2613; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2614; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
2615; SI-NEXT:    v_readlane_b32 s7, v8, 1
2616; SI-NEXT:    v_readlane_b32 s6, v8, 0
2617; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2618; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
2619; SI-NEXT:    s_mov_b64 exec, s[34:35]
2620; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2621; SI-NEXT:    s_setpc_b64 s[30:31]
2622;
2623; VI-LABEL: global_atomic_nand_i64_noret_scalar:
2624; VI:       ; %bb.0:
2625; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2626; VI-NEXT:    v_mov_b32_e32 v0, s4
2627; VI-NEXT:    v_mov_b32_e32 v1, s5
2628; VI-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
2629; VI-NEXT:    v_mov_b32_e32 v4, s4
2630; VI-NEXT:    s_mov_b64 s[34:35], 0
2631; VI-NEXT:    v_mov_b32_e32 v5, s5
2632; VI-NEXT:  .LBB54_1: ; %atomicrmw.start
2633; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2634; VI-NEXT:    s_waitcnt vmcnt(0)
2635; VI-NEXT:    v_and_b32_e32 v0, s7, v3
2636; VI-NEXT:    v_and_b32_e32 v6, s6, v2
2637; VI-NEXT:    v_not_b32_e32 v1, v0
2638; VI-NEXT:    v_not_b32_e32 v0, v6
2639; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2640; VI-NEXT:    s_waitcnt vmcnt(0)
2641; VI-NEXT:    buffer_wbinvl1_vol
2642; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2643; VI-NEXT:    v_mov_b32_e32 v3, v1
2644; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2645; VI-NEXT:    v_mov_b32_e32 v2, v0
2646; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2647; VI-NEXT:    s_cbranch_execnz .LBB54_1
2648; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2649; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
2650; VI-NEXT:    s_setpc_b64 s[30:31]
2651;
2652; GFX9-LABEL: global_atomic_nand_i64_noret_scalar:
2653; GFX9:       ; %bb.0:
2654; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2655; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2656; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
2657; GFX9-NEXT:    s_mov_b64 s[34:35], 0
2658; GFX9-NEXT:  .LBB54_1: ; %atomicrmw.start
2659; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2660; GFX9-NEXT:    s_waitcnt vmcnt(0)
2661; GFX9-NEXT:    v_and_b32_e32 v0, s7, v3
2662; GFX9-NEXT:    v_and_b32_e32 v5, s6, v2
2663; GFX9-NEXT:    v_not_b32_e32 v1, v0
2664; GFX9-NEXT:    v_not_b32_e32 v0, v5
2665; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
2666; GFX9-NEXT:    s_waitcnt vmcnt(0)
2667; GFX9-NEXT:    buffer_wbinvl1_vol
2668; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2669; GFX9-NEXT:    v_mov_b32_e32 v3, v1
2670; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2671; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2672; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2673; GFX9-NEXT:    s_cbranch_execnz .LBB54_1
2674; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2675; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
2676; GFX9-NEXT:    s_setpc_b64 s[30:31]
2677  %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst
2678  ret void
2679}
2680
; Test case: seq_cst `atomicrmw nand` of an i64 in global memory (addrspace 1),
; applied to element 4 of %out (a 32-byte displacement), result unused, using
; the amdgpu_gfx calling convention with both arguments marked `inreg`. In the
; expected code the pointer is read from s[4:5] and the operand from s6/s7.
; For every RUN line the expected code is a compare-and-swap retry loop
; (v_and + v_not to build the NAND, then cmpswap_x2 until the returned value
; matches the expected one).
; The first and last target fold the displacement into `offset:32`; the middle
; (tonga) target computes the address with s_add_u32/s_addc_u32 beforehand.
; In the first (default-CPU) variant s6/s7 are overwritten to complete the
; s[4:7] buffer operand, so the checks first copy the operand to s34/s35 and
; save s6/s7 in lanes of v8 (v_writelane), restoring them before returning.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
2681define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
2682; SI-LABEL: global_atomic_nand_i64_noret_offset_scalar:
2683; SI:       ; %bb.0:
2684; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2685; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2686; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
2687; SI-NEXT:    s_mov_b64 exec, s[34:35]
2688; SI-NEXT:    s_waitcnt expcnt(0)
2689; SI-NEXT:    v_writelane_b32 v8, s6, 0
2690; SI-NEXT:    v_writelane_b32 v8, s7, 1
2691; SI-NEXT:    s_mov_b32 s34, s7
2692; SI-NEXT:    s_mov_b32 s35, s6
2693; SI-NEXT:    s_mov_b32 s7, 0xf000
2694; SI-NEXT:    s_mov_b32 s6, -1
2695; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
2696; SI-NEXT:    s_mov_b64 s[36:37], 0
2697; SI-NEXT:  .LBB55_1: ; %atomicrmw.start
2698; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2699; SI-NEXT:    s_waitcnt vmcnt(0)
2700; SI-NEXT:    v_and_b32_e32 v0, s34, v3
2701; SI-NEXT:    s_waitcnt expcnt(0)
2702; SI-NEXT:    v_and_b32_e32 v4, s35, v2
2703; SI-NEXT:    v_not_b32_e32 v1, v0
2704; SI-NEXT:    v_not_b32_e32 v0, v4
2705; SI-NEXT:    v_mov_b32_e32 v7, v3
2706; SI-NEXT:    v_mov_b32_e32 v6, v2
2707; SI-NEXT:    v_mov_b32_e32 v5, v1
2708; SI-NEXT:    v_mov_b32_e32 v4, v0
2709; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
2710; SI-NEXT:    s_waitcnt vmcnt(0)
2711; SI-NEXT:    buffer_wbinvl1
2712; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
2713; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
2714; SI-NEXT:    v_mov_b32_e32 v2, v4
2715; SI-NEXT:    v_mov_b32_e32 v3, v5
2716; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
2717; SI-NEXT:    s_cbranch_execnz .LBB55_1
2718; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2719; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
2720; SI-NEXT:    v_readlane_b32 s7, v8, 1
2721; SI-NEXT:    v_readlane_b32 s6, v8, 0
2722; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2723; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
2724; SI-NEXT:    s_mov_b64 exec, s[34:35]
2725; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2726; SI-NEXT:    s_setpc_b64 s[30:31]
2727;
2728; VI-LABEL: global_atomic_nand_i64_noret_offset_scalar:
2729; VI:       ; %bb.0:
2730; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2731; VI-NEXT:    s_add_u32 s34, s4, 32
2732; VI-NEXT:    s_addc_u32 s35, s5, 0
2733; VI-NEXT:    v_mov_b32_e32 v4, s34
2734; VI-NEXT:    v_mov_b32_e32 v5, s35
2735; VI-NEXT:    flat_load_dwordx2 v[2:3], v[4:5]
2736; VI-NEXT:    s_mov_b64 s[34:35], 0
2737; VI-NEXT:  .LBB55_1: ; %atomicrmw.start
2738; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2739; VI-NEXT:    s_waitcnt vmcnt(0)
2740; VI-NEXT:    v_and_b32_e32 v0, s7, v3
2741; VI-NEXT:    v_and_b32_e32 v6, s6, v2
2742; VI-NEXT:    v_not_b32_e32 v1, v0
2743; VI-NEXT:    v_not_b32_e32 v0, v6
2744; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2745; VI-NEXT:    s_waitcnt vmcnt(0)
2746; VI-NEXT:    buffer_wbinvl1_vol
2747; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2748; VI-NEXT:    v_mov_b32_e32 v3, v1
2749; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2750; VI-NEXT:    v_mov_b32_e32 v2, v0
2751; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2752; VI-NEXT:    s_cbranch_execnz .LBB55_1
2753; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2754; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
2755; VI-NEXT:    s_setpc_b64 s[30:31]
2756;
2757; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar:
2758; GFX9:       ; %bb.0:
2759; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2760; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2761; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
2762; GFX9-NEXT:    s_mov_b64 s[34:35], 0
2763; GFX9-NEXT:  .LBB55_1: ; %atomicrmw.start
2764; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2765; GFX9-NEXT:    s_waitcnt vmcnt(0)
2766; GFX9-NEXT:    v_and_b32_e32 v0, s7, v3
2767; GFX9-NEXT:    v_and_b32_e32 v5, s6, v2
2768; GFX9-NEXT:    v_not_b32_e32 v1, v0
2769; GFX9-NEXT:    v_not_b32_e32 v0, v5
2770; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
2771; GFX9-NEXT:    s_waitcnt vmcnt(0)
2772; GFX9-NEXT:    buffer_wbinvl1_vol
2773; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2774; GFX9-NEXT:    v_mov_b32_e32 v3, v1
2775; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2776; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2777; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2778; GFX9-NEXT:    s_cbranch_execnz .LBB55_1
2779; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2780; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
2781; GFX9-NEXT:    s_setpc_b64 s[30:31]
2782  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2783  %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst
2784  ret void
2785}
2786
; Test case: seq_cst `atomicrmw nand` of an i64 in global memory (addrspace 1)
; directly at %ptr, returning the value produced by the atomic operation,
; using the amdgpu_gfx calling convention with both arguments marked `inreg`.
; In the expected code the pointer is read from s[4:5] and the operand from
; s6/s7, and the cmpswap result is left in v[0:1].
; For every RUN line the expected code is a compare-and-swap retry loop
; (v_and + v_not to build the NAND, then cmpswap_x2 until the returned value
; matches the expected one).
; In the first (default-CPU) variant s6/s7 are overwritten to complete the
; s[4:7] buffer operand, so the checks first copy the operand to s34/s35 and
; save s6/s7 in lanes of v6 (v_writelane), restoring them before returning.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
2787define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
2788; SI-LABEL: global_atomic_nand_i64_ret_scalar:
2789; SI:       ; %bb.0:
2790; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2791; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2792; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
2793; SI-NEXT:    s_mov_b64 exec, s[34:35]
2794; SI-NEXT:    s_waitcnt expcnt(0)
2795; SI-NEXT:    v_writelane_b32 v6, s6, 0
2796; SI-NEXT:    v_writelane_b32 v6, s7, 1
2797; SI-NEXT:    s_mov_b32 s34, s7
2798; SI-NEXT:    s_mov_b32 s35, s6
2799; SI-NEXT:    s_mov_b32 s7, 0xf000
2800; SI-NEXT:    s_mov_b32 s6, -1
2801; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2802; SI-NEXT:    s_mov_b64 s[36:37], 0
2803; SI-NEXT:  .LBB56_1: ; %atomicrmw.start
2804; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2805; SI-NEXT:    s_waitcnt vmcnt(0)
2806; SI-NEXT:    v_mov_b32_e32 v5, v1
2807; SI-NEXT:    v_mov_b32_e32 v4, v0
2808; SI-NEXT:    s_waitcnt expcnt(0)
2809; SI-NEXT:    v_and_b32_e32 v0, s34, v5
2810; SI-NEXT:    v_and_b32_e32 v1, s35, v4
2811; SI-NEXT:    v_not_b32_e32 v3, v0
2812; SI-NEXT:    v_not_b32_e32 v2, v1
2813; SI-NEXT:    v_mov_b32_e32 v0, v2
2814; SI-NEXT:    v_mov_b32_e32 v1, v3
2815; SI-NEXT:    v_mov_b32_e32 v2, v4
2816; SI-NEXT:    v_mov_b32_e32 v3, v5
2817; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
2818; SI-NEXT:    s_waitcnt vmcnt(0)
2819; SI-NEXT:    buffer_wbinvl1
2820; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
2821; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
2822; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
2823; SI-NEXT:    s_cbranch_execnz .LBB56_1
2824; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2825; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
2826; SI-NEXT:    v_readlane_b32 s7, v6, 1
2827; SI-NEXT:    v_readlane_b32 s6, v6, 0
2828; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2829; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
2830; SI-NEXT:    s_mov_b64 exec, s[34:35]
2831; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2832; SI-NEXT:    s_setpc_b64 s[30:31]
2833;
2834; VI-LABEL: global_atomic_nand_i64_ret_scalar:
2835; VI:       ; %bb.0:
2836; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2837; VI-NEXT:    v_mov_b32_e32 v0, s4
2838; VI-NEXT:    v_mov_b32_e32 v1, s5
2839; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2840; VI-NEXT:    v_mov_b32_e32 v2, s4
2841; VI-NEXT:    s_mov_b64 s[34:35], 0
2842; VI-NEXT:    v_mov_b32_e32 v3, s5
2843; VI-NEXT:  .LBB56_1: ; %atomicrmw.start
2844; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2845; VI-NEXT:    s_waitcnt vmcnt(0)
2846; VI-NEXT:    v_mov_b32_e32 v7, v1
2847; VI-NEXT:    v_mov_b32_e32 v6, v0
2848; VI-NEXT:    v_and_b32_e32 v0, s7, v7
2849; VI-NEXT:    v_and_b32_e32 v1, s6, v6
2850; VI-NEXT:    v_not_b32_e32 v5, v0
2851; VI-NEXT:    v_not_b32_e32 v4, v1
2852; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
2853; VI-NEXT:    s_waitcnt vmcnt(0)
2854; VI-NEXT:    buffer_wbinvl1_vol
2855; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
2856; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2857; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2858; VI-NEXT:    s_cbranch_execnz .LBB56_1
2859; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2860; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
2861; VI-NEXT:    s_setpc_b64 s[30:31]
2862;
2863; GFX9-LABEL: global_atomic_nand_i64_ret_scalar:
2864; GFX9:       ; %bb.0:
2865; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2866; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2867; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
2868; GFX9-NEXT:    s_mov_b64 s[34:35], 0
2869; GFX9-NEXT:  .LBB56_1: ; %atomicrmw.start
2870; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2871; GFX9-NEXT:    s_waitcnt vmcnt(0)
2872; GFX9-NEXT:    v_mov_b32_e32 v6, v1
2873; GFX9-NEXT:    v_mov_b32_e32 v5, v0
2874; GFX9-NEXT:    v_and_b32_e32 v0, s7, v6
2875; GFX9-NEXT:    v_and_b32_e32 v1, s6, v5
2876; GFX9-NEXT:    v_not_b32_e32 v4, v0
2877; GFX9-NEXT:    v_not_b32_e32 v3, v1
2878; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
2879; GFX9-NEXT:    s_waitcnt vmcnt(0)
2880; GFX9-NEXT:    buffer_wbinvl1_vol
2881; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
2882; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2883; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2884; GFX9-NEXT:    s_cbranch_execnz .LBB56_1
2885; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2886; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
2887; GFX9-NEXT:    s_setpc_b64 s[30:31]
2888  %result = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst
2889  ret i64 %result
2890}
2891
; Test case: seq_cst `atomicrmw nand` of an i64 in global memory (addrspace 1),
; applied to element 4 of %out (a 32-byte displacement), returning the value
; produced by the atomic operation, using the amdgpu_gfx calling convention
; with both arguments marked `inreg`. In the expected code the pointer is read
; from s[4:5] and the operand from s6/s7, and the cmpswap result is left in
; v[0:1].
; For every RUN line the expected code is a compare-and-swap retry loop
; (v_and + v_not to build the NAND, then cmpswap_x2 until the returned value
; matches the expected one).
; The first and last target fold the displacement into `offset:32`; the middle
; (tonga) target computes the address with s_add_u32/s_addc_u32 beforehand.
; In the first (default-CPU) variant s6/s7 are overwritten to complete the
; s[4:7] buffer operand, so the checks first copy the operand to s34/s35 and
; save s6/s7 in lanes of v6 (v_writelane), restoring them before returning.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with the update script rather than editing them by hand.
2892define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
2893; SI-LABEL: global_atomic_nand_i64_ret_offset_scalar:
2894; SI:       ; %bb.0:
2895; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2896; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2897; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
2898; SI-NEXT:    s_mov_b64 exec, s[34:35]
2899; SI-NEXT:    s_waitcnt expcnt(0)
2900; SI-NEXT:    v_writelane_b32 v6, s6, 0
2901; SI-NEXT:    v_writelane_b32 v6, s7, 1
2902; SI-NEXT:    s_mov_b32 s34, s7
2903; SI-NEXT:    s_mov_b32 s35, s6
2904; SI-NEXT:    s_mov_b32 s7, 0xf000
2905; SI-NEXT:    s_mov_b32 s6, -1
2906; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
2907; SI-NEXT:    s_mov_b64 s[36:37], 0
2908; SI-NEXT:  .LBB57_1: ; %atomicrmw.start
2909; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
2910; SI-NEXT:    s_waitcnt vmcnt(0)
2911; SI-NEXT:    v_mov_b32_e32 v5, v1
2912; SI-NEXT:    v_mov_b32_e32 v4, v0
2913; SI-NEXT:    s_waitcnt expcnt(0)
2914; SI-NEXT:    v_and_b32_e32 v0, s34, v5
2915; SI-NEXT:    v_and_b32_e32 v1, s35, v4
2916; SI-NEXT:    v_not_b32_e32 v3, v0
2917; SI-NEXT:    v_not_b32_e32 v2, v1
2918; SI-NEXT:    v_mov_b32_e32 v0, v2
2919; SI-NEXT:    v_mov_b32_e32 v1, v3
2920; SI-NEXT:    v_mov_b32_e32 v2, v4
2921; SI-NEXT:    v_mov_b32_e32 v3, v5
2922; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
2923; SI-NEXT:    s_waitcnt vmcnt(0)
2924; SI-NEXT:    buffer_wbinvl1
2925; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
2926; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
2927; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
2928; SI-NEXT:    s_cbranch_execnz .LBB57_1
2929; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
2930; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
2931; SI-NEXT:    v_readlane_b32 s7, v6, 1
2932; SI-NEXT:    v_readlane_b32 s6, v6, 0
2933; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
2934; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
2935; SI-NEXT:    s_mov_b64 exec, s[34:35]
2936; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2937; SI-NEXT:    s_setpc_b64 s[30:31]
2938;
2939; VI-LABEL: global_atomic_nand_i64_ret_offset_scalar:
2940; VI:       ; %bb.0:
2941; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2942; VI-NEXT:    s_add_u32 s34, s4, 32
2943; VI-NEXT:    s_addc_u32 s35, s5, 0
2944; VI-NEXT:    v_mov_b32_e32 v2, s34
2945; VI-NEXT:    v_mov_b32_e32 v3, s35
2946; VI-NEXT:    flat_load_dwordx2 v[0:1], v[2:3]
2947; VI-NEXT:    s_mov_b64 s[34:35], 0
2948; VI-NEXT:  .LBB57_1: ; %atomicrmw.start
2949; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
2950; VI-NEXT:    s_waitcnt vmcnt(0)
2951; VI-NEXT:    v_mov_b32_e32 v7, v1
2952; VI-NEXT:    v_mov_b32_e32 v6, v0
2953; VI-NEXT:    v_and_b32_e32 v0, s7, v7
2954; VI-NEXT:    v_and_b32_e32 v1, s6, v6
2955; VI-NEXT:    v_not_b32_e32 v5, v0
2956; VI-NEXT:    v_not_b32_e32 v4, v1
2957; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
2958; VI-NEXT:    s_waitcnt vmcnt(0)
2959; VI-NEXT:    buffer_wbinvl1_vol
2960; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
2961; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2962; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2963; VI-NEXT:    s_cbranch_execnz .LBB57_1
2964; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
2965; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
2966; VI-NEXT:    s_setpc_b64 s[30:31]
2967;
2968; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar:
2969; GFX9:       ; %bb.0:
2970; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2971; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2972; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
2973; GFX9-NEXT:    s_mov_b64 s[34:35], 0
2974; GFX9-NEXT:  .LBB57_1: ; %atomicrmw.start
2975; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2976; GFX9-NEXT:    s_waitcnt vmcnt(0)
2977; GFX9-NEXT:    v_mov_b32_e32 v6, v1
2978; GFX9-NEXT:    v_mov_b32_e32 v5, v0
2979; GFX9-NEXT:    v_and_b32_e32 v0, s7, v6
2980; GFX9-NEXT:    v_and_b32_e32 v1, s6, v5
2981; GFX9-NEXT:    v_not_b32_e32 v4, v0
2982; GFX9-NEXT:    v_not_b32_e32 v3, v1
2983; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
2984; GFX9-NEXT:    s_waitcnt vmcnt(0)
2985; GFX9-NEXT:    buffer_wbinvl1_vol
2986; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
2987; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
2988; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
2989; GFX9-NEXT:    s_cbranch_execnz .LBB57_1
2990; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2991; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
2992; GFX9-NEXT:    s_setpc_b64 s[30:31]
2993  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2994  %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst
2995  ret i64 %result
2996}
2997
; Test: seq_cst `atomicrmw nand` (no explicit syncscope) on a global (addrspace 1)
; i64 located 4 elements past %out, result unused, tagged with
; !amdgpu.no.remote.memory.
; The assertions for all three runs expect the nand to be lowered to a
; compare-and-swap retry loop: load the old value, compute ~(old & %in) with
; v_and/v_not, issue a cmpswap_x2 with glc, and repeat until the value returned
; by the cmpswap equals the value that was expected.
;   - the SI run addresses memory through a buffer descriptor with addr64 and
;     folds the 32-byte offset into the instruction;
;   - the VI run adds 32 to the pointer with v_add_u32/v_addc_u32 and uses flat
;     instructions;
;   - the GFX9 run uses global instructions with the 32-byte offset folded in.
; NOTE(review): !0 is defined outside this excerpt - confirm its contents there.
2998define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
2999; SI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
3000; SI:       ; %bb.0:
3001; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3002; SI-NEXT:    s_mov_b32 s6, 0
3003; SI-NEXT:    s_mov_b32 s7, 0xf000
3004; SI-NEXT:    s_mov_b32 s4, s6
3005; SI-NEXT:    s_mov_b32 s5, s6
3006; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
3007; SI-NEXT:    s_mov_b64 s[8:9], 0
3008; SI-NEXT:  .LBB58_1: ; %atomicrmw.start
3009; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
3010; SI-NEXT:    s_waitcnt vmcnt(0)
3011; SI-NEXT:    v_and_b32_e32 v4, v7, v3
3012; SI-NEXT:    s_waitcnt expcnt(0)
3013; SI-NEXT:    v_and_b32_e32 v8, v6, v2
3014; SI-NEXT:    v_not_b32_e32 v5, v4
3015; SI-NEXT:    v_not_b32_e32 v4, v8
3016; SI-NEXT:    v_mov_b32_e32 v11, v7
3017; SI-NEXT:    v_mov_b32_e32 v10, v6
3018; SI-NEXT:    v_mov_b32_e32 v9, v5
3019; SI-NEXT:    v_mov_b32_e32 v8, v4
3020; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
3021; SI-NEXT:    s_waitcnt vmcnt(0)
3022; SI-NEXT:    buffer_wbinvl1
3023; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
3024; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
3025; SI-NEXT:    v_mov_b32_e32 v6, v8
3026; SI-NEXT:    v_mov_b32_e32 v7, v9
3027; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
3028; SI-NEXT:    s_cbranch_execnz .LBB58_1
3029; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
3030; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
3031; SI-NEXT:    s_waitcnt expcnt(0)
3032; SI-NEXT:    s_setpc_b64 s[30:31]
3033;
3034; VI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
3035; VI:       ; %bb.0:
3036; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3037; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
3038; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3039; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
3040; VI-NEXT:    s_mov_b64 s[4:5], 0
3041; VI-NEXT:  .LBB58_1: ; %atomicrmw.start
3042; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
3043; VI-NEXT:    s_waitcnt vmcnt(0)
3044; VI-NEXT:    v_and_b32_e32 v4, v7, v3
3045; VI-NEXT:    v_and_b32_e32 v8, v6, v2
3046; VI-NEXT:    v_not_b32_e32 v5, v4
3047; VI-NEXT:    v_not_b32_e32 v4, v8
3048; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3049; VI-NEXT:    s_waitcnt vmcnt(0)
3050; VI-NEXT:    buffer_wbinvl1_vol
3051; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3052; VI-NEXT:    v_mov_b32_e32 v7, v5
3053; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3054; VI-NEXT:    v_mov_b32_e32 v6, v4
3055; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3056; VI-NEXT:    s_cbranch_execnz .LBB58_1
3057; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
3058; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
3059; VI-NEXT:    s_setpc_b64 s[30:31]
3060;
3061; GFX9-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
3062; GFX9:       ; %bb.0:
3063; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3064; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
3065; GFX9-NEXT:    s_mov_b64 s[4:5], 0
3066; GFX9-NEXT:  .LBB58_1: ; %atomicrmw.start
3067; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
3068; GFX9-NEXT:    s_waitcnt vmcnt(0)
3069; GFX9-NEXT:    v_and_b32_e32 v4, v7, v3
3070; GFX9-NEXT:    v_and_b32_e32 v8, v6, v2
3071; GFX9-NEXT:    v_not_b32_e32 v5, v4
3072; GFX9-NEXT:    v_not_b32_e32 v4, v8
3073; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
3074; GFX9-NEXT:    s_waitcnt vmcnt(0)
3075; GFX9-NEXT:    buffer_wbinvl1_vol
3076; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3077; GFX9-NEXT:    v_mov_b32_e32 v7, v5
3078; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3079; GFX9-NEXT:    v_mov_b32_e32 v6, v4
3080; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3081; GFX9-NEXT:    s_cbranch_execnz .LBB58_1
3082; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
3083; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3084; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Index 4 of an i64 array is the 32-byte offset seen in the assertions above.
3085  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3086  %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
3087  ret void
3088}
3089
; Test: seq_cst `atomicrmw nand` (no explicit syncscope) on a global (addrspace 1)
; i64 located 4 elements past %out, tagged with !amdgpu.no.remote.memory; the
; value produced by the atomicrmw is returned.
; As in the no-return variant, every run is expected to emit a compare-and-swap
; retry loop (v_and + v_not to form the new value, cmpswap_x2 with glc, loop
; until the returned value matches the expected one). The SI and VI runs leave
; the final cmpswap result in v[0:1]; the GFX9 run produces it in v[4:5] and
; copies it to v[0:1] after the loop.
; NOTE(review): !0 is defined outside this excerpt - confirm its contents there.
3090define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
3091; SI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
3092; SI:       ; %bb.0:
3093; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3094; SI-NEXT:    v_mov_b32_e32 v6, v3
3095; SI-NEXT:    v_mov_b32_e32 v7, v2
3096; SI-NEXT:    v_mov_b32_e32 v5, v1
3097; SI-NEXT:    v_mov_b32_e32 v4, v0
3098; SI-NEXT:    s_mov_b32 s6, 0
3099; SI-NEXT:    s_mov_b32 s7, 0xf000
3100; SI-NEXT:    s_mov_b32 s4, s6
3101; SI-NEXT:    s_mov_b32 s5, s6
3102; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
3103; SI-NEXT:    s_mov_b64 s[8:9], 0
3104; SI-NEXT:  .LBB59_1: ; %atomicrmw.start
3105; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
3106; SI-NEXT:    s_waitcnt vmcnt(0)
3107; SI-NEXT:    v_mov_b32_e32 v11, v1
3108; SI-NEXT:    v_mov_b32_e32 v10, v0
3109; SI-NEXT:    s_waitcnt expcnt(0)
3110; SI-NEXT:    v_and_b32_e32 v0, v11, v6
3111; SI-NEXT:    v_and_b32_e32 v1, v10, v7
3112; SI-NEXT:    v_not_b32_e32 v9, v0
3113; SI-NEXT:    v_not_b32_e32 v8, v1
3114; SI-NEXT:    v_mov_b32_e32 v0, v8
3115; SI-NEXT:    v_mov_b32_e32 v1, v9
3116; SI-NEXT:    v_mov_b32_e32 v2, v10
3117; SI-NEXT:    v_mov_b32_e32 v3, v11
3118; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
3119; SI-NEXT:    s_waitcnt vmcnt(0)
3120; SI-NEXT:    buffer_wbinvl1
3121; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
3122; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
3123; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
3124; SI-NEXT:    s_cbranch_execnz .LBB59_1
3125; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
3126; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
3127; SI-NEXT:    s_waitcnt expcnt(0)
3128; SI-NEXT:    s_setpc_b64 s[30:31]
3129;
3130; VI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
3131; VI:       ; %bb.0:
3132; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3133; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
3134; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
3135; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
3136; VI-NEXT:    s_mov_b64 s[4:5], 0
3137; VI-NEXT:  .LBB59_1: ; %atomicrmw.start
3138; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
3139; VI-NEXT:    s_waitcnt vmcnt(0)
3140; VI-NEXT:    v_mov_b32_e32 v9, v1
3141; VI-NEXT:    v_mov_b32_e32 v8, v0
3142; VI-NEXT:    v_and_b32_e32 v0, v9, v3
3143; VI-NEXT:    v_and_b32_e32 v1, v8, v2
3144; VI-NEXT:    v_not_b32_e32 v7, v0
3145; VI-NEXT:    v_not_b32_e32 v6, v1
3146; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3147; VI-NEXT:    s_waitcnt vmcnt(0)
3148; VI-NEXT:    buffer_wbinvl1_vol
3149; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3150; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3151; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3152; VI-NEXT:    s_cbranch_execnz .LBB59_1
3153; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
3154; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
3155; VI-NEXT:    s_setpc_b64 s[30:31]
3156;
3157; GFX9-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
3158; GFX9:       ; %bb.0:
3159; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3160; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
3161; GFX9-NEXT:    s_mov_b64 s[4:5], 0
3162; GFX9-NEXT:  .LBB59_1: ; %atomicrmw.start
3163; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
3164; GFX9-NEXT:    s_waitcnt vmcnt(0)
3165; GFX9-NEXT:    v_mov_b32_e32 v7, v5
3166; GFX9-NEXT:    v_mov_b32_e32 v6, v4
3167; GFX9-NEXT:    v_and_b32_e32 v4, v7, v3
3168; GFX9-NEXT:    v_and_b32_e32 v8, v6, v2
3169; GFX9-NEXT:    v_not_b32_e32 v5, v4
3170; GFX9-NEXT:    v_not_b32_e32 v4, v8
3171; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
3172; GFX9-NEXT:    s_waitcnt vmcnt(0)
3173; GFX9-NEXT:    buffer_wbinvl1_vol
3174; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3175; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
3176; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
3177; GFX9-NEXT:    s_cbranch_execnz .LBB59_1
3178; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
3179; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
3180; GFX9-NEXT:    v_mov_b32_e32 v0, v4
3181; GFX9-NEXT:    v_mov_b32_e32 v1, v5
3182; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Index 4 of an i64 array is the 32-byte offset seen in the assertions above.
3183  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3184  %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
3185  ret i64 %result
3186}
3187
3188; ---------------------------------------------------------------------
3189; atomicrmw or
3190; ---------------------------------------------------------------------
3191
; Test: seq_cst `atomicrmw or` (no explicit syncscope) directly on the global
; (addrspace 1) pointer %ptr with an i64 operand; the result is unused.
; Each run is expected to select a single 64-bit atomic-or instruction without
; glc, followed by a vmcnt(0) wait and a cache invalidate:
; buffer_atomic_or_x2 (addr64) for the SI run, flat_atomic_or_x2 for the VI run
; and global_atomic_or_x2 for the GFX9 run.
3192define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
3193; SI-LABEL: global_atomic_or_i64_noret:
3194; SI:       ; %bb.0:
3195; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3196; SI-NEXT:    s_mov_b32 s6, 0
3197; SI-NEXT:    s_mov_b32 s7, 0xf000
3198; SI-NEXT:    s_mov_b32 s4, s6
3199; SI-NEXT:    s_mov_b32 s5, s6
3200; SI-NEXT:    buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64
3201; SI-NEXT:    s_waitcnt vmcnt(0)
3202; SI-NEXT:    buffer_wbinvl1
3203; SI-NEXT:    s_waitcnt expcnt(0)
3204; SI-NEXT:    s_setpc_b64 s[30:31]
3205;
3206; VI-LABEL: global_atomic_or_i64_noret:
3207; VI:       ; %bb.0:
3208; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3209; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
3210; VI-NEXT:    s_waitcnt vmcnt(0)
3211; VI-NEXT:    buffer_wbinvl1_vol
3212; VI-NEXT:    s_setpc_b64 s[30:31]
3213;
3214; GFX9-LABEL: global_atomic_or_i64_noret:
3215; GFX9:       ; %bb.0:
3216; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3217; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[2:3], off
3218; GFX9-NEXT:    s_waitcnt vmcnt(0)
3219; GFX9-NEXT:    buffer_wbinvl1_vol
3220; GFX9-NEXT:    s_setpc_b64 s[30:31]
3221  %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
3222  ret void
3223}
3224
; Test: seq_cst `atomicrmw or` on the global (addrspace 1) i64 located 4
; elements past %out; the result is unused.
; Exercises how the constant 32-byte offset is handled: the SI and GFX9 runs
; are expected to fold it into the atomic instruction's offset field, while the
; VI run is expected to add it to the address with v_add_u32/v_addc_u32 before
; a flat_atomic_or_x2.
3225define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
3226; SI-LABEL: global_atomic_or_i64_noret_offset:
3227; SI:       ; %bb.0:
3228; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3229; SI-NEXT:    s_mov_b32 s6, 0
3230; SI-NEXT:    s_mov_b32 s7, 0xf000
3231; SI-NEXT:    s_mov_b32 s4, s6
3232; SI-NEXT:    s_mov_b32 s5, s6
3233; SI-NEXT:    buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
3234; SI-NEXT:    s_waitcnt vmcnt(0)
3235; SI-NEXT:    buffer_wbinvl1
3236; SI-NEXT:    s_waitcnt expcnt(0)
3237; SI-NEXT:    s_setpc_b64 s[30:31]
3238;
3239; VI-LABEL: global_atomic_or_i64_noret_offset:
3240; VI:       ; %bb.0:
3241; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3242; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
3243; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3244; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
3245; VI-NEXT:    s_waitcnt vmcnt(0)
3246; VI-NEXT:    buffer_wbinvl1_vol
3247; VI-NEXT:    s_setpc_b64 s[30:31]
3248;
3249; GFX9-LABEL: global_atomic_or_i64_noret_offset:
3250; GFX9:       ; %bb.0:
3251; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3252; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[2:3], off offset:32
3253; GFX9-NEXT:    s_waitcnt vmcnt(0)
3254; GFX9-NEXT:    buffer_wbinvl1_vol
3255; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Index 4 of an i64 array is the 32-byte offset seen in the assertions above.
3256  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3257  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
3258  ret void
3259}
3260
; Test: seq_cst `atomicrmw or` directly on the global (addrspace 1) pointer
; %ptr with an i64 operand; the value produced by the atomicrmw is returned.
; Each run is expected to select the returning form of the 64-bit atomic-or
; (the instruction carries glc). The VI and GFX9 runs write the result straight
; into v[0:1]; the SI run gets it in v[2:3] and copies it to v[0:1].
3261define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
3262; SI-LABEL: global_atomic_or_i64_ret:
3263; SI:       ; %bb.0:
3264; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3265; SI-NEXT:    s_mov_b32 s6, 0
3266; SI-NEXT:    s_mov_b32 s7, 0xf000
3267; SI-NEXT:    s_mov_b32 s4, s6
3268; SI-NEXT:    s_mov_b32 s5, s6
3269; SI-NEXT:    buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
3270; SI-NEXT:    s_waitcnt vmcnt(0)
3271; SI-NEXT:    buffer_wbinvl1
3272; SI-NEXT:    v_mov_b32_e32 v0, v2
3273; SI-NEXT:    v_mov_b32_e32 v1, v3
3274; SI-NEXT:    s_waitcnt expcnt(0)
3275; SI-NEXT:    s_setpc_b64 s[30:31]
3276;
3277; VI-LABEL: global_atomic_or_i64_ret:
3278; VI:       ; %bb.0:
3279; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3280; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
3281; VI-NEXT:    s_waitcnt vmcnt(0)
3282; VI-NEXT:    buffer_wbinvl1_vol
3283; VI-NEXT:    s_setpc_b64 s[30:31]
3284;
3285; GFX9-LABEL: global_atomic_or_i64_ret:
3286; GFX9:       ; %bb.0:
3287; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3288; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off glc
3289; GFX9-NEXT:    s_waitcnt vmcnt(0)
3290; GFX9-NEXT:    buffer_wbinvl1_vol
3291; GFX9-NEXT:    s_setpc_b64 s[30:31]
3292  %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
3293  ret i64 %result
3294}
3295
; Test: seq_cst `atomicrmw or` on the global (addrspace 1) i64 located 4
; elements past %out; the value produced by the atomicrmw is returned.
; Combines the returning form (glc on the instruction) with the constant
; 32-byte offset: folded into the instruction for the SI and GFX9 runs, added
; to the address with v_add_u32/v_addc_u32 for the VI run.
3296define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
3297; SI-LABEL: global_atomic_or_i64_ret_offset:
3298; SI:       ; %bb.0:
3299; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3300; SI-NEXT:    s_mov_b32 s6, 0
3301; SI-NEXT:    s_mov_b32 s7, 0xf000
3302; SI-NEXT:    s_mov_b32 s4, s6
3303; SI-NEXT:    s_mov_b32 s5, s6
3304; SI-NEXT:    buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
3305; SI-NEXT:    s_waitcnt vmcnt(0)
3306; SI-NEXT:    buffer_wbinvl1
3307; SI-NEXT:    v_mov_b32_e32 v0, v2
3308; SI-NEXT:    v_mov_b32_e32 v1, v3
3309; SI-NEXT:    s_waitcnt expcnt(0)
3310; SI-NEXT:    s_setpc_b64 s[30:31]
3311;
3312; VI-LABEL: global_atomic_or_i64_ret_offset:
3313; VI:       ; %bb.0:
3314; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3315; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
3316; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3317; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
3318; VI-NEXT:    s_waitcnt vmcnt(0)
3319; VI-NEXT:    buffer_wbinvl1_vol
3320; VI-NEXT:    s_setpc_b64 s[30:31]
3321;
3322; GFX9-LABEL: global_atomic_or_i64_ret_offset:
3323; GFX9:       ; %bb.0:
3324; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3325; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
3326; GFX9-NEXT:    s_waitcnt vmcnt(0)
3327; GFX9-NEXT:    buffer_wbinvl1_vol
3328; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Index 4 of an i64 array is the 32-byte offset seen in the assertions above.
3329  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3330  %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
3331  ret i64 %result
3332}
3333
; Test: seq_cst `atomicrmw or` on a global (addrspace 1) i64 where both the
; pointer and the operand are `inreg` arguments of an amdgpu_gfx function; the
; result is unused. In the expected output the pointer arrives in s[4:5] and
; the operand in s[6:7].
;   - the SI run needs s[4:7] as the buffer descriptor, so it saves s6/s7 into
;     lanes of v2 (v_writelane), overwrites them with the descriptor words,
;     and restores them with v_readlane afterwards; v2 itself is spilled to and
;     reloaded from the stack around that.
;   - the VI run copies the pointer and operand to VGPRs for flat_atomic_or_x2.
;   - the GFX9 run keeps the pointer in s[4:5] as the scalar base of
;     global_atomic_or_x2 with a zero VGPR offset.
3334define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
3335; SI-LABEL: global_atomic_or_i64_noret_scalar:
3336; SI:       ; %bb.0:
3337; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3338; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3339; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3340; SI-NEXT:    s_mov_b64 exec, s[34:35]
3341; SI-NEXT:    s_waitcnt expcnt(0)
3342; SI-NEXT:    v_writelane_b32 v2, s6, 0
3343; SI-NEXT:    v_writelane_b32 v2, s7, 1
3344; SI-NEXT:    s_mov_b32 s34, s7
3345; SI-NEXT:    s_mov_b32 s35, s6
3346; SI-NEXT:    s_mov_b32 s7, 0xf000
3347; SI-NEXT:    s_mov_b32 s6, -1
3348; SI-NEXT:    v_mov_b32_e32 v0, s35
3349; SI-NEXT:    v_mov_b32_e32 v1, s34
3350; SI-NEXT:    s_waitcnt vmcnt(0)
3351; SI-NEXT:    buffer_atomic_or_x2 v[0:1], off, s[4:7], 0
3352; SI-NEXT:    s_waitcnt vmcnt(0)
3353; SI-NEXT:    buffer_wbinvl1
3354; SI-NEXT:    v_readlane_b32 s7, v2, 1
3355; SI-NEXT:    v_readlane_b32 s6, v2, 0
3356; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3357; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3358; SI-NEXT:    s_mov_b64 exec, s[34:35]
3359; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3360; SI-NEXT:    s_setpc_b64 s[30:31]
3361;
3362; VI-LABEL: global_atomic_or_i64_noret_scalar:
3363; VI:       ; %bb.0:
3364; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3365; VI-NEXT:    v_mov_b32_e32 v0, s6
3366; VI-NEXT:    v_mov_b32_e32 v1, s7
3367; VI-NEXT:    v_mov_b32_e32 v2, s4
3368; VI-NEXT:    v_mov_b32_e32 v3, s5
3369; VI-NEXT:    flat_atomic_or_x2 v[2:3], v[0:1]
3370; VI-NEXT:    s_waitcnt vmcnt(0)
3371; VI-NEXT:    buffer_wbinvl1_vol
3372; VI-NEXT:    s_setpc_b64 s[30:31]
3373;
3374; GFX9-LABEL: global_atomic_or_i64_noret_scalar:
3375; GFX9:       ; %bb.0:
3376; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3377; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3378; GFX9-NEXT:    v_mov_b32_e32 v1, s7
3379; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3380; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[4:5]
3381; GFX9-NEXT:    s_waitcnt vmcnt(0)
3382; GFX9-NEXT:    buffer_wbinvl1_vol
3383; GFX9-NEXT:    s_setpc_b64 s[30:31]
3384  %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
3385  ret void
3386}
3387
; Test: seq_cst `atomicrmw or` on the global (addrspace 1) i64 located 4
; elements past %out, where pointer and operand are `inreg` arguments of an
; amdgpu_gfx function; the result is unused.
; Scalar-argument counterpart of the offset test:
;   - the SI run saves s6/s7 in lanes of v2, builds the buffer descriptor in
;     s[4:7], and folds the 32-byte offset into buffer_atomic_or_x2;
;   - the VI run computes pointer+32 with s_add_u32/s_addc_u32 into s[34:35]
;     before copying it to VGPRs for flat_atomic_or_x2;
;   - the GFX9 run folds the 32-byte offset into global_atomic_or_x2 with
;     s[4:5] as the scalar base.
3388define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
3389; SI-LABEL: global_atomic_or_i64_noret_offset_scalar:
3390; SI:       ; %bb.0:
3391; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3392; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3393; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3394; SI-NEXT:    s_mov_b64 exec, s[34:35]
3395; SI-NEXT:    s_waitcnt expcnt(0)
3396; SI-NEXT:    v_writelane_b32 v2, s6, 0
3397; SI-NEXT:    v_writelane_b32 v2, s7, 1
3398; SI-NEXT:    v_mov_b32_e32 v0, s6
3399; SI-NEXT:    v_mov_b32_e32 v1, s7
3400; SI-NEXT:    s_mov_b32 s7, 0xf000
3401; SI-NEXT:    s_mov_b32 s6, -1
3402; SI-NEXT:    s_waitcnt vmcnt(0)
3403; SI-NEXT:    buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32
3404; SI-NEXT:    s_waitcnt vmcnt(0)
3405; SI-NEXT:    buffer_wbinvl1
3406; SI-NEXT:    v_readlane_b32 s7, v2, 1
3407; SI-NEXT:    v_readlane_b32 s6, v2, 0
3408; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3409; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3410; SI-NEXT:    s_mov_b64 exec, s[34:35]
3411; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3412; SI-NEXT:    s_setpc_b64 s[30:31]
3413;
3414; VI-LABEL: global_atomic_or_i64_noret_offset_scalar:
3415; VI:       ; %bb.0:
3416; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3417; VI-NEXT:    s_add_u32 s34, s4, 32
3418; VI-NEXT:    s_addc_u32 s35, s5, 0
3419; VI-NEXT:    v_mov_b32_e32 v2, s34
3420; VI-NEXT:    v_mov_b32_e32 v0, s6
3421; VI-NEXT:    v_mov_b32_e32 v1, s7
3422; VI-NEXT:    v_mov_b32_e32 v3, s35
3423; VI-NEXT:    flat_atomic_or_x2 v[2:3], v[0:1]
3424; VI-NEXT:    s_waitcnt vmcnt(0)
3425; VI-NEXT:    buffer_wbinvl1_vol
3426; VI-NEXT:    s_setpc_b64 s[30:31]
3427;
3428; GFX9-LABEL: global_atomic_or_i64_noret_offset_scalar:
3429; GFX9:       ; %bb.0:
3430; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3431; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3432; GFX9-NEXT:    v_mov_b32_e32 v1, s7
3433; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3434; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32
3435; GFX9-NEXT:    s_waitcnt vmcnt(0)
3436; GFX9-NEXT:    buffer_wbinvl1_vol
3437; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Index 4 of an i64 array is the 32-byte offset seen in the assertions above.
3438  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3439  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
3440  ret void
3441}
3442
; Test: seq_cst `atomicrmw or` on a global (addrspace 1) i64 where pointer and
; operand are `inreg` arguments of an amdgpu_gfx function; the value produced
; by the atomicrmw is returned.
; Same register handling as the no-return scalar variant (pointer in s[4:5],
; operand in s[6:7], SI run saving s6/s7 in lanes of v2 while s[4:7] holds the
; buffer descriptor), but every run is expected to use the returning form of
; the atomic (glc) with the result landing in v[0:1].
3443define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
3444; SI-LABEL: global_atomic_or_i64_ret_scalar:
3445; SI:       ; %bb.0:
3446; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3447; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3448; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3449; SI-NEXT:    s_mov_b64 exec, s[34:35]
3450; SI-NEXT:    s_waitcnt expcnt(0)
3451; SI-NEXT:    v_writelane_b32 v2, s6, 0
3452; SI-NEXT:    v_writelane_b32 v2, s7, 1
3453; SI-NEXT:    s_mov_b32 s34, s7
3454; SI-NEXT:    s_mov_b32 s35, s6
3455; SI-NEXT:    s_mov_b32 s7, 0xf000
3456; SI-NEXT:    s_mov_b32 s6, -1
3457; SI-NEXT:    v_mov_b32_e32 v0, s35
3458; SI-NEXT:    v_mov_b32_e32 v1, s34
3459; SI-NEXT:    s_waitcnt vmcnt(0)
3460; SI-NEXT:    buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 glc
3461; SI-NEXT:    s_waitcnt vmcnt(0)
3462; SI-NEXT:    buffer_wbinvl1
3463; SI-NEXT:    v_readlane_b32 s7, v2, 1
3464; SI-NEXT:    v_readlane_b32 s6, v2, 0
3465; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3466; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3467; SI-NEXT:    s_mov_b64 exec, s[34:35]
3468; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3469; SI-NEXT:    s_setpc_b64 s[30:31]
3470;
3471; VI-LABEL: global_atomic_or_i64_ret_scalar:
3472; VI:       ; %bb.0:
3473; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3474; VI-NEXT:    v_mov_b32_e32 v0, s6
3475; VI-NEXT:    v_mov_b32_e32 v1, s7
3476; VI-NEXT:    v_mov_b32_e32 v2, s4
3477; VI-NEXT:    v_mov_b32_e32 v3, s5
3478; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
3479; VI-NEXT:    s_waitcnt vmcnt(0)
3480; VI-NEXT:    buffer_wbinvl1_vol
3481; VI-NEXT:    s_setpc_b64 s[30:31]
3482;
3483; GFX9-LABEL: global_atomic_or_i64_ret_scalar:
3484; GFX9:       ; %bb.0:
3485; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3486; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3487; GFX9-NEXT:    v_mov_b32_e32 v1, s7
3488; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3489; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc
3490; GFX9-NEXT:    s_waitcnt vmcnt(0)
3491; GFX9-NEXT:    buffer_wbinvl1_vol
3492; GFX9-NEXT:    s_setpc_b64 s[30:31]
3493  %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
3494  ret i64 %result
3495}
3496
; Test: seq_cst `atomicrmw or` on the global (addrspace 1) i64 located 4
; elements past %out, where pointer and operand are `inreg` arguments of an
; amdgpu_gfx function; the value produced by the atomicrmw is returned.
; Combines the scalar-argument handling, the 32-byte offset (folded for the SI
; and GFX9 runs, added with s_add_u32/s_addc_u32 for the VI run) and the
; returning form of the atomic (glc, result in v[0:1]).
3497define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
3498; SI-LABEL: global_atomic_or_i64_ret_offset_scalar:
3499; SI:       ; %bb.0:
3500; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3501; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3502; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3503; SI-NEXT:    s_mov_b64 exec, s[34:35]
3504; SI-NEXT:    s_waitcnt expcnt(0)
3505; SI-NEXT:    v_writelane_b32 v2, s6, 0
3506; SI-NEXT:    v_writelane_b32 v2, s7, 1
3507; SI-NEXT:    v_mov_b32_e32 v0, s6
3508; SI-NEXT:    v_mov_b32_e32 v1, s7
3509; SI-NEXT:    s_mov_b32 s7, 0xf000
3510; SI-NEXT:    s_mov_b32 s6, -1
3511; SI-NEXT:    s_waitcnt vmcnt(0)
3512; SI-NEXT:    buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc
3513; SI-NEXT:    s_waitcnt vmcnt(0)
3514; SI-NEXT:    buffer_wbinvl1
3515; SI-NEXT:    v_readlane_b32 s7, v2, 1
3516; SI-NEXT:    v_readlane_b32 s6, v2, 0
3517; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3518; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3519; SI-NEXT:    s_mov_b64 exec, s[34:35]
3520; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3521; SI-NEXT:    s_setpc_b64 s[30:31]
3522;
3523; VI-LABEL: global_atomic_or_i64_ret_offset_scalar:
3524; VI:       ; %bb.0:
3525; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3526; VI-NEXT:    s_add_u32 s34, s4, 32
3527; VI-NEXT:    s_addc_u32 s35, s5, 0
3528; VI-NEXT:    v_mov_b32_e32 v2, s34
3529; VI-NEXT:    v_mov_b32_e32 v0, s6
3530; VI-NEXT:    v_mov_b32_e32 v1, s7
3531; VI-NEXT:    v_mov_b32_e32 v3, s35
3532; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
3533; VI-NEXT:    s_waitcnt vmcnt(0)
3534; VI-NEXT:    buffer_wbinvl1_vol
3535; VI-NEXT:    s_setpc_b64 s[30:31]
3536;
3537; GFX9-LABEL: global_atomic_or_i64_ret_offset_scalar:
3538; GFX9:       ; %bb.0:
3539; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3540; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3541; GFX9-NEXT:    v_mov_b32_e32 v1, s7
3542; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3543; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
3544; GFX9-NEXT:    s_waitcnt vmcnt(0)
3545; GFX9-NEXT:    buffer_wbinvl1_vol
3546; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Index 4 of an i64 array is the 32-byte offset seen in the assertions above.
3547  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3548  %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
3549  ret i64 %result
3550}
3551
; Test: same operation as global_atomic_or_i64_noret_offset (seq_cst
; `atomicrmw or` on the i64 4 elements past %out, result unused), but with
; !amdgpu.no.remote.memory attached to the atomicrmw.
; The expected instructions for all three runs are the same single atomic-or
; sequence as in the variant without the metadata.
; NOTE(review): !0 is defined outside this excerpt - confirm its contents there.
3552define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
3553; SI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
3554; SI:       ; %bb.0:
3555; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3556; SI-NEXT:    s_mov_b32 s6, 0
3557; SI-NEXT:    s_mov_b32 s7, 0xf000
3558; SI-NEXT:    s_mov_b32 s4, s6
3559; SI-NEXT:    s_mov_b32 s5, s6
3560; SI-NEXT:    buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
3561; SI-NEXT:    s_waitcnt vmcnt(0)
3562; SI-NEXT:    buffer_wbinvl1
3563; SI-NEXT:    s_waitcnt expcnt(0)
3564; SI-NEXT:    s_setpc_b64 s[30:31]
3565;
3566; VI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
3567; VI:       ; %bb.0:
3568; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3569; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
3570; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3571; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
3572; VI-NEXT:    s_waitcnt vmcnt(0)
3573; VI-NEXT:    buffer_wbinvl1_vol
3574; VI-NEXT:    s_setpc_b64 s[30:31]
3575;
3576; GFX9-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
3577; GFX9:       ; %bb.0:
3578; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3579; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[2:3], off offset:32
3580; GFX9-NEXT:    s_waitcnt vmcnt(0)
3581; GFX9-NEXT:    buffer_wbinvl1_vol
3582; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Index 4 of an i64 array is the 32-byte offset seen in the assertions above.
3583  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3584  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
3585  ret void
3586}
3587
; Test: same operation as global_atomic_or_i64_ret_offset (seq_cst
; `atomicrmw or` on the i64 4 elements past %out, old value returned), but with
; !amdgpu.no.remote.memory attached to the atomicrmw.
; The expected instructions for all three runs are the same returning (glc)
; atomic-or sequence as in the variant without the metadata.
; NOTE(review): !0 is defined outside this excerpt - confirm its contents there.
3588define i64 @global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
3589; SI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
3590; SI:       ; %bb.0:
3591; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3592; SI-NEXT:    s_mov_b32 s6, 0
3593; SI-NEXT:    s_mov_b32 s7, 0xf000
3594; SI-NEXT:    s_mov_b32 s4, s6
3595; SI-NEXT:    s_mov_b32 s5, s6
3596; SI-NEXT:    buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
3597; SI-NEXT:    s_waitcnt vmcnt(0)
3598; SI-NEXT:    buffer_wbinvl1
3599; SI-NEXT:    v_mov_b32_e32 v0, v2
3600; SI-NEXT:    v_mov_b32_e32 v1, v3
3601; SI-NEXT:    s_waitcnt expcnt(0)
3602; SI-NEXT:    s_setpc_b64 s[30:31]
3603;
3604; VI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
3605; VI:       ; %bb.0:
3606; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3607; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
3608; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3609; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
3610; VI-NEXT:    s_waitcnt vmcnt(0)
3611; VI-NEXT:    buffer_wbinvl1_vol
3612; VI-NEXT:    s_setpc_b64 s[30:31]
3613;
3614; GFX9-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
3615; GFX9:       ; %bb.0:
3616; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3617; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
3618; GFX9-NEXT:    s_waitcnt vmcnt(0)
3619; GFX9-NEXT:    buffer_wbinvl1_vol
3620; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Index 4 of an i64 array is the 32-byte offset seen in the assertions above.
3621  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3622  %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
3623  ret i64 %result
3624}
3625
3626; ---------------------------------------------------------------------
3627; atomicrmw xor
3628; ---------------------------------------------------------------------
3629
; Test: seq_cst `atomicrmw xor` (no explicit syncscope) directly on the global
; (addrspace 1) pointer %ptr with an i64 operand; the result is unused.
; Each run is expected to select a single 64-bit atomic-xor instruction without
; glc, followed by a vmcnt(0) wait and a cache invalidate:
; buffer_atomic_xor_x2 (addr64) for the SI run, flat_atomic_xor_x2 for the VI
; run and global_atomic_xor_x2 for the GFX9 run.
3630define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
3631; SI-LABEL: global_atomic_xor_i64_noret:
3632; SI:       ; %bb.0:
3633; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3634; SI-NEXT:    s_mov_b32 s6, 0
3635; SI-NEXT:    s_mov_b32 s7, 0xf000
3636; SI-NEXT:    s_mov_b32 s4, s6
3637; SI-NEXT:    s_mov_b32 s5, s6
3638; SI-NEXT:    buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64
3639; SI-NEXT:    s_waitcnt vmcnt(0)
3640; SI-NEXT:    buffer_wbinvl1
3641; SI-NEXT:    s_waitcnt expcnt(0)
3642; SI-NEXT:    s_setpc_b64 s[30:31]
3643;
3644; VI-LABEL: global_atomic_xor_i64_noret:
3645; VI:       ; %bb.0:
3646; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3647; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
3648; VI-NEXT:    s_waitcnt vmcnt(0)
3649; VI-NEXT:    buffer_wbinvl1_vol
3650; VI-NEXT:    s_setpc_b64 s[30:31]
3651;
3652; GFX9-LABEL: global_atomic_xor_i64_noret:
3653; GFX9:       ; %bb.0:
3654; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3655; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[2:3], off
3656; GFX9-NEXT:    s_waitcnt vmcnt(0)
3657; GFX9-NEXT:    buffer_wbinvl1_vol
3658; GFX9-NEXT:    s_setpc_b64 s[30:31]
3659  %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
3660  ret void
3661}
3662
; Test: seq_cst `atomicrmw xor` on the global (addrspace 1) i64 located 4
; elements past %out; the result is unused.
; Exercises how the constant 32-byte offset is handled: the SI and GFX9 runs
; are expected to fold it into the atomic instruction's offset field, while the
; VI run is expected to add it to the address with v_add_u32/v_addc_u32 before
; a flat_atomic_xor_x2.
3663define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
3664; SI-LABEL: global_atomic_xor_i64_noret_offset:
3665; SI:       ; %bb.0:
3666; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3667; SI-NEXT:    s_mov_b32 s6, 0
3668; SI-NEXT:    s_mov_b32 s7, 0xf000
3669; SI-NEXT:    s_mov_b32 s4, s6
3670; SI-NEXT:    s_mov_b32 s5, s6
3671; SI-NEXT:    buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
3672; SI-NEXT:    s_waitcnt vmcnt(0)
3673; SI-NEXT:    buffer_wbinvl1
3674; SI-NEXT:    s_waitcnt expcnt(0)
3675; SI-NEXT:    s_setpc_b64 s[30:31]
3676;
3677; VI-LABEL: global_atomic_xor_i64_noret_offset:
3678; VI:       ; %bb.0:
3679; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3680; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
3681; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3682; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
3683; VI-NEXT:    s_waitcnt vmcnt(0)
3684; VI-NEXT:    buffer_wbinvl1_vol
3685; VI-NEXT:    s_setpc_b64 s[30:31]
3686;
3687; GFX9-LABEL: global_atomic_xor_i64_noret_offset:
3688; GFX9:       ; %bb.0:
3689; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3690; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[2:3], off offset:32
3691; GFX9-NEXT:    s_waitcnt vmcnt(0)
3692; GFX9-NEXT:    buffer_wbinvl1_vol
3693; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Index 4 of an i64 array is the 32-byte offset seen in the assertions above.
3694  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3695  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
3696  ret void
3697}
3698
; Test: seq_cst `atomicrmw xor` directly on the global (addrspace 1) pointer
; %ptr with an i64 operand; the value produced by the atomicrmw is returned.
; Each run is expected to select the returning form of the 64-bit atomic-xor
; (the instruction carries glc). The VI and GFX9 runs write the result straight
; into v[0:1]; the SI run gets it in v[2:3] and copies it to v[0:1].
3699define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
3700; SI-LABEL: global_atomic_xor_i64_ret:
3701; SI:       ; %bb.0:
3702; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3703; SI-NEXT:    s_mov_b32 s6, 0
3704; SI-NEXT:    s_mov_b32 s7, 0xf000
3705; SI-NEXT:    s_mov_b32 s4, s6
3706; SI-NEXT:    s_mov_b32 s5, s6
3707; SI-NEXT:    buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
3708; SI-NEXT:    s_waitcnt vmcnt(0)
3709; SI-NEXT:    buffer_wbinvl1
3710; SI-NEXT:    v_mov_b32_e32 v0, v2
3711; SI-NEXT:    v_mov_b32_e32 v1, v3
3712; SI-NEXT:    s_waitcnt expcnt(0)
3713; SI-NEXT:    s_setpc_b64 s[30:31]
3714;
3715; VI-LABEL: global_atomic_xor_i64_ret:
3716; VI:       ; %bb.0:
3717; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3718; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
3719; VI-NEXT:    s_waitcnt vmcnt(0)
3720; VI-NEXT:    buffer_wbinvl1_vol
3721; VI-NEXT:    s_setpc_b64 s[30:31]
3722;
3723; GFX9-LABEL: global_atomic_xor_i64_ret:
3724; GFX9:       ; %bb.0:
3725; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3726; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc
3727; GFX9-NEXT:    s_waitcnt vmcnt(0)
3728; GFX9-NEXT:    buffer_wbinvl1_vol
3729; GFX9-NEXT:    s_setpc_b64 s[30:31]
3730  %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
3731  ret i64 %result
3732}
3733
; Returning seq_cst atomicrmw xor at %out + 4 i64 elements (32 bytes).
; SI and GFX9 are expected to fold the displacement into the instruction as
; offset:32; the VI checks instead add 32 to the address with
; v_add_u32/v_addc_u32 before the flat atomic.
3734define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
3735; SI-LABEL: global_atomic_xor_i64_ret_offset:
3736; SI:       ; %bb.0:
3737; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3738; SI-NEXT:    s_mov_b32 s6, 0
3739; SI-NEXT:    s_mov_b32 s7, 0xf000
3740; SI-NEXT:    s_mov_b32 s4, s6
3741; SI-NEXT:    s_mov_b32 s5, s6
3742; SI-NEXT:    buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
3743; SI-NEXT:    s_waitcnt vmcnt(0)
3744; SI-NEXT:    buffer_wbinvl1
3745; SI-NEXT:    v_mov_b32_e32 v0, v2
3746; SI-NEXT:    v_mov_b32_e32 v1, v3
3747; SI-NEXT:    s_waitcnt expcnt(0)
3748; SI-NEXT:    s_setpc_b64 s[30:31]
3749;
3750; VI-LABEL: global_atomic_xor_i64_ret_offset:
3751; VI:       ; %bb.0:
3752; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3753; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
3754; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3755; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
3756; VI-NEXT:    s_waitcnt vmcnt(0)
3757; VI-NEXT:    buffer_wbinvl1_vol
3758; VI-NEXT:    s_setpc_b64 s[30:31]
3759;
3760; GFX9-LABEL: global_atomic_xor_i64_ret_offset:
3761; GFX9:       ; %bb.0:
3762; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3763; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
3764; GFX9-NEXT:    s_waitcnt vmcnt(0)
3765; GFX9-NEXT:    buffer_wbinvl1_vol
3766; GFX9-NEXT:    s_setpc_b64 s[30:31]
3767  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3768  %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
3769  ret i64 %result
3770}
3771
; Non-returning seq_cst atomicrmw xor using the amdgpu_gfx calling convention
; with both arguments marked inreg. In the checks the pointer arrives in s[4:5]
; and the operand in s[6:7].
; SI: s[4:7] is reused as the buffer resource (s6 = -1, s7 = 0xf000), so the
; incoming s6/s7 are first saved into lanes of v2 (itself spilled to the stack)
; and restored before returning.
; VI copies the pointer into VGPRs for a flat atomic; GFX9 uses s[4:5] as the
; scalar base with a zero VGPR offset.
3772define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
3773; SI-LABEL: global_atomic_xor_i64_noret_scalar:
3774; SI:       ; %bb.0:
3775; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3776; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3777; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3778; SI-NEXT:    s_mov_b64 exec, s[34:35]
3779; SI-NEXT:    s_waitcnt expcnt(0)
3780; SI-NEXT:    v_writelane_b32 v2, s6, 0
3781; SI-NEXT:    v_writelane_b32 v2, s7, 1
3782; SI-NEXT:    s_mov_b32 s34, s7
3783; SI-NEXT:    s_mov_b32 s35, s6
3784; SI-NEXT:    s_mov_b32 s7, 0xf000
3785; SI-NEXT:    s_mov_b32 s6, -1
3786; SI-NEXT:    v_mov_b32_e32 v0, s35
3787; SI-NEXT:    v_mov_b32_e32 v1, s34
3788; SI-NEXT:    s_waitcnt vmcnt(0)
3789; SI-NEXT:    buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0
3790; SI-NEXT:    s_waitcnt vmcnt(0)
3791; SI-NEXT:    buffer_wbinvl1
3792; SI-NEXT:    v_readlane_b32 s7, v2, 1
3793; SI-NEXT:    v_readlane_b32 s6, v2, 0
3794; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3795; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3796; SI-NEXT:    s_mov_b64 exec, s[34:35]
3797; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3798; SI-NEXT:    s_setpc_b64 s[30:31]
3799;
3800; VI-LABEL: global_atomic_xor_i64_noret_scalar:
3801; VI:       ; %bb.0:
3802; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3803; VI-NEXT:    v_mov_b32_e32 v0, s6
3804; VI-NEXT:    v_mov_b32_e32 v1, s7
3805; VI-NEXT:    v_mov_b32_e32 v2, s4
3806; VI-NEXT:    v_mov_b32_e32 v3, s5
3807; VI-NEXT:    flat_atomic_xor_x2 v[2:3], v[0:1]
3808; VI-NEXT:    s_waitcnt vmcnt(0)
3809; VI-NEXT:    buffer_wbinvl1_vol
3810; VI-NEXT:    s_setpc_b64 s[30:31]
3811;
3812; GFX9-LABEL: global_atomic_xor_i64_noret_scalar:
3813; GFX9:       ; %bb.0:
3814; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3815; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3816; GFX9-NEXT:    v_mov_b32_e32 v1, s7
3817; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3818; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[4:5]
3819; GFX9-NEXT:    s_waitcnt vmcnt(0)
3820; GFX9-NEXT:    buffer_wbinvl1_vol
3821; GFX9-NEXT:    s_setpc_b64 s[30:31]
3822  %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
3823  ret void
3824}
3825
; Non-returning seq_cst atomicrmw xor at %out + 32 bytes (gep of 4 i64
; elements), amdgpu_gfx calling convention with inreg arguments.
; SI and GFX9 are expected to encode the displacement as offset:32. VI instead
; computes the address in s[34:35] with s_add_u32/s_addc_u32 and copies it to
; v[2:3] for the flat atomic.
; SI saves the incoming s6/s7 in lanes of v2 because s[4:7] is rebuilt as the
; buffer resource.
3826define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
3827; SI-LABEL: global_atomic_xor_i64_noret_offset_scalar:
3828; SI:       ; %bb.0:
3829; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3830; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3831; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3832; SI-NEXT:    s_mov_b64 exec, s[34:35]
3833; SI-NEXT:    s_waitcnt expcnt(0)
3834; SI-NEXT:    v_writelane_b32 v2, s6, 0
3835; SI-NEXT:    v_writelane_b32 v2, s7, 1
3836; SI-NEXT:    v_mov_b32_e32 v0, s6
3837; SI-NEXT:    v_mov_b32_e32 v1, s7
3838; SI-NEXT:    s_mov_b32 s7, 0xf000
3839; SI-NEXT:    s_mov_b32 s6, -1
3840; SI-NEXT:    s_waitcnt vmcnt(0)
3841; SI-NEXT:    buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32
3842; SI-NEXT:    s_waitcnt vmcnt(0)
3843; SI-NEXT:    buffer_wbinvl1
3844; SI-NEXT:    v_readlane_b32 s7, v2, 1
3845; SI-NEXT:    v_readlane_b32 s6, v2, 0
3846; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3847; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3848; SI-NEXT:    s_mov_b64 exec, s[34:35]
3849; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3850; SI-NEXT:    s_setpc_b64 s[30:31]
3851;
3852; VI-LABEL: global_atomic_xor_i64_noret_offset_scalar:
3853; VI:       ; %bb.0:
3854; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3855; VI-NEXT:    s_add_u32 s34, s4, 32
3856; VI-NEXT:    s_addc_u32 s35, s5, 0
3857; VI-NEXT:    v_mov_b32_e32 v2, s34
3858; VI-NEXT:    v_mov_b32_e32 v0, s6
3859; VI-NEXT:    v_mov_b32_e32 v1, s7
3860; VI-NEXT:    v_mov_b32_e32 v3, s35
3861; VI-NEXT:    flat_atomic_xor_x2 v[2:3], v[0:1]
3862; VI-NEXT:    s_waitcnt vmcnt(0)
3863; VI-NEXT:    buffer_wbinvl1_vol
3864; VI-NEXT:    s_setpc_b64 s[30:31]
3865;
3866; GFX9-LABEL: global_atomic_xor_i64_noret_offset_scalar:
3867; GFX9:       ; %bb.0:
3868; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3869; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3870; GFX9-NEXT:    v_mov_b32_e32 v1, s7
3871; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3872; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32
3873; GFX9-NEXT:    s_waitcnt vmcnt(0)
3874; GFX9-NEXT:    buffer_wbinvl1_vol
3875; GFX9-NEXT:    s_setpc_b64 s[30:31]
3876  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3877  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
3878  ret void
3879}
3880
; Returning seq_cst atomicrmw xor, amdgpu_gfx calling convention with inreg
; arguments (pointer in s[4:5], operand in s[6:7] per the checks). Each target
; is expected to use the glc form of the xor_x2 atomic and leave the old value
; in v[0:1], the same registers that carried the operand in.
; SI saves the incoming s6/s7 in lanes of v2 because s[4:7] is rebuilt as the
; buffer resource.
3881define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
3882; SI-LABEL: global_atomic_xor_i64_ret_scalar:
3883; SI:       ; %bb.0:
3884; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3885; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3886; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3887; SI-NEXT:    s_mov_b64 exec, s[34:35]
3888; SI-NEXT:    s_waitcnt expcnt(0)
3889; SI-NEXT:    v_writelane_b32 v2, s6, 0
3890; SI-NEXT:    v_writelane_b32 v2, s7, 1
3891; SI-NEXT:    s_mov_b32 s34, s7
3892; SI-NEXT:    s_mov_b32 s35, s6
3893; SI-NEXT:    s_mov_b32 s7, 0xf000
3894; SI-NEXT:    s_mov_b32 s6, -1
3895; SI-NEXT:    v_mov_b32_e32 v0, s35
3896; SI-NEXT:    v_mov_b32_e32 v1, s34
3897; SI-NEXT:    s_waitcnt vmcnt(0)
3898; SI-NEXT:    buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 glc
3899; SI-NEXT:    s_waitcnt vmcnt(0)
3900; SI-NEXT:    buffer_wbinvl1
3901; SI-NEXT:    v_readlane_b32 s7, v2, 1
3902; SI-NEXT:    v_readlane_b32 s6, v2, 0
3903; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3904; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3905; SI-NEXT:    s_mov_b64 exec, s[34:35]
3906; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3907; SI-NEXT:    s_setpc_b64 s[30:31]
3908;
3909; VI-LABEL: global_atomic_xor_i64_ret_scalar:
3910; VI:       ; %bb.0:
3911; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3912; VI-NEXT:    v_mov_b32_e32 v0, s6
3913; VI-NEXT:    v_mov_b32_e32 v1, s7
3914; VI-NEXT:    v_mov_b32_e32 v2, s4
3915; VI-NEXT:    v_mov_b32_e32 v3, s5
3916; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
3917; VI-NEXT:    s_waitcnt vmcnt(0)
3918; VI-NEXT:    buffer_wbinvl1_vol
3919; VI-NEXT:    s_setpc_b64 s[30:31]
3920;
3921; GFX9-LABEL: global_atomic_xor_i64_ret_scalar:
3922; GFX9:       ; %bb.0:
3923; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3924; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3925; GFX9-NEXT:    v_mov_b32_e32 v1, s7
3926; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3927; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc
3928; GFX9-NEXT:    s_waitcnt vmcnt(0)
3929; GFX9-NEXT:    buffer_wbinvl1_vol
3930; GFX9-NEXT:    s_setpc_b64 s[30:31]
3931  %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
3932  ret i64 %result
3933}
3934
; Returning seq_cst atomicrmw xor at %out + 32 bytes (gep of 4 i64 elements),
; amdgpu_gfx calling convention with inreg arguments.
; SI and GFX9 are expected to encode the displacement as offset:32 on the glc
; atomic. VI computes the address in s[34:35] with scalar adds and copies it to
; v[2:3]. All three leave the old value in v[0:1].
; SI saves the incoming s6/s7 in lanes of v2 because s[4:7] is rebuilt as the
; buffer resource.
3935define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
3936; SI-LABEL: global_atomic_xor_i64_ret_offset_scalar:
3937; SI:       ; %bb.0:
3938; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3939; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3940; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3941; SI-NEXT:    s_mov_b64 exec, s[34:35]
3942; SI-NEXT:    s_waitcnt expcnt(0)
3943; SI-NEXT:    v_writelane_b32 v2, s6, 0
3944; SI-NEXT:    v_writelane_b32 v2, s7, 1
3945; SI-NEXT:    v_mov_b32_e32 v0, s6
3946; SI-NEXT:    v_mov_b32_e32 v1, s7
3947; SI-NEXT:    s_mov_b32 s7, 0xf000
3948; SI-NEXT:    s_mov_b32 s6, -1
3949; SI-NEXT:    s_waitcnt vmcnt(0)
3950; SI-NEXT:    buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc
3951; SI-NEXT:    s_waitcnt vmcnt(0)
3952; SI-NEXT:    buffer_wbinvl1
3953; SI-NEXT:    v_readlane_b32 s7, v2, 1
3954; SI-NEXT:    v_readlane_b32 s6, v2, 0
3955; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
3956; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3957; SI-NEXT:    s_mov_b64 exec, s[34:35]
3958; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
3959; SI-NEXT:    s_setpc_b64 s[30:31]
3960;
3961; VI-LABEL: global_atomic_xor_i64_ret_offset_scalar:
3962; VI:       ; %bb.0:
3963; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3964; VI-NEXT:    s_add_u32 s34, s4, 32
3965; VI-NEXT:    s_addc_u32 s35, s5, 0
3966; VI-NEXT:    v_mov_b32_e32 v2, s34
3967; VI-NEXT:    v_mov_b32_e32 v0, s6
3968; VI-NEXT:    v_mov_b32_e32 v1, s7
3969; VI-NEXT:    v_mov_b32_e32 v3, s35
3970; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
3971; VI-NEXT:    s_waitcnt vmcnt(0)
3972; VI-NEXT:    buffer_wbinvl1_vol
3973; VI-NEXT:    s_setpc_b64 s[30:31]
3974;
3975; GFX9-LABEL: global_atomic_xor_i64_ret_offset_scalar:
3976; GFX9:       ; %bb.0:
3977; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3978; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3979; GFX9-NEXT:    v_mov_b32_e32 v1, s7
3980; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3981; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
3982; GFX9-NEXT:    s_waitcnt vmcnt(0)
3983; GFX9-NEXT:    buffer_wbinvl1_vol
3984; GFX9-NEXT:    s_setpc_b64 s[30:31]
3985  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3986  %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
3987  ret i64 %result
3988}
3989
; Non-returning seq_cst atomicrmw xor at %out + 32 bytes where the atomicrmw
; carries !amdgpu.no.remote.memory metadata (node !0 is defined outside this
; function). The checks expect a single xor_x2 atomic on every target: offset:32
; folded on SI and GFX9, an explicit 64-bit VALU add of 32 on VI.
3990define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
3991; SI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
3992; SI:       ; %bb.0:
3993; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3994; SI-NEXT:    s_mov_b32 s6, 0
3995; SI-NEXT:    s_mov_b32 s7, 0xf000
3996; SI-NEXT:    s_mov_b32 s4, s6
3997; SI-NEXT:    s_mov_b32 s5, s6
3998; SI-NEXT:    buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
3999; SI-NEXT:    s_waitcnt vmcnt(0)
4000; SI-NEXT:    buffer_wbinvl1
4001; SI-NEXT:    s_waitcnt expcnt(0)
4002; SI-NEXT:    s_setpc_b64 s[30:31]
4003;
4004; VI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
4005; VI:       ; %bb.0:
4006; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4007; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
4008; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4009; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
4010; VI-NEXT:    s_waitcnt vmcnt(0)
4011; VI-NEXT:    buffer_wbinvl1_vol
4012; VI-NEXT:    s_setpc_b64 s[30:31]
4013;
4014; GFX9-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
4015; GFX9:       ; %bb.0:
4016; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4017; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[2:3], off offset:32
4018; GFX9-NEXT:    s_waitcnt vmcnt(0)
4019; GFX9-NEXT:    buffer_wbinvl1_vol
4020; GFX9-NEXT:    s_setpc_b64 s[30:31]
4021  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4022  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
4023  ret void
4024}
4025
; Returning seq_cst atomicrmw xor at %out + 32 bytes where the atomicrmw
; carries !amdgpu.no.remote.memory metadata (node !0 is defined outside this
; function). The checks expect a single glc xor_x2 atomic on every target, with
; the old value ending up in v[0:1]. SI copies it there from v[2:3].
4026define i64 @global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
4027; SI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
4028; SI:       ; %bb.0:
4029; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4030; SI-NEXT:    s_mov_b32 s6, 0
4031; SI-NEXT:    s_mov_b32 s7, 0xf000
4032; SI-NEXT:    s_mov_b32 s4, s6
4033; SI-NEXT:    s_mov_b32 s5, s6
4034; SI-NEXT:    buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
4035; SI-NEXT:    s_waitcnt vmcnt(0)
4036; SI-NEXT:    buffer_wbinvl1
4037; SI-NEXT:    v_mov_b32_e32 v0, v2
4038; SI-NEXT:    v_mov_b32_e32 v1, v3
4039; SI-NEXT:    s_waitcnt expcnt(0)
4040; SI-NEXT:    s_setpc_b64 s[30:31]
4041;
4042; VI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
4043; VI:       ; %bb.0:
4044; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4045; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
4046; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4047; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
4048; VI-NEXT:    s_waitcnt vmcnt(0)
4049; VI-NEXT:    buffer_wbinvl1_vol
4050; VI-NEXT:    s_setpc_b64 s[30:31]
4051;
4052; GFX9-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
4053; GFX9:       ; %bb.0:
4054; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4055; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
4056; GFX9-NEXT:    s_waitcnt vmcnt(0)
4057; GFX9-NEXT:    buffer_wbinvl1_vol
4058; GFX9-NEXT:    s_setpc_b64 s[30:31]
4059  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4060  %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
4061  ret i64 %result
4062}
4063
4064; ---------------------------------------------------------------------
4065; atomicrmw max
4066; ---------------------------------------------------------------------
4067
; Non-returning seq_cst atomicrmw max (signed) on a global i64, non-inreg
; arguments. The checks expect a compare-and-swap loop
; (%atomicrmw.start / %atomicrmw.end) instead of a single max atomic:
;   1. load the current memory value once before the loop;
;   2. select the larger of it and %in with v_cmp_gt_i64 + v_cndmask;
;   3. issue cmpswap_x2 (glc) with {new, expected};
;   4. lanes whose returned value equals the expected one are OR-ed into a
;      mask and removed from exec; the loop repeats while exec is non-zero,
;      feeding the returned value back in as the next expected value.
; exec is restored from the accumulated mask after the loop.
4068define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
4069; SI-LABEL: global_atomic_max_i64_noret:
4070; SI:       ; %bb.0:
4071; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4072; SI-NEXT:    s_mov_b32 s6, 0
4073; SI-NEXT:    s_mov_b32 s7, 0xf000
4074; SI-NEXT:    s_mov_b32 s4, s6
4075; SI-NEXT:    s_mov_b32 s5, s6
4076; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
4077; SI-NEXT:    s_mov_b64 s[8:9], 0
4078; SI-NEXT:  .LBB80_1: ; %atomicrmw.start
4079; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4080; SI-NEXT:    s_waitcnt vmcnt(0)
4081; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4082; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
4083; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
4084; SI-NEXT:    s_waitcnt expcnt(0)
4085; SI-NEXT:    v_mov_b32_e32 v11, v7
4086; SI-NEXT:    v_mov_b32_e32 v10, v6
4087; SI-NEXT:    v_mov_b32_e32 v9, v5
4088; SI-NEXT:    v_mov_b32_e32 v8, v4
4089; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
4090; SI-NEXT:    s_waitcnt vmcnt(0)
4091; SI-NEXT:    buffer_wbinvl1
4092; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
4093; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
4094; SI-NEXT:    v_mov_b32_e32 v6, v8
4095; SI-NEXT:    v_mov_b32_e32 v7, v9
4096; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
4097; SI-NEXT:    s_cbranch_execnz .LBB80_1
4098; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4099; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
4100; SI-NEXT:    s_waitcnt expcnt(0)
4101; SI-NEXT:    s_setpc_b64 s[30:31]
4102;
4103; VI-LABEL: global_atomic_max_i64_noret:
4104; VI:       ; %bb.0:
4105; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4106; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
4107; VI-NEXT:    s_mov_b64 s[4:5], 0
4108; VI-NEXT:  .LBB80_1: ; %atomicrmw.start
4109; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4110; VI-NEXT:    s_waitcnt vmcnt(0)
4111; VI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4112; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
4113; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
4114; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4115; VI-NEXT:    s_waitcnt vmcnt(0)
4116; VI-NEXT:    buffer_wbinvl1_vol
4117; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4118; VI-NEXT:    v_mov_b32_e32 v7, v5
4119; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4120; VI-NEXT:    v_mov_b32_e32 v6, v4
4121; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4122; VI-NEXT:    s_cbranch_execnz .LBB80_1
4123; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4124; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
4125; VI-NEXT:    s_setpc_b64 s[30:31]
4126;
4127; GFX9-LABEL: global_atomic_max_i64_noret:
4128; GFX9:       ; %bb.0:
4129; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4130; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
4131; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4132; GFX9-NEXT:  .LBB80_1: ; %atomicrmw.start
4133; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4134; GFX9-NEXT:    s_waitcnt vmcnt(0)
4135; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4136; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
4137; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
4138; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
4139; GFX9-NEXT:    s_waitcnt vmcnt(0)
4140; GFX9-NEXT:    buffer_wbinvl1_vol
4141; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4142; GFX9-NEXT:    v_mov_b32_e32 v7, v5
4143; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4144; GFX9-NEXT:    v_mov_b32_e32 v6, v4
4145; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4146; GFX9-NEXT:    s_cbranch_execnz .LBB80_1
4147; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4148; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4149; GFX9-NEXT:    s_setpc_b64 s[30:31]
4150  %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst
4151  ret void
4152}
4153
; Non-returning seq_cst atomicrmw max (signed) at %out + 32 bytes (gep of 4
; i64 elements). The checks expect a compare-and-swap loop
; (%atomicrmw.start / %atomicrmw.end): initial load, signed select of the
; larger value via v_cmp_gt_i64 + v_cndmask, cmpswap_x2 glc, then lanes whose
; returned value matches the expected one are dropped from exec until none
; remain.
; SI and GFX9 are expected to carry offset:32 on both the initial load and the
; cmpswap. VI adds 32 to the address once, before the loop.
4154define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
4155; SI-LABEL: global_atomic_max_i64_noret_offset:
4156; SI:       ; %bb.0:
4157; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4158; SI-NEXT:    s_mov_b32 s6, 0
4159; SI-NEXT:    s_mov_b32 s7, 0xf000
4160; SI-NEXT:    s_mov_b32 s4, s6
4161; SI-NEXT:    s_mov_b32 s5, s6
4162; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
4163; SI-NEXT:    s_mov_b64 s[8:9], 0
4164; SI-NEXT:  .LBB81_1: ; %atomicrmw.start
4165; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4166; SI-NEXT:    s_waitcnt vmcnt(0)
4167; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4168; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
4169; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
4170; SI-NEXT:    s_waitcnt expcnt(0)
4171; SI-NEXT:    v_mov_b32_e32 v11, v7
4172; SI-NEXT:    v_mov_b32_e32 v10, v6
4173; SI-NEXT:    v_mov_b32_e32 v9, v5
4174; SI-NEXT:    v_mov_b32_e32 v8, v4
4175; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
4176; SI-NEXT:    s_waitcnt vmcnt(0)
4177; SI-NEXT:    buffer_wbinvl1
4178; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
4179; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
4180; SI-NEXT:    v_mov_b32_e32 v6, v8
4181; SI-NEXT:    v_mov_b32_e32 v7, v9
4182; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
4183; SI-NEXT:    s_cbranch_execnz .LBB81_1
4184; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4185; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
4186; SI-NEXT:    s_waitcnt expcnt(0)
4187; SI-NEXT:    s_setpc_b64 s[30:31]
4188;
4189; VI-LABEL: global_atomic_max_i64_noret_offset:
4190; VI:       ; %bb.0:
4191; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4192; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
4193; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4194; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
4195; VI-NEXT:    s_mov_b64 s[4:5], 0
4196; VI-NEXT:  .LBB81_1: ; %atomicrmw.start
4197; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4198; VI-NEXT:    s_waitcnt vmcnt(0)
4199; VI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4200; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
4201; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
4202; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4203; VI-NEXT:    s_waitcnt vmcnt(0)
4204; VI-NEXT:    buffer_wbinvl1_vol
4205; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4206; VI-NEXT:    v_mov_b32_e32 v7, v5
4207; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4208; VI-NEXT:    v_mov_b32_e32 v6, v4
4209; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4210; VI-NEXT:    s_cbranch_execnz .LBB81_1
4211; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4212; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
4213; VI-NEXT:    s_setpc_b64 s[30:31]
4214;
4215; GFX9-LABEL: global_atomic_max_i64_noret_offset:
4216; GFX9:       ; %bb.0:
4217; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4218; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
4219; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4220; GFX9-NEXT:  .LBB81_1: ; %atomicrmw.start
4221; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4222; GFX9-NEXT:    s_waitcnt vmcnt(0)
4223; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4224; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
4225; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
4226; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
4227; GFX9-NEXT:    s_waitcnt vmcnt(0)
4228; GFX9-NEXT:    buffer_wbinvl1_vol
4229; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4230; GFX9-NEXT:    v_mov_b32_e32 v7, v5
4231; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4232; GFX9-NEXT:    v_mov_b32_e32 v6, v4
4233; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4234; GFX9-NEXT:    s_cbranch_execnz .LBB81_1
4235; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4236; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4237; GFX9-NEXT:    s_setpc_b64 s[30:31]
4238  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4239  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst
4240  ret void
4241}
4242
; Returning seq_cst atomicrmw max (signed) on a global i64, non-inreg
; arguments. The checks expect a compare-and-swap loop
; (%atomicrmw.start / %atomicrmw.end): each iteration copies the last observed
; value aside as the expected value, selects the larger of it and %in with
; v_cmp_gt_i64 + v_cndmask, issues cmpswap_x2 glc, and drops lanes from exec
; once the value returned by the cmpswap equals the expected value.
; Result placement: SI first moves the pointer and %in out of v[0:3] so the
; cmpswap can return straight into v[0:1]; VI and GFX9 loop on v[4:5] and copy
; it to v[0:1] after the loop.
4243define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
4244; SI-LABEL: global_atomic_max_i64_ret:
4245; SI:       ; %bb.0:
4246; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4247; SI-NEXT:    v_mov_b32_e32 v5, v3
4248; SI-NEXT:    v_mov_b32_e32 v4, v2
4249; SI-NEXT:    v_mov_b32_e32 v7, v1
4250; SI-NEXT:    v_mov_b32_e32 v6, v0
4251; SI-NEXT:    s_mov_b32 s6, 0
4252; SI-NEXT:    s_mov_b32 s7, 0xf000
4253; SI-NEXT:    s_mov_b32 s4, s6
4254; SI-NEXT:    s_mov_b32 s5, s6
4255; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
4256; SI-NEXT:    s_mov_b64 s[8:9], 0
4257; SI-NEXT:  .LBB82_1: ; %atomicrmw.start
4258; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4259; SI-NEXT:    s_waitcnt vmcnt(0)
4260; SI-NEXT:    v_mov_b32_e32 v11, v1
4261; SI-NEXT:    v_mov_b32_e32 v10, v0
4262; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5]
4263; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
4264; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
4265; SI-NEXT:    s_waitcnt expcnt(0)
4266; SI-NEXT:    v_mov_b32_e32 v0, v8
4267; SI-NEXT:    v_mov_b32_e32 v1, v9
4268; SI-NEXT:    v_mov_b32_e32 v2, v10
4269; SI-NEXT:    v_mov_b32_e32 v3, v11
4270; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
4271; SI-NEXT:    s_waitcnt vmcnt(0)
4272; SI-NEXT:    buffer_wbinvl1
4273; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
4274; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
4275; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
4276; SI-NEXT:    s_cbranch_execnz .LBB82_1
4277; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4278; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
4279; SI-NEXT:    s_waitcnt expcnt(0)
4280; SI-NEXT:    s_setpc_b64 s[30:31]
4281;
4282; VI-LABEL: global_atomic_max_i64_ret:
4283; VI:       ; %bb.0:
4284; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4285; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
4286; VI-NEXT:    s_mov_b64 s[4:5], 0
4287; VI-NEXT:  .LBB82_1: ; %atomicrmw.start
4288; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4289; VI-NEXT:    s_waitcnt vmcnt(0)
4290; VI-NEXT:    v_mov_b32_e32 v7, v5
4291; VI-NEXT:    v_mov_b32_e32 v6, v4
4292; VI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4293; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
4294; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
4295; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4296; VI-NEXT:    s_waitcnt vmcnt(0)
4297; VI-NEXT:    buffer_wbinvl1_vol
4298; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4299; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4300; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4301; VI-NEXT:    s_cbranch_execnz .LBB82_1
4302; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4303; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
4304; VI-NEXT:    v_mov_b32_e32 v0, v4
4305; VI-NEXT:    v_mov_b32_e32 v1, v5
4306; VI-NEXT:    s_setpc_b64 s[30:31]
4307;
4308; GFX9-LABEL: global_atomic_max_i64_ret:
4309; GFX9:       ; %bb.0:
4310; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4311; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
4312; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4313; GFX9-NEXT:  .LBB82_1: ; %atomicrmw.start
4314; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4315; GFX9-NEXT:    s_waitcnt vmcnt(0)
4316; GFX9-NEXT:    v_mov_b32_e32 v7, v5
4317; GFX9-NEXT:    v_mov_b32_e32 v6, v4
4318; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4319; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
4320; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
4321; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
4322; GFX9-NEXT:    s_waitcnt vmcnt(0)
4323; GFX9-NEXT:    buffer_wbinvl1_vol
4324; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4325; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4326; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4327; GFX9-NEXT:    s_cbranch_execnz .LBB82_1
4328; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4329; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4330; GFX9-NEXT:    v_mov_b32_e32 v0, v4
4331; GFX9-NEXT:    v_mov_b32_e32 v1, v5
4332; GFX9-NEXT:    s_setpc_b64 s[30:31]
4333  %result = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst
4334  ret i64 %result
4335}
4336
; Returning seq_cst atomicrmw max (signed) at %out + 32 bytes (gep of 4 i64
; elements). The checks expect a compare-and-swap loop
; (%atomicrmw.start / %atomicrmw.end): copy the last observed value aside as
; the expected value, select the larger of it and %in with v_cmp_gt_i64 +
; v_cndmask, issue cmpswap_x2 glc, and drop lanes from exec once the returned
; value equals the expected one.
; Addressing: SI and GFX9 carry offset:32 on both the load and the cmpswap; VI
; computes the address once into v[4:5].
; Result placement: SI and VI have the cmpswap return directly into v[0:1];
; GFX9 loops on v[4:5] and copies it to v[0:1] after the loop.
4337define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
4338; SI-LABEL: global_atomic_max_i64_ret_offset:
4339; SI:       ; %bb.0:
4340; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4341; SI-NEXT:    v_mov_b32_e32 v5, v3
4342; SI-NEXT:    v_mov_b32_e32 v4, v2
4343; SI-NEXT:    v_mov_b32_e32 v7, v1
4344; SI-NEXT:    v_mov_b32_e32 v6, v0
4345; SI-NEXT:    s_mov_b32 s6, 0
4346; SI-NEXT:    s_mov_b32 s7, 0xf000
4347; SI-NEXT:    s_mov_b32 s4, s6
4348; SI-NEXT:    s_mov_b32 s5, s6
4349; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
4350; SI-NEXT:    s_mov_b64 s[8:9], 0
4351; SI-NEXT:  .LBB83_1: ; %atomicrmw.start
4352; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4353; SI-NEXT:    s_waitcnt vmcnt(0)
4354; SI-NEXT:    v_mov_b32_e32 v11, v1
4355; SI-NEXT:    v_mov_b32_e32 v10, v0
4356; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5]
4357; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
4358; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
4359; SI-NEXT:    s_waitcnt expcnt(0)
4360; SI-NEXT:    v_mov_b32_e32 v0, v8
4361; SI-NEXT:    v_mov_b32_e32 v1, v9
4362; SI-NEXT:    v_mov_b32_e32 v2, v10
4363; SI-NEXT:    v_mov_b32_e32 v3, v11
4364; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
4365; SI-NEXT:    s_waitcnt vmcnt(0)
4366; SI-NEXT:    buffer_wbinvl1
4367; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
4368; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
4369; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
4370; SI-NEXT:    s_cbranch_execnz .LBB83_1
4371; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4372; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
4373; SI-NEXT:    s_waitcnt expcnt(0)
4374; SI-NEXT:    s_setpc_b64 s[30:31]
4375;
4376; VI-LABEL: global_atomic_max_i64_ret_offset:
4377; VI:       ; %bb.0:
4378; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4379; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
4380; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
4381; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
4382; VI-NEXT:    s_mov_b64 s[4:5], 0
4383; VI-NEXT:  .LBB83_1: ; %atomicrmw.start
4384; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4385; VI-NEXT:    s_waitcnt vmcnt(0)
4386; VI-NEXT:    v_mov_b32_e32 v9, v1
4387; VI-NEXT:    v_mov_b32_e32 v8, v0
4388; VI-NEXT:    v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
4389; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
4390; VI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
4391; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
4392; VI-NEXT:    s_waitcnt vmcnt(0)
4393; VI-NEXT:    buffer_wbinvl1_vol
4394; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4395; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4396; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4397; VI-NEXT:    s_cbranch_execnz .LBB83_1
4398; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4399; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
4400; VI-NEXT:    s_setpc_b64 s[30:31]
4401;
4402; GFX9-LABEL: global_atomic_max_i64_ret_offset:
4403; GFX9:       ; %bb.0:
4404; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4405; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
4406; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4407; GFX9-NEXT:  .LBB83_1: ; %atomicrmw.start
4408; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4409; GFX9-NEXT:    s_waitcnt vmcnt(0)
4410; GFX9-NEXT:    v_mov_b32_e32 v7, v5
4411; GFX9-NEXT:    v_mov_b32_e32 v6, v4
4412; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4413; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
4414; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
4415; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
4416; GFX9-NEXT:    s_waitcnt vmcnt(0)
4417; GFX9-NEXT:    buffer_wbinvl1_vol
4418; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4419; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4420; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4421; GFX9-NEXT:    s_cbranch_execnz .LBB83_1
4422; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4423; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
4424; GFX9-NEXT:    v_mov_b32_e32 v0, v4
4425; GFX9-NEXT:    v_mov_b32_e32 v1, v5
4426; GFX9-NEXT:    s_setpc_b64 s[30:31]
4427  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4428  %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst
4429  ret i64 %result
4430}
4431
; Non-returning signed 64-bit max on a global (addrspace(1)) pointer. Both the
; pointer and the operand are passed `inreg` under the amdgpu_gfx convention.
; The expected code for all three targets is an initial load followed by a
; 64-bit compare-and-swap retry loop (block %atomicrmw.start) rather than a
; single max instruction. In the SI sequence the buffer instructions take
; s[4:7] as an operand, so the incoming s6/s7 are copied to s34/s35, saved in
; lanes 0/1 of v10 (which is itself spilled to the stack), overwritten, and
; restored with v_readlane_b32 before the return.
4432define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
4433; SI-LABEL: global_atomic_max_i64_noret_scalar:
4434; SI:       ; %bb.0:
4435; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4436; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4437; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
4438; SI-NEXT:    s_mov_b64 exec, s[34:35]
4439; SI-NEXT:    s_waitcnt expcnt(0)
4440; SI-NEXT:    v_writelane_b32 v10, s6, 0
4441; SI-NEXT:    v_writelane_b32 v10, s7, 1
4442; SI-NEXT:    s_mov_b32 s35, s7
4443; SI-NEXT:    s_mov_b32 s34, s6
4444; SI-NEXT:    s_mov_b32 s7, 0xf000
4445; SI-NEXT:    s_mov_b32 s6, -1
4446; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
4447; SI-NEXT:    s_mov_b64 s[36:37], 0
4448; SI-NEXT:    v_mov_b32_e32 v4, s35
4449; SI-NEXT:    v_mov_b32_e32 v5, s34
4450; SI-NEXT:  .LBB84_1: ; %atomicrmw.start
4451; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4452; SI-NEXT:    s_waitcnt vmcnt(0)
4453; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3]
4454; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4455; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
4456; SI-NEXT:    s_waitcnt expcnt(0)
4457; SI-NEXT:    v_mov_b32_e32 v9, v3
4458; SI-NEXT:    v_mov_b32_e32 v8, v2
4459; SI-NEXT:    v_mov_b32_e32 v7, v1
4460; SI-NEXT:    v_mov_b32_e32 v6, v0
4461; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
4462; SI-NEXT:    s_waitcnt vmcnt(0)
4463; SI-NEXT:    buffer_wbinvl1
4464; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
4465; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
4466; SI-NEXT:    v_mov_b32_e32 v2, v6
4467; SI-NEXT:    v_mov_b32_e32 v3, v7
4468; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
4469; SI-NEXT:    s_cbranch_execnz .LBB84_1
4470; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4471; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
4472; SI-NEXT:    v_readlane_b32 s7, v10, 1
4473; SI-NEXT:    v_readlane_b32 s6, v10, 0
4474; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4475; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
4476; SI-NEXT:    s_mov_b64 exec, s[34:35]
4477; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4478; SI-NEXT:    s_setpc_b64 s[30:31]
4479;
4480; VI-LABEL: global_atomic_max_i64_noret_scalar:
4481; VI:       ; %bb.0:
4482; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4483; VI-NEXT:    v_mov_b32_e32 v0, s4
4484; VI-NEXT:    v_mov_b32_e32 v1, s5
4485; VI-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
4486; VI-NEXT:    v_mov_b32_e32 v4, s4
4487; VI-NEXT:    s_mov_b64 s[34:35], 0
4488; VI-NEXT:    v_mov_b32_e32 v6, s7
4489; VI-NEXT:    v_mov_b32_e32 v7, s6
4490; VI-NEXT:    v_mov_b32_e32 v5, s5
4491; VI-NEXT:  .LBB84_1: ; %atomicrmw.start
4492; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4493; VI-NEXT:    s_waitcnt vmcnt(0)
4494; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4495; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
4496; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
4497; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4498; VI-NEXT:    s_waitcnt vmcnt(0)
4499; VI-NEXT:    buffer_wbinvl1_vol
4500; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4501; VI-NEXT:    v_mov_b32_e32 v3, v1
4502; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4503; VI-NEXT:    v_mov_b32_e32 v2, v0
4504; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4505; VI-NEXT:    s_cbranch_execnz .LBB84_1
4506; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4507; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
4508; VI-NEXT:    s_setpc_b64 s[30:31]
4509;
4510; GFX9-LABEL: global_atomic_max_i64_noret_scalar:
4511; GFX9:       ; %bb.0:
4512; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4513; GFX9-NEXT:    v_mov_b32_e32 v4, 0
4514; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
4515; GFX9-NEXT:    s_mov_b64 s[34:35], 0
4516; GFX9-NEXT:    v_mov_b32_e32 v5, s7
4517; GFX9-NEXT:    v_mov_b32_e32 v6, s6
4518; GFX9-NEXT:  .LBB84_1: ; %atomicrmw.start
4519; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4520; GFX9-NEXT:    s_waitcnt vmcnt(0)
4521; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4522; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
4523; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
4524; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
4525; GFX9-NEXT:    s_waitcnt vmcnt(0)
4526; GFX9-NEXT:    buffer_wbinvl1_vol
4527; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4528; GFX9-NEXT:    v_mov_b32_e32 v3, v1
4529; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4530; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4531; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4532; GFX9-NEXT:    s_cbranch_execnz .LBB84_1
4533; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4534; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
4535; GFX9-NEXT:    s_setpc_b64 s[30:31]
4536  %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst ; old value is not used (noret variant)
4537  ret void
4538}
4539
; Same as the plain noret_scalar case, but the atomic address is element 4 of
; an i64 array at %out, i.e. 32 bytes past the incoming pointer. The SI and
; GFX9 sequences are expected to fold that displacement into the load and the
; cmpswap as an `offset:32` immediate, while the VI sequence is expected to add
; 32 to the pointer up front with s_add_u32/s_addc_u32 and use the sum as the
; flat address. The max itself is again expected as a compare-and-swap retry
; loop on all three targets.
4540define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
4541; SI-LABEL: global_atomic_max_i64_noret_offset_scalar:
4542; SI:       ; %bb.0:
4543; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4544; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4545; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
4546; SI-NEXT:    s_mov_b64 exec, s[34:35]
4547; SI-NEXT:    s_waitcnt expcnt(0)
4548; SI-NEXT:    v_writelane_b32 v10, s6, 0
4549; SI-NEXT:    v_writelane_b32 v10, s7, 1
4550; SI-NEXT:    s_mov_b32 s35, s7
4551; SI-NEXT:    s_mov_b32 s34, s6
4552; SI-NEXT:    s_mov_b32 s7, 0xf000
4553; SI-NEXT:    s_mov_b32 s6, -1
4554; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
4555; SI-NEXT:    s_mov_b64 s[36:37], 0
4556; SI-NEXT:    v_mov_b32_e32 v4, s35
4557; SI-NEXT:    v_mov_b32_e32 v5, s34
4558; SI-NEXT:  .LBB85_1: ; %atomicrmw.start
4559; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4560; SI-NEXT:    s_waitcnt vmcnt(0)
4561; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3]
4562; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4563; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
4564; SI-NEXT:    s_waitcnt expcnt(0)
4565; SI-NEXT:    v_mov_b32_e32 v9, v3
4566; SI-NEXT:    v_mov_b32_e32 v8, v2
4567; SI-NEXT:    v_mov_b32_e32 v7, v1
4568; SI-NEXT:    v_mov_b32_e32 v6, v0
4569; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
4570; SI-NEXT:    s_waitcnt vmcnt(0)
4571; SI-NEXT:    buffer_wbinvl1
4572; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
4573; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
4574; SI-NEXT:    v_mov_b32_e32 v2, v6
4575; SI-NEXT:    v_mov_b32_e32 v3, v7
4576; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
4577; SI-NEXT:    s_cbranch_execnz .LBB85_1
4578; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4579; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
4580; SI-NEXT:    v_readlane_b32 s7, v10, 1
4581; SI-NEXT:    v_readlane_b32 s6, v10, 0
4582; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4583; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
4584; SI-NEXT:    s_mov_b64 exec, s[34:35]
4585; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4586; SI-NEXT:    s_setpc_b64 s[30:31]
4587;
4588; VI-LABEL: global_atomic_max_i64_noret_offset_scalar:
4589; VI:       ; %bb.0:
4590; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4591; VI-NEXT:    s_add_u32 s34, s4, 32
4592; VI-NEXT:    s_addc_u32 s35, s5, 0
4593; VI-NEXT:    v_mov_b32_e32 v4, s34
4594; VI-NEXT:    v_mov_b32_e32 v5, s35
4595; VI-NEXT:    flat_load_dwordx2 v[2:3], v[4:5]
4596; VI-NEXT:    s_mov_b64 s[34:35], 0
4597; VI-NEXT:    v_mov_b32_e32 v6, s7
4598; VI-NEXT:    v_mov_b32_e32 v7, s6
4599; VI-NEXT:  .LBB85_1: ; %atomicrmw.start
4600; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4601; VI-NEXT:    s_waitcnt vmcnt(0)
4602; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4603; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
4604; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
4605; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4606; VI-NEXT:    s_waitcnt vmcnt(0)
4607; VI-NEXT:    buffer_wbinvl1_vol
4608; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4609; VI-NEXT:    v_mov_b32_e32 v3, v1
4610; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4611; VI-NEXT:    v_mov_b32_e32 v2, v0
4612; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4613; VI-NEXT:    s_cbranch_execnz .LBB85_1
4614; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4615; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
4616; VI-NEXT:    s_setpc_b64 s[30:31]
4617;
4618; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar:
4619; GFX9:       ; %bb.0:
4620; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4621; GFX9-NEXT:    v_mov_b32_e32 v4, 0
4622; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
4623; GFX9-NEXT:    s_mov_b64 s[34:35], 0
4624; GFX9-NEXT:    v_mov_b32_e32 v5, s7
4625; GFX9-NEXT:    v_mov_b32_e32 v6, s6
4626; GFX9-NEXT:  .LBB85_1: ; %atomicrmw.start
4627; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4628; GFX9-NEXT:    s_waitcnt vmcnt(0)
4629; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4630; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
4631; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
4632; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
4633; GFX9-NEXT:    s_waitcnt vmcnt(0)
4634; GFX9-NEXT:    buffer_wbinvl1_vol
4635; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4636; GFX9-NEXT:    v_mov_b32_e32 v3, v1
4637; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4638; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4639; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4640; GFX9-NEXT:    s_cbranch_execnz .LBB85_1
4641; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4642; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
4643; GFX9-NEXT:    s_setpc_b64 s[30:31]
4644  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 ; element 4 of i64 = 32 bytes past %out
4645  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst ; old value is not used (noret variant)
4646  ret void
4647}
4648
; Returning variant of the scalar-argument max test: the i64 that was in memory
; before the operation is the function result. As in the noret case, every
; target is expected to emit a compare-and-swap retry loop. Here the cmpswap
; writes the value it read back into v[0:1], the loop header copies v[0:1]
; into a second register pair to serve as the expected value of the next
; attempt, and v[0:1] is left untouched between the loop exit and
; s_setpc_b64. The SI sequence additionally saves and restores s6/s7 through
; lanes of v10 because it overwrites them while building s[4:7].
4649define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
4650; SI-LABEL: global_atomic_max_i64_ret_scalar:
4651; SI:       ; %bb.0:
4652; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4653; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4654; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
4655; SI-NEXT:    s_mov_b64 exec, s[34:35]
4656; SI-NEXT:    s_waitcnt expcnt(0)
4657; SI-NEXT:    v_writelane_b32 v10, s6, 0
4658; SI-NEXT:    v_writelane_b32 v10, s7, 1
4659; SI-NEXT:    s_mov_b32 s35, s7
4660; SI-NEXT:    s_mov_b32 s34, s6
4661; SI-NEXT:    s_mov_b32 s7, 0xf000
4662; SI-NEXT:    s_mov_b32 s6, -1
4663; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
4664; SI-NEXT:    s_mov_b64 s[36:37], 0
4665; SI-NEXT:    v_mov_b32_e32 v4, s35
4666; SI-NEXT:    v_mov_b32_e32 v5, s34
4667; SI-NEXT:  .LBB86_1: ; %atomicrmw.start
4668; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4669; SI-NEXT:    s_waitcnt vmcnt(0)
4670; SI-NEXT:    v_mov_b32_e32 v9, v1
4671; SI-NEXT:    v_mov_b32_e32 v8, v0
4672; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
4673; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
4674; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
4675; SI-NEXT:    s_waitcnt expcnt(0)
4676; SI-NEXT:    v_mov_b32_e32 v0, v6
4677; SI-NEXT:    v_mov_b32_e32 v1, v7
4678; SI-NEXT:    v_mov_b32_e32 v2, v8
4679; SI-NEXT:    v_mov_b32_e32 v3, v9
4680; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
4681; SI-NEXT:    s_waitcnt vmcnt(0)
4682; SI-NEXT:    buffer_wbinvl1
4683; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4684; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
4685; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
4686; SI-NEXT:    s_cbranch_execnz .LBB86_1
4687; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4688; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
4689; SI-NEXT:    v_readlane_b32 s7, v10, 1
4690; SI-NEXT:    v_readlane_b32 s6, v10, 0
4691; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4692; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
4693; SI-NEXT:    s_mov_b64 exec, s[34:35]
4694; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4695; SI-NEXT:    s_setpc_b64 s[30:31]
4696;
4697; VI-LABEL: global_atomic_max_i64_ret_scalar:
4698; VI:       ; %bb.0:
4699; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4700; VI-NEXT:    v_mov_b32_e32 v0, s4
4701; VI-NEXT:    v_mov_b32_e32 v1, s5
4702; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
4703; VI-NEXT:    v_mov_b32_e32 v2, s4
4704; VI-NEXT:    s_mov_b64 s[34:35], 0
4705; VI-NEXT:    v_mov_b32_e32 v4, s7
4706; VI-NEXT:    v_mov_b32_e32 v5, s6
4707; VI-NEXT:    v_mov_b32_e32 v3, s5
4708; VI-NEXT:  .LBB86_1: ; %atomicrmw.start
4709; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4710; VI-NEXT:    s_waitcnt vmcnt(0)
4711; VI-NEXT:    v_mov_b32_e32 v9, v1
4712; VI-NEXT:    v_mov_b32_e32 v8, v0
4713; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
4714; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
4715; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
4716; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
4717; VI-NEXT:    s_waitcnt vmcnt(0)
4718; VI-NEXT:    buffer_wbinvl1_vol
4719; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4720; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4721; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4722; VI-NEXT:    s_cbranch_execnz .LBB86_1
4723; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4724; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
4725; VI-NEXT:    s_setpc_b64 s[30:31]
4726;
4727; GFX9-LABEL: global_atomic_max_i64_ret_scalar:
4728; GFX9:       ; %bb.0:
4729; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4730; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4731; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
4732; GFX9-NEXT:    s_mov_b64 s[34:35], 0
4733; GFX9-NEXT:    v_mov_b32_e32 v3, s7
4734; GFX9-NEXT:    v_mov_b32_e32 v4, s6
4735; GFX9-NEXT:  .LBB86_1: ; %atomicrmw.start
4736; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4737; GFX9-NEXT:    s_waitcnt vmcnt(0)
4738; GFX9-NEXT:    v_mov_b32_e32 v8, v1
4739; GFX9-NEXT:    v_mov_b32_e32 v7, v0
4740; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8]
4741; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v8, vcc
4742; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v7, vcc
4743; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
4744; GFX9-NEXT:    s_waitcnt vmcnt(0)
4745; GFX9-NEXT:    buffer_wbinvl1_vol
4746; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
4747; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4748; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4749; GFX9-NEXT:    s_cbranch_execnz .LBB86_1
4750; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4751; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
4752; GFX9-NEXT:    s_setpc_b64 s[30:31]
4753  %result = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst ; yields the value held in memory before the max
4754  ret i64 %result
4755}
4756
; Returning max with `inreg` arguments and a constant displacement: the atomic
; address is element 4 of an i64 array at %out (32 bytes past the pointer) and
; the previous memory value is returned. Expected lowering is a
; compare-and-swap retry loop whose result lands in v[0:1]. The SI and GFX9
; sequences carry the displacement as an `offset:32` immediate on the load and
; on the cmpswap; the VI sequence adds 32 to s[4:5] with s_add_u32/s_addc_u32
; before the flat accesses.
4757define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
4758; SI-LABEL: global_atomic_max_i64_ret_offset_scalar:
4759; SI:       ; %bb.0:
4760; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4761; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4762; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
4763; SI-NEXT:    s_mov_b64 exec, s[34:35]
4764; SI-NEXT:    s_waitcnt expcnt(0)
4765; SI-NEXT:    v_writelane_b32 v10, s6, 0
4766; SI-NEXT:    v_writelane_b32 v10, s7, 1
4767; SI-NEXT:    s_mov_b32 s35, s7
4768; SI-NEXT:    s_mov_b32 s34, s6
4769; SI-NEXT:    s_mov_b32 s7, 0xf000
4770; SI-NEXT:    s_mov_b32 s6, -1
4771; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
4772; SI-NEXT:    s_mov_b64 s[36:37], 0
4773; SI-NEXT:    v_mov_b32_e32 v4, s35
4774; SI-NEXT:    v_mov_b32_e32 v5, s34
4775; SI-NEXT:  .LBB87_1: ; %atomicrmw.start
4776; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4777; SI-NEXT:    s_waitcnt vmcnt(0)
4778; SI-NEXT:    v_mov_b32_e32 v9, v1
4779; SI-NEXT:    v_mov_b32_e32 v8, v0
4780; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
4781; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
4782; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
4783; SI-NEXT:    s_waitcnt expcnt(0)
4784; SI-NEXT:    v_mov_b32_e32 v0, v6
4785; SI-NEXT:    v_mov_b32_e32 v1, v7
4786; SI-NEXT:    v_mov_b32_e32 v2, v8
4787; SI-NEXT:    v_mov_b32_e32 v3, v9
4788; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
4789; SI-NEXT:    s_waitcnt vmcnt(0)
4790; SI-NEXT:    buffer_wbinvl1
4791; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4792; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
4793; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
4794; SI-NEXT:    s_cbranch_execnz .LBB87_1
4795; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4796; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
4797; SI-NEXT:    v_readlane_b32 s7, v10, 1
4798; SI-NEXT:    v_readlane_b32 s6, v10, 0
4799; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
4800; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
4801; SI-NEXT:    s_mov_b64 exec, s[34:35]
4802; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
4803; SI-NEXT:    s_setpc_b64 s[30:31]
4804;
4805; VI-LABEL: global_atomic_max_i64_ret_offset_scalar:
4806; VI:       ; %bb.0:
4807; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4808; VI-NEXT:    s_add_u32 s34, s4, 32
4809; VI-NEXT:    s_addc_u32 s35, s5, 0
4810; VI-NEXT:    v_mov_b32_e32 v2, s34
4811; VI-NEXT:    v_mov_b32_e32 v3, s35
4812; VI-NEXT:    flat_load_dwordx2 v[0:1], v[2:3]
4813; VI-NEXT:    s_mov_b64 s[34:35], 0
4814; VI-NEXT:    v_mov_b32_e32 v4, s7
4815; VI-NEXT:    v_mov_b32_e32 v5, s6
4816; VI-NEXT:  .LBB87_1: ; %atomicrmw.start
4817; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4818; VI-NEXT:    s_waitcnt vmcnt(0)
4819; VI-NEXT:    v_mov_b32_e32 v9, v1
4820; VI-NEXT:    v_mov_b32_e32 v8, v0
4821; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
4822; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
4823; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
4824; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
4825; VI-NEXT:    s_waitcnt vmcnt(0)
4826; VI-NEXT:    buffer_wbinvl1_vol
4827; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4828; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4829; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4830; VI-NEXT:    s_cbranch_execnz .LBB87_1
4831; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4832; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
4833; VI-NEXT:    s_setpc_b64 s[30:31]
4834;
4835; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar:
4836; GFX9:       ; %bb.0:
4837; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4838; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4839; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
4840; GFX9-NEXT:    s_mov_b64 s[34:35], 0
4841; GFX9-NEXT:    v_mov_b32_e32 v3, s7
4842; GFX9-NEXT:    v_mov_b32_e32 v4, s6
4843; GFX9-NEXT:  .LBB87_1: ; %atomicrmw.start
4844; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4845; GFX9-NEXT:    s_waitcnt vmcnt(0)
4846; GFX9-NEXT:    v_mov_b32_e32 v8, v1
4847; GFX9-NEXT:    v_mov_b32_e32 v7, v0
4848; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8]
4849; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v8, vcc
4850; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v7, vcc
4851; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
4852; GFX9-NEXT:    s_waitcnt vmcnt(0)
4853; GFX9-NEXT:    buffer_wbinvl1_vol
4854; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
4855; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
4856; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
4857; GFX9-NEXT:    s_cbranch_execnz .LBB87_1
4858; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4859; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
4860; GFX9-NEXT:    s_setpc_b64 s[30:31]
4861  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 ; element 4 of i64 = 32 bytes past %out
4862  %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst ; yields the value held in memory before the max
4863  ret i64 %result
4864}
4865
; Kernel (amdgpu_kernel) form of the non-returning max with a run-time index
; plus a constant displacement: the atomic address is
; %out + %index * 8 bytes + 32 bytes. The expected code fetches the three
; arguments with s_load_* relative to s[4:5] (presumably the kernel-argument
; base; confirm against the AMDGPU ABI), scales the index with s_lshl_b64 by 3,
; and reads the initial memory value with a scalar s_load_dwordx2 instead of a
; vector load. The max itself is expected as a compare-and-swap retry loop on
; all three targets, followed directly by s_endpgm.
4866define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
4867; SI-LABEL: atomic_max_i64_addr64_offset:
4868; SI:       ; %bb.0: ; %entry
4869; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
4870; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4871; SI-NEXT:    s_waitcnt lgkmcnt(0)
4872; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
4873; SI-NEXT:    s_add_u32 s4, s0, s4
4874; SI-NEXT:    s_addc_u32 s5, s1, s5
4875; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
4876; SI-NEXT:    s_mov_b64 s[0:1], 0
4877; SI-NEXT:    s_mov_b32 s7, 0xf000
4878; SI-NEXT:    v_mov_b32_e32 v4, s3
4879; SI-NEXT:    v_mov_b32_e32 v5, s2
4880; SI-NEXT:    s_waitcnt lgkmcnt(0)
4881; SI-NEXT:    v_mov_b32_e32 v2, s8
4882; SI-NEXT:    v_mov_b32_e32 v3, s9
4883; SI-NEXT:    s_mov_b32 s6, -1
4884; SI-NEXT:  .LBB88_1: ; %atomicrmw.start
4885; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
4886; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
4887; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4888; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
4889; SI-NEXT:    s_waitcnt expcnt(0)
4890; SI-NEXT:    v_mov_b32_e32 v9, v3
4891; SI-NEXT:    v_mov_b32_e32 v8, v2
4892; SI-NEXT:    v_mov_b32_e32 v7, v1
4893; SI-NEXT:    v_mov_b32_e32 v6, v0
4894; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
4895; SI-NEXT:    s_waitcnt vmcnt(0)
4896; SI-NEXT:    buffer_wbinvl1
4897; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
4898; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
4899; SI-NEXT:    v_mov_b32_e32 v2, v6
4900; SI-NEXT:    v_mov_b32_e32 v3, v7
4901; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
4902; SI-NEXT:    s_cbranch_execnz .LBB88_1
4903; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
4904; SI-NEXT:    s_endpgm
4905;
4906; VI-LABEL: atomic_max_i64_addr64_offset:
4907; VI:       ; %bb.0: ; %entry
4908; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4909; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4910; VI-NEXT:    s_mov_b64 s[4:5], 0
4911; VI-NEXT:    s_waitcnt lgkmcnt(0)
4912; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
4913; VI-NEXT:    s_add_u32 s0, s0, s6
4914; VI-NEXT:    s_addc_u32 s1, s1, s7
4915; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
4916; VI-NEXT:    s_add_u32 s0, s0, 32
4917; VI-NEXT:    s_addc_u32 s1, s1, 0
4918; VI-NEXT:    v_mov_b32_e32 v5, s1
4919; VI-NEXT:    v_mov_b32_e32 v6, s3
4920; VI-NEXT:    s_waitcnt lgkmcnt(0)
4921; VI-NEXT:    v_mov_b32_e32 v2, s6
4922; VI-NEXT:    v_mov_b32_e32 v7, s2
4923; VI-NEXT:    v_mov_b32_e32 v3, s7
4924; VI-NEXT:    v_mov_b32_e32 v4, s0
4925; VI-NEXT:  .LBB88_1: ; %atomicrmw.start
4926; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
4927; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
4928; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
4929; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
4930; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4931; VI-NEXT:    s_waitcnt vmcnt(0)
4932; VI-NEXT:    buffer_wbinvl1_vol
4933; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4934; VI-NEXT:    v_mov_b32_e32 v3, v1
4935; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4936; VI-NEXT:    v_mov_b32_e32 v2, v0
4937; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4938; VI-NEXT:    s_cbranch_execnz .LBB88_1
4939; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
4940; VI-NEXT:    s_endpgm
4941;
4942; GFX9-LABEL: atomic_max_i64_addr64_offset:
4943; GFX9:       ; %bb.0: ; %entry
4944; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4945; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4946; GFX9-NEXT:    v_mov_b32_e32 v6, 0
4947; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4948; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
4949; GFX9-NEXT:    s_add_u32 s0, s0, s4
4950; GFX9-NEXT:    s_addc_u32 s1, s1, s5
4951; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
4952; GFX9-NEXT:    s_mov_b64 s[4:5], 0
4953; GFX9-NEXT:    v_mov_b32_e32 v4, s3
4954; GFX9-NEXT:    v_mov_b32_e32 v5, s2
4955; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4956; GFX9-NEXT:    v_mov_b32_e32 v2, s6
4957; GFX9-NEXT:    v_mov_b32_e32 v3, s7
4958; GFX9-NEXT:  .LBB88_1: ; %atomicrmw.start
4959; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
4960; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
4961; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
4962; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
4963; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc
4964; GFX9-NEXT:    s_waitcnt vmcnt(0)
4965; GFX9-NEXT:    buffer_wbinvl1_vol
4966; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4967; GFX9-NEXT:    v_mov_b32_e32 v3, v1
4968; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
4969; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4970; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
4971; GFX9-NEXT:    s_cbranch_execnz .LBB88_1
4972; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
4973; GFX9-NEXT:    s_endpgm
4974entry:
4975  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index ; run-time element index, 8 bytes per element
4976  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 ; plus 4 more elements = 32 bytes
4977  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst ; old value is not used
4978  ret void
4979}
4980
; Kernel form of the returning max with a run-time index plus a constant
; displacement. The atomic address is %out + %index * 8 bytes + 32 bytes, and
; the value that was in memory before the operation is stored to %out2. The
; expected code loads all four arguments with one s_load_dwordx8, reads the
; initial memory value with a scalar s_load_dwordx2, and runs a
; compare-and-swap retry loop. After the loop it restores exec from the
; accumulated mask and writes the final cmpswap result to %out2 with
; buffer_store_dwordx2 on SI, flat_store_dwordx2 on VI, and
; global_store_dwordx2 on GFX9.
4981define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
4982; SI-LABEL: atomic_max_i64_ret_addr64_offset:
4983; SI:       ; %bb.0: ; %entry
4984; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
4985; SI-NEXT:    s_waitcnt lgkmcnt(0)
4986; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
4987; SI-NEXT:    s_add_u32 s8, s0, s6
4988; SI-NEXT:    s_addc_u32 s9, s1, s7
4989; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
4990; SI-NEXT:    s_mov_b64 s[0:1], 0
4991; SI-NEXT:    s_mov_b32 s11, 0xf000
4992; SI-NEXT:    v_mov_b32_e32 v8, s5
4993; SI-NEXT:    v_mov_b32_e32 v9, s4
4994; SI-NEXT:    s_waitcnt lgkmcnt(0)
4995; SI-NEXT:    v_mov_b32_e32 v2, s6
4996; SI-NEXT:    v_mov_b32_e32 v3, s7
4997; SI-NEXT:    s_mov_b32 s10, -1
4998; SI-NEXT:  .LBB89_1: ; %atomicrmw.start
4999; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5000; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
5001; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
5002; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
5003; SI-NEXT:    s_waitcnt expcnt(0)
5004; SI-NEXT:    v_mov_b32_e32 v7, v3
5005; SI-NEXT:    v_mov_b32_e32 v6, v2
5006; SI-NEXT:    v_mov_b32_e32 v5, v1
5007; SI-NEXT:    v_mov_b32_e32 v4, v0
5008; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
5009; SI-NEXT:    s_waitcnt vmcnt(0)
5010; SI-NEXT:    buffer_wbinvl1
5011; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
5012; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5013; SI-NEXT:    v_mov_b32_e32 v2, v4
5014; SI-NEXT:    v_mov_b32_e32 v3, v5
5015; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5016; SI-NEXT:    s_cbranch_execnz .LBB89_1
5017; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5018; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
5019; SI-NEXT:    s_mov_b32 s7, 0xf000
5020; SI-NEXT:    s_mov_b32 s6, -1
5021; SI-NEXT:    s_mov_b32 s4, s2
5022; SI-NEXT:    s_mov_b32 s5, s3
5023; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
5024; SI-NEXT:    s_endpgm
5025;
5026; VI-LABEL: atomic_max_i64_ret_addr64_offset:
5027; VI:       ; %bb.0: ; %entry
5028; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
5029; VI-NEXT:    s_mov_b64 s[8:9], 0
5030; VI-NEXT:    s_waitcnt lgkmcnt(0)
5031; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
5032; VI-NEXT:    s_add_u32 s0, s0, s6
5033; VI-NEXT:    s_addc_u32 s1, s1, s7
5034; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
5035; VI-NEXT:    s_add_u32 s0, s0, 32
5036; VI-NEXT:    s_addc_u32 s1, s1, 0
5037; VI-NEXT:    v_mov_b32_e32 v0, s0
5038; VI-NEXT:    v_mov_b32_e32 v4, s5
5039; VI-NEXT:    s_waitcnt lgkmcnt(0)
5040; VI-NEXT:    v_mov_b32_e32 v2, s6
5041; VI-NEXT:    v_mov_b32_e32 v5, s4
5042; VI-NEXT:    v_mov_b32_e32 v3, s7
5043; VI-NEXT:    v_mov_b32_e32 v1, s1
5044; VI-NEXT:  .LBB89_1: ; %atomicrmw.start
5045; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5046; VI-NEXT:    v_mov_b32_e32 v9, v3
5047; VI-NEXT:    v_mov_b32_e32 v8, v2
5048; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
5049; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
5050; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
5051; VI-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
5052; VI-NEXT:    s_waitcnt vmcnt(0)
5053; VI-NEXT:    buffer_wbinvl1_vol
5054; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
5055; VI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5056; VI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5057; VI-NEXT:    s_cbranch_execnz .LBB89_1
5058; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5059; VI-NEXT:    s_or_b64 exec, exec, s[8:9]
5060; VI-NEXT:    v_mov_b32_e32 v0, s2
5061; VI-NEXT:    v_mov_b32_e32 v1, s3
5062; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
5063; VI-NEXT:    s_endpgm
5064;
5065; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
5066; GFX9:       ; %bb.0: ; %entry
5067; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
5068; GFX9-NEXT:    s_mov_b64 s[2:3], 0
5069; GFX9-NEXT:    v_mov_b32_e32 v4, 0
5070; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5071; GFX9-NEXT:    s_lshl_b64 s[0:1], s[14:15], 3
5072; GFX9-NEXT:    s_add_u32 s0, s8, s0
5073; GFX9-NEXT:    s_addc_u32 s1, s9, s1
5074; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x20
5075; GFX9-NEXT:    v_mov_b32_e32 v2, s13
5076; GFX9-NEXT:    v_mov_b32_e32 v3, s12
5077; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5078; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5079; GFX9-NEXT:    v_mov_b32_e32 v1, s5
5080; GFX9-NEXT:  .LBB89_1: ; %atomicrmw.start
5081; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5082; GFX9-NEXT:    v_mov_b32_e32 v8, v1
5083; GFX9-NEXT:    v_mov_b32_e32 v7, v0
5084; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8]
5085; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
5086; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5087; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
5088; GFX9-NEXT:    s_waitcnt vmcnt(0)
5089; GFX9-NEXT:    buffer_wbinvl1_vol
5090; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
5091; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
5092; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
5093; GFX9-NEXT:    s_cbranch_execnz .LBB89_1
5094; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5095; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5096; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5097; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[10:11]
5098; GFX9-NEXT:    s_endpgm
5099entry:
5100  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index ; run-time element index, 8 bytes per element
5101  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 ; plus 4 more elements = 32 bytes
5102  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst ; yields the value held in memory before the max
5103  store i64 %tmp0, ptr addrspace(1) %out2 ; publish the previous value so the returning form is exercised
5104  ret void
5105}
5106
5107define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
5108; SI-LABEL: atomic_max_i64_addr64:
5109; SI:       ; %bb.0: ; %entry
5110; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
5111; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5112; SI-NEXT:    s_waitcnt lgkmcnt(0)
5113; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
5114; SI-NEXT:    s_add_u32 s4, s0, s4
5115; SI-NEXT:    s_addc_u32 s5, s1, s5
5116; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
5117; SI-NEXT:    s_mov_b64 s[0:1], 0
5118; SI-NEXT:    s_mov_b32 s7, 0xf000
5119; SI-NEXT:    v_mov_b32_e32 v4, s3
5120; SI-NEXT:    v_mov_b32_e32 v5, s2
5121; SI-NEXT:    s_waitcnt lgkmcnt(0)
5122; SI-NEXT:    v_mov_b32_e32 v2, s8
5123; SI-NEXT:    v_mov_b32_e32 v3, s9
5124; SI-NEXT:    s_mov_b32 s6, -1
5125; SI-NEXT:  .LBB90_1: ; %atomicrmw.start
5126; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5127; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
5128; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
5129; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
5130; SI-NEXT:    s_waitcnt expcnt(0)
5131; SI-NEXT:    v_mov_b32_e32 v9, v3
5132; SI-NEXT:    v_mov_b32_e32 v8, v2
5133; SI-NEXT:    v_mov_b32_e32 v7, v1
5134; SI-NEXT:    v_mov_b32_e32 v6, v0
5135; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
5136; SI-NEXT:    s_waitcnt vmcnt(0)
5137; SI-NEXT:    buffer_wbinvl1
5138; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5139; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5140; SI-NEXT:    v_mov_b32_e32 v2, v6
5141; SI-NEXT:    v_mov_b32_e32 v3, v7
5142; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5143; SI-NEXT:    s_cbranch_execnz .LBB90_1
5144; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5145; SI-NEXT:    s_endpgm
5146;
5147; VI-LABEL: atomic_max_i64_addr64:
5148; VI:       ; %bb.0: ; %entry
5149; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5150; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5151; VI-NEXT:    s_waitcnt lgkmcnt(0)
5152; VI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
5153; VI-NEXT:    s_add_u32 s4, s0, s4
5154; VI-NEXT:    s_addc_u32 s5, s1, s5
5155; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
5156; VI-NEXT:    v_mov_b32_e32 v4, s4
5157; VI-NEXT:    s_mov_b64 s[0:1], 0
5158; VI-NEXT:    v_mov_b32_e32 v6, s3
5159; VI-NEXT:    v_mov_b32_e32 v7, s2
5160; VI-NEXT:    s_waitcnt lgkmcnt(0)
5161; VI-NEXT:    v_mov_b32_e32 v2, s6
5162; VI-NEXT:    v_mov_b32_e32 v3, s7
5163; VI-NEXT:    v_mov_b32_e32 v5, s5
5164; VI-NEXT:  .LBB90_1: ; %atomicrmw.start
5165; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5166; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
5167; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
5168; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
5169; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5170; VI-NEXT:    s_waitcnt vmcnt(0)
5171; VI-NEXT:    buffer_wbinvl1_vol
5172; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5173; VI-NEXT:    v_mov_b32_e32 v3, v1
5174; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5175; VI-NEXT:    v_mov_b32_e32 v2, v0
5176; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5177; VI-NEXT:    s_cbranch_execnz .LBB90_1
5178; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5179; VI-NEXT:    s_endpgm
5180;
5181; GFX9-LABEL: atomic_max_i64_addr64:
5182; GFX9:       ; %bb.0: ; %entry
5183; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5184; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5185; GFX9-NEXT:    v_mov_b32_e32 v6, 0
5186; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5187; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
5188; GFX9-NEXT:    s_add_u32 s0, s0, s4
5189; GFX9-NEXT:    s_addc_u32 s1, s1, s5
5190; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
5191; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5192; GFX9-NEXT:    v_mov_b32_e32 v4, s3
5193; GFX9-NEXT:    v_mov_b32_e32 v5, s2
5194; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5195; GFX9-NEXT:    v_mov_b32_e32 v2, s6
5196; GFX9-NEXT:    v_mov_b32_e32 v3, s7
5197; GFX9-NEXT:  .LBB90_1: ; %atomicrmw.start
5198; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5199; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
5200; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
5201; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
5202; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
5203; GFX9-NEXT:    s_waitcnt vmcnt(0)
5204; GFX9-NEXT:    buffer_wbinvl1_vol
5205; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5206; GFX9-NEXT:    v_mov_b32_e32 v3, v1
5207; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5208; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5209; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5210; GFX9-NEXT:    s_cbranch_execnz .LBB90_1
5211; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5212; GFX9-NEXT:    s_endpgm
5213entry:
5214  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5215  %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst
5216  ret void
5217}
5218
; Kernel variant: seq_cst signed-max atomicrmw on element %index of %out, with
; the value returned by the atomicrmw stored to %out2. The expected code on all
; three targets is a compare-and-swap retry loop (atomicrmw.start/atomicrmw.end)
; followed by a store of the value the loop produced.
5219define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
5220; SI-LABEL: atomic_max_i64_ret_addr64:
5221; SI:       ; %bb.0: ; %entry
5222; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
5223; SI-NEXT:    s_waitcnt lgkmcnt(0)
5224; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
5225; SI-NEXT:    s_add_u32 s8, s0, s6
5226; SI-NEXT:    s_addc_u32 s9, s1, s7
5227; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
5228; SI-NEXT:    s_mov_b64 s[0:1], 0
5229; SI-NEXT:    s_mov_b32 s11, 0xf000
5230; SI-NEXT:    v_mov_b32_e32 v8, s5
5231; SI-NEXT:    v_mov_b32_e32 v9, s4
5232; SI-NEXT:    s_waitcnt lgkmcnt(0)
5233; SI-NEXT:    v_mov_b32_e32 v2, s6
5234; SI-NEXT:    v_mov_b32_e32 v3, s7
5235; SI-NEXT:    s_mov_b32 s10, -1
5236; SI-NEXT:  .LBB91_1: ; %atomicrmw.start
5237; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5238; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
5239; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
5240; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
5241; SI-NEXT:    s_waitcnt expcnt(0)
5242; SI-NEXT:    v_mov_b32_e32 v7, v3
5243; SI-NEXT:    v_mov_b32_e32 v6, v2
5244; SI-NEXT:    v_mov_b32_e32 v5, v1
5245; SI-NEXT:    v_mov_b32_e32 v4, v0
5246; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
5247; SI-NEXT:    s_waitcnt vmcnt(0)
5248; SI-NEXT:    buffer_wbinvl1
5249; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
5250; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5251; SI-NEXT:    v_mov_b32_e32 v2, v4
5252; SI-NEXT:    v_mov_b32_e32 v3, v5
5253; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5254; SI-NEXT:    s_cbranch_execnz .LBB91_1
5255; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5256; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
5257; SI-NEXT:    s_mov_b32 s7, 0xf000
5258; SI-NEXT:    s_mov_b32 s6, -1
5259; SI-NEXT:    s_mov_b32 s4, s2
5260; SI-NEXT:    s_mov_b32 s5, s3
5261; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
5262; SI-NEXT:    s_endpgm
5263;
5264; VI-LABEL: atomic_max_i64_ret_addr64:
5265; VI:       ; %bb.0: ; %entry
5266; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
5267; VI-NEXT:    s_waitcnt lgkmcnt(0)
5268; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
5269; VI-NEXT:    s_add_u32 s6, s0, s6
5270; VI-NEXT:    s_addc_u32 s7, s1, s7
5271; VI-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
5272; VI-NEXT:    v_mov_b32_e32 v0, s6
5273; VI-NEXT:    s_mov_b64 s[0:1], 0
5274; VI-NEXT:    v_mov_b32_e32 v4, s5
5275; VI-NEXT:    v_mov_b32_e32 v5, s4
5276; VI-NEXT:    s_waitcnt lgkmcnt(0)
5277; VI-NEXT:    v_mov_b32_e32 v2, s8
5278; VI-NEXT:    v_mov_b32_e32 v3, s9
5279; VI-NEXT:    v_mov_b32_e32 v1, s7
5280; VI-NEXT:  .LBB91_1: ; %atomicrmw.start
5281; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5282; VI-NEXT:    v_mov_b32_e32 v9, v3
5283; VI-NEXT:    v_mov_b32_e32 v8, v2
5284; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
5285; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
5286; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
5287; VI-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
5288; VI-NEXT:    s_waitcnt vmcnt(0)
5289; VI-NEXT:    buffer_wbinvl1_vol
5290; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
5291; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
5292; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
5293; VI-NEXT:    s_cbranch_execnz .LBB91_1
5294; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5295; VI-NEXT:    s_or_b64 exec, exec, s[0:1]
5296; VI-NEXT:    v_mov_b32_e32 v0, s2
5297; VI-NEXT:    v_mov_b32_e32 v1, s3
5298; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
5299; VI-NEXT:    s_endpgm
5300;
5301; GFX9-LABEL: atomic_max_i64_ret_addr64:
5302; GFX9:       ; %bb.0: ; %entry
5303; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
5304; GFX9-NEXT:    s_mov_b64 s[2:3], 0
5305; GFX9-NEXT:    v_mov_b32_e32 v4, 0
5306; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5307; GFX9-NEXT:    s_lshl_b64 s[0:1], s[14:15], 3
5308; GFX9-NEXT:    s_add_u32 s0, s8, s0
5309; GFX9-NEXT:    s_addc_u32 s1, s9, s1
5310; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
5311; GFX9-NEXT:    v_mov_b32_e32 v2, s13
5312; GFX9-NEXT:    v_mov_b32_e32 v3, s12
5313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5314; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5315; GFX9-NEXT:    v_mov_b32_e32 v1, s5
5316; GFX9-NEXT:  .LBB91_1: ; %atomicrmw.start
5317; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5318; GFX9-NEXT:    v_mov_b32_e32 v8, v1
5319; GFX9-NEXT:    v_mov_b32_e32 v7, v0
5320; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8]
5321; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
5322; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5323; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
5324; GFX9-NEXT:    s_waitcnt vmcnt(0)
5325; GFX9-NEXT:    buffer_wbinvl1_vol
5326; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
5327; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
5328; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
5329; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
5330; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5331; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
5332; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5333; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[10:11]
5334; GFX9-NEXT:    s_endpgm
5335entry:
5336  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5337  %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst
5338  store i64 %tmp0, ptr addrspace(1) %out2
5339  ret void
5340}
5341
; Signed-max atomicrmw (seq_cst) on element 4 of %out, result unused, tagged
; with !amdgpu.no.remote.memory. The expected code is still a cmpswap retry
; loop on all three targets. The element index appears as a 32-byte offset,
; folded into the memory instructions on SI and GFX9 and added to the address
; with v_add_u32/v_addc_u32 on VI.
5342define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
5343; SI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
5344; SI:       ; %bb.0:
5345; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5346; SI-NEXT:    s_mov_b32 s6, 0
5347; SI-NEXT:    s_mov_b32 s7, 0xf000
5348; SI-NEXT:    s_mov_b32 s4, s6
5349; SI-NEXT:    s_mov_b32 s5, s6
5350; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
5351; SI-NEXT:    s_mov_b64 s[8:9], 0
5352; SI-NEXT:  .LBB92_1: ; %atomicrmw.start
5353; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5354; SI-NEXT:    s_waitcnt vmcnt(0)
5355; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
5356; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5357; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5358; SI-NEXT:    s_waitcnt expcnt(0)
5359; SI-NEXT:    v_mov_b32_e32 v11, v7
5360; SI-NEXT:    v_mov_b32_e32 v10, v6
5361; SI-NEXT:    v_mov_b32_e32 v9, v5
5362; SI-NEXT:    v_mov_b32_e32 v8, v4
5363; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
5364; SI-NEXT:    s_waitcnt vmcnt(0)
5365; SI-NEXT:    buffer_wbinvl1
5366; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
5367; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5368; SI-NEXT:    v_mov_b32_e32 v6, v8
5369; SI-NEXT:    v_mov_b32_e32 v7, v9
5370; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5371; SI-NEXT:    s_cbranch_execnz .LBB92_1
5372; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5373; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5374; SI-NEXT:    s_waitcnt expcnt(0)
5375; SI-NEXT:    s_setpc_b64 s[30:31]
5376;
5377; VI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
5378; VI:       ; %bb.0:
5379; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5380; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
5381; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5382; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5383; VI-NEXT:    s_mov_b64 s[4:5], 0
5384; VI-NEXT:  .LBB92_1: ; %atomicrmw.start
5385; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5386; VI-NEXT:    s_waitcnt vmcnt(0)
5387; VI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
5388; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5389; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5390; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5391; VI-NEXT:    s_waitcnt vmcnt(0)
5392; VI-NEXT:    buffer_wbinvl1_vol
5393; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5394; VI-NEXT:    v_mov_b32_e32 v7, v5
5395; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5396; VI-NEXT:    v_mov_b32_e32 v6, v4
5397; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5398; VI-NEXT:    s_cbranch_execnz .LBB92_1
5399; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5400; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5401; VI-NEXT:    s_setpc_b64 s[30:31]
5402;
5403; GFX9-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
5404; GFX9:       ; %bb.0:
5405; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5406; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
5407; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5408; GFX9-NEXT:  .LBB92_1: ; %atomicrmw.start
5409; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5410; GFX9-NEXT:    s_waitcnt vmcnt(0)
5411; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
5412; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5413; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5414; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
5415; GFX9-NEXT:    s_waitcnt vmcnt(0)
5416; GFX9-NEXT:    buffer_wbinvl1_vol
5417; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5418; GFX9-NEXT:    v_mov_b32_e32 v7, v5
5419; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5420; GFX9-NEXT:    v_mov_b32_e32 v6, v4
5421; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5422; GFX9-NEXT:    s_cbranch_execnz .LBB92_1
5423; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5424; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5425; GFX9-NEXT:    s_setpc_b64 s[30:31]
5426  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
5427  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
5428  ret void
5429}
5430
; Returning counterpart of the no-remote-memory signed-max test: seq_cst
; atomicrmw max on element 4 of %out (a 32-byte offset in the expected code),
; tagged with !amdgpu.no.remote.memory, with the atomicrmw result returned.
; The expected code is a cmpswap retry loop on all three targets; on GFX9 the
; checks end by copying the result from v[4:5] to v[0:1].
5431define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
5432; SI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
5433; SI:       ; %bb.0:
5434; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5435; SI-NEXT:    v_mov_b32_e32 v5, v3
5436; SI-NEXT:    v_mov_b32_e32 v4, v2
5437; SI-NEXT:    v_mov_b32_e32 v7, v1
5438; SI-NEXT:    v_mov_b32_e32 v6, v0
5439; SI-NEXT:    s_mov_b32 s6, 0
5440; SI-NEXT:    s_mov_b32 s7, 0xf000
5441; SI-NEXT:    s_mov_b32 s4, s6
5442; SI-NEXT:    s_mov_b32 s5, s6
5443; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
5444; SI-NEXT:    s_mov_b64 s[8:9], 0
5445; SI-NEXT:  .LBB93_1: ; %atomicrmw.start
5446; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5447; SI-NEXT:    s_waitcnt vmcnt(0)
5448; SI-NEXT:    v_mov_b32_e32 v11, v1
5449; SI-NEXT:    v_mov_b32_e32 v10, v0
5450; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5]
5451; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
5452; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
5453; SI-NEXT:    s_waitcnt expcnt(0)
5454; SI-NEXT:    v_mov_b32_e32 v0, v8
5455; SI-NEXT:    v_mov_b32_e32 v1, v9
5456; SI-NEXT:    v_mov_b32_e32 v2, v10
5457; SI-NEXT:    v_mov_b32_e32 v3, v11
5458; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
5459; SI-NEXT:    s_waitcnt vmcnt(0)
5460; SI-NEXT:    buffer_wbinvl1
5461; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
5462; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5463; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5464; SI-NEXT:    s_cbranch_execnz .LBB93_1
5465; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5466; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5467; SI-NEXT:    s_waitcnt expcnt(0)
5468; SI-NEXT:    s_setpc_b64 s[30:31]
5469;
5470; VI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
5471; VI:       ; %bb.0:
5472; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5473; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
5474; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
5475; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
5476; VI-NEXT:    s_mov_b64 s[4:5], 0
5477; VI-NEXT:  .LBB93_1: ; %atomicrmw.start
5478; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5479; VI-NEXT:    s_waitcnt vmcnt(0)
5480; VI-NEXT:    v_mov_b32_e32 v9, v1
5481; VI-NEXT:    v_mov_b32_e32 v8, v0
5482; VI-NEXT:    v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
5483; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
5484; VI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
5485; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
5486; VI-NEXT:    s_waitcnt vmcnt(0)
5487; VI-NEXT:    buffer_wbinvl1_vol
5488; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
5489; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5490; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5491; VI-NEXT:    s_cbranch_execnz .LBB93_1
5492; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5493; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5494; VI-NEXT:    s_setpc_b64 s[30:31]
5495;
5496; GFX9-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
5497; GFX9:       ; %bb.0:
5498; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5499; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
5500; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5501; GFX9-NEXT:  .LBB93_1: ; %atomicrmw.start
5502; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5503; GFX9-NEXT:    s_waitcnt vmcnt(0)
5504; GFX9-NEXT:    v_mov_b32_e32 v7, v5
5505; GFX9-NEXT:    v_mov_b32_e32 v6, v4
5506; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
5507; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5508; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5509; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
5510; GFX9-NEXT:    s_waitcnt vmcnt(0)
5511; GFX9-NEXT:    buffer_wbinvl1_vol
5512; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5513; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5514; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5515; GFX9-NEXT:    s_cbranch_execnz .LBB93_1
5516; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5517; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5518; GFX9-NEXT:    v_mov_b32_e32 v0, v4
5519; GFX9-NEXT:    v_mov_b32_e32 v1, v5
5520; GFX9-NEXT:    s_setpc_b64 s[30:31]
5521  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
5522  %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
5523  ret i64 %result
5524}
5525
5526; ---------------------------------------------------------------------
5527; atomicrmw umax
5528; ---------------------------------------------------------------------
5529
; Unsigned-max atomicrmw (seq_cst) directly on %ptr with the result unused.
; The expected code on all three targets is an initial 64-bit load followed by
; a cmpswap retry loop that picks the larger value using an unsigned 64-bit
; compare (v_cmp_gt_u64) and two v_cndmask selects.
5530define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
5531; SI-LABEL: global_atomic_umax_i64_noret:
5532; SI:       ; %bb.0:
5533; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5534; SI-NEXT:    s_mov_b32 s6, 0
5535; SI-NEXT:    s_mov_b32 s7, 0xf000
5536; SI-NEXT:    s_mov_b32 s4, s6
5537; SI-NEXT:    s_mov_b32 s5, s6
5538; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
5539; SI-NEXT:    s_mov_b64 s[8:9], 0
5540; SI-NEXT:  .LBB94_1: ; %atomicrmw.start
5541; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5542; SI-NEXT:    s_waitcnt vmcnt(0)
5543; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5544; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5545; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5546; SI-NEXT:    s_waitcnt expcnt(0)
5547; SI-NEXT:    v_mov_b32_e32 v11, v7
5548; SI-NEXT:    v_mov_b32_e32 v10, v6
5549; SI-NEXT:    v_mov_b32_e32 v9, v5
5550; SI-NEXT:    v_mov_b32_e32 v8, v4
5551; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
5552; SI-NEXT:    s_waitcnt vmcnt(0)
5553; SI-NEXT:    buffer_wbinvl1
5554; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
5555; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5556; SI-NEXT:    v_mov_b32_e32 v6, v8
5557; SI-NEXT:    v_mov_b32_e32 v7, v9
5558; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5559; SI-NEXT:    s_cbranch_execnz .LBB94_1
5560; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5561; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5562; SI-NEXT:    s_waitcnt expcnt(0)
5563; SI-NEXT:    s_setpc_b64 s[30:31]
5564;
5565; VI-LABEL: global_atomic_umax_i64_noret:
5566; VI:       ; %bb.0:
5567; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5568; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5569; VI-NEXT:    s_mov_b64 s[4:5], 0
5570; VI-NEXT:  .LBB94_1: ; %atomicrmw.start
5571; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5572; VI-NEXT:    s_waitcnt vmcnt(0)
5573; VI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5574; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5575; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5576; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5577; VI-NEXT:    s_waitcnt vmcnt(0)
5578; VI-NEXT:    buffer_wbinvl1_vol
5579; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5580; VI-NEXT:    v_mov_b32_e32 v7, v5
5581; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5582; VI-NEXT:    v_mov_b32_e32 v6, v4
5583; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5584; VI-NEXT:    s_cbranch_execnz .LBB94_1
5585; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5586; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5587; VI-NEXT:    s_setpc_b64 s[30:31]
5588;
5589; GFX9-LABEL: global_atomic_umax_i64_noret:
5590; GFX9:       ; %bb.0:
5591; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5592; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
5593; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5594; GFX9-NEXT:  .LBB94_1: ; %atomicrmw.start
5595; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5596; GFX9-NEXT:    s_waitcnt vmcnt(0)
5597; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5598; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5599; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5600; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
5601; GFX9-NEXT:    s_waitcnt vmcnt(0)
5602; GFX9-NEXT:    buffer_wbinvl1_vol
5603; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5604; GFX9-NEXT:    v_mov_b32_e32 v7, v5
5605; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5606; GFX9-NEXT:    v_mov_b32_e32 v6, v4
5607; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5608; GFX9-NEXT:    s_cbranch_execnz .LBB94_1
5609; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5610; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5611; GFX9-NEXT:    s_setpc_b64 s[30:31]
5612  %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst
5613  ret void
5614}
5615
; Unsigned-max atomicrmw (seq_cst) on element 4 of %out with the result unused.
; Same cmpswap retry loop shape as the no-offset umax test; the element index
; appears as a 32-byte offset, folded into the memory instructions on SI and
; GFX9 and added to the address with v_add_u32/v_addc_u32 on VI.
5616define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
5617; SI-LABEL: global_atomic_umax_i64_noret_offset:
5618; SI:       ; %bb.0:
5619; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5620; SI-NEXT:    s_mov_b32 s6, 0
5621; SI-NEXT:    s_mov_b32 s7, 0xf000
5622; SI-NEXT:    s_mov_b32 s4, s6
5623; SI-NEXT:    s_mov_b32 s5, s6
5624; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
5625; SI-NEXT:    s_mov_b64 s[8:9], 0
5626; SI-NEXT:  .LBB95_1: ; %atomicrmw.start
5627; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5628; SI-NEXT:    s_waitcnt vmcnt(0)
5629; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5630; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5631; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5632; SI-NEXT:    s_waitcnt expcnt(0)
5633; SI-NEXT:    v_mov_b32_e32 v11, v7
5634; SI-NEXT:    v_mov_b32_e32 v10, v6
5635; SI-NEXT:    v_mov_b32_e32 v9, v5
5636; SI-NEXT:    v_mov_b32_e32 v8, v4
5637; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
5638; SI-NEXT:    s_waitcnt vmcnt(0)
5639; SI-NEXT:    buffer_wbinvl1
5640; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
5641; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5642; SI-NEXT:    v_mov_b32_e32 v6, v8
5643; SI-NEXT:    v_mov_b32_e32 v7, v9
5644; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5645; SI-NEXT:    s_cbranch_execnz .LBB95_1
5646; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5647; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5648; SI-NEXT:    s_waitcnt expcnt(0)
5649; SI-NEXT:    s_setpc_b64 s[30:31]
5650;
5651; VI-LABEL: global_atomic_umax_i64_noret_offset:
5652; VI:       ; %bb.0:
5653; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5654; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
5655; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5656; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
5657; VI-NEXT:    s_mov_b64 s[4:5], 0
5658; VI-NEXT:  .LBB95_1: ; %atomicrmw.start
5659; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5660; VI-NEXT:    s_waitcnt vmcnt(0)
5661; VI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5662; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5663; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5664; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5665; VI-NEXT:    s_waitcnt vmcnt(0)
5666; VI-NEXT:    buffer_wbinvl1_vol
5667; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5668; VI-NEXT:    v_mov_b32_e32 v7, v5
5669; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5670; VI-NEXT:    v_mov_b32_e32 v6, v4
5671; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5672; VI-NEXT:    s_cbranch_execnz .LBB95_1
5673; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5674; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5675; VI-NEXT:    s_setpc_b64 s[30:31]
5676;
5677; GFX9-LABEL: global_atomic_umax_i64_noret_offset:
5678; GFX9:       ; %bb.0:
5679; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5680; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
5681; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5682; GFX9-NEXT:  .LBB95_1: ; %atomicrmw.start
5683; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5684; GFX9-NEXT:    s_waitcnt vmcnt(0)
5685; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5686; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5687; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5688; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
5689; GFX9-NEXT:    s_waitcnt vmcnt(0)
5690; GFX9-NEXT:    buffer_wbinvl1_vol
5691; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5692; GFX9-NEXT:    v_mov_b32_e32 v7, v5
5693; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5694; GFX9-NEXT:    v_mov_b32_e32 v6, v4
5695; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5696; GFX9-NEXT:    s_cbranch_execnz .LBB95_1
5697; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5698; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5699; GFX9-NEXT:    s_setpc_b64 s[30:31]
5700  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
5701  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
5702  ret void
5703}
5704
; Unsigned-max atomicrmw (seq_cst) directly on %ptr, returning the atomicrmw
; result. The expected code is a cmpswap retry loop on all three targets. In
; the SI checks the cmpswap result lands in v[0:1]; in the VI and GFX9 checks
; it lands in v[4:5] and is copied to v[0:1] after the loop.
5705define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
5706; SI-LABEL: global_atomic_umax_i64_ret:
5707; SI:       ; %bb.0:
5708; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5709; SI-NEXT:    v_mov_b32_e32 v5, v3
5710; SI-NEXT:    v_mov_b32_e32 v4, v2
5711; SI-NEXT:    v_mov_b32_e32 v7, v1
5712; SI-NEXT:    v_mov_b32_e32 v6, v0
5713; SI-NEXT:    s_mov_b32 s6, 0
5714; SI-NEXT:    s_mov_b32 s7, 0xf000
5715; SI-NEXT:    s_mov_b32 s4, s6
5716; SI-NEXT:    s_mov_b32 s5, s6
5717; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
5718; SI-NEXT:    s_mov_b64 s[8:9], 0
5719; SI-NEXT:  .LBB96_1: ; %atomicrmw.start
5720; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5721; SI-NEXT:    s_waitcnt vmcnt(0)
5722; SI-NEXT:    v_mov_b32_e32 v11, v1
5723; SI-NEXT:    v_mov_b32_e32 v10, v0
5724; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5]
5725; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
5726; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
5727; SI-NEXT:    s_waitcnt expcnt(0)
5728; SI-NEXT:    v_mov_b32_e32 v0, v8
5729; SI-NEXT:    v_mov_b32_e32 v1, v9
5730; SI-NEXT:    v_mov_b32_e32 v2, v10
5731; SI-NEXT:    v_mov_b32_e32 v3, v11
5732; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
5733; SI-NEXT:    s_waitcnt vmcnt(0)
5734; SI-NEXT:    buffer_wbinvl1
5735; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
5736; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5737; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5738; SI-NEXT:    s_cbranch_execnz .LBB96_1
5739; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5740; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5741; SI-NEXT:    s_waitcnt expcnt(0)
5742; SI-NEXT:    s_setpc_b64 s[30:31]
5743;
5744; VI-LABEL: global_atomic_umax_i64_ret:
5745; VI:       ; %bb.0:
5746; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5747; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
5748; VI-NEXT:    s_mov_b64 s[4:5], 0
5749; VI-NEXT:  .LBB96_1: ; %atomicrmw.start
5750; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5751; VI-NEXT:    s_waitcnt vmcnt(0)
5752; VI-NEXT:    v_mov_b32_e32 v7, v5
5753; VI-NEXT:    v_mov_b32_e32 v6, v4
5754; VI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5755; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5756; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5757; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5758; VI-NEXT:    s_waitcnt vmcnt(0)
5759; VI-NEXT:    buffer_wbinvl1_vol
5760; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5761; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5762; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5763; VI-NEXT:    s_cbranch_execnz .LBB96_1
5764; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5765; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5766; VI-NEXT:    v_mov_b32_e32 v0, v4
5767; VI-NEXT:    v_mov_b32_e32 v1, v5
5768; VI-NEXT:    s_setpc_b64 s[30:31]
5769;
5770; GFX9-LABEL: global_atomic_umax_i64_ret:
5771; GFX9:       ; %bb.0:
5772; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5773; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
5774; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5775; GFX9-NEXT:  .LBB96_1: ; %atomicrmw.start
5776; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5777; GFX9-NEXT:    s_waitcnt vmcnt(0)
5778; GFX9-NEXT:    v_mov_b32_e32 v7, v5
5779; GFX9-NEXT:    v_mov_b32_e32 v6, v4
5780; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5781; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5782; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5783; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
5784; GFX9-NEXT:    s_waitcnt vmcnt(0)
5785; GFX9-NEXT:    buffer_wbinvl1_vol
5786; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5787; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5788; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5789; GFX9-NEXT:    s_cbranch_execnz .LBB96_1
5790; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5791; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5792; GFX9-NEXT:    v_mov_b32_e32 v0, v4
5793; GFX9-NEXT:    v_mov_b32_e32 v1, v5
5794; GFX9-NEXT:    s_setpc_b64 s[30:31]
5795  %result = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst
5796  ret i64 %result
5797}
5798
; Unsigned-max atomicrmw (seq_cst) on element 4 of %out, returning the
; atomicrmw result. The expected code is a cmpswap retry loop on all three
; targets, with the element index appearing as a 32-byte offset (folded into
; the memory instructions on SI and GFX9, added to the address on VI). On GFX9
; the checks end by copying the result from v[4:5] to v[0:1].
5799define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
5800; SI-LABEL: global_atomic_umax_i64_ret_offset:
5801; SI:       ; %bb.0:
5802; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5803; SI-NEXT:    v_mov_b32_e32 v5, v3
5804; SI-NEXT:    v_mov_b32_e32 v4, v2
5805; SI-NEXT:    v_mov_b32_e32 v7, v1
5806; SI-NEXT:    v_mov_b32_e32 v6, v0
5807; SI-NEXT:    s_mov_b32 s6, 0
5808; SI-NEXT:    s_mov_b32 s7, 0xf000
5809; SI-NEXT:    s_mov_b32 s4, s6
5810; SI-NEXT:    s_mov_b32 s5, s6
5811; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
5812; SI-NEXT:    s_mov_b64 s[8:9], 0
5813; SI-NEXT:  .LBB97_1: ; %atomicrmw.start
5814; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5815; SI-NEXT:    s_waitcnt vmcnt(0)
5816; SI-NEXT:    v_mov_b32_e32 v11, v1
5817; SI-NEXT:    v_mov_b32_e32 v10, v0
5818; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5]
5819; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
5820; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
5821; SI-NEXT:    s_waitcnt expcnt(0)
5822; SI-NEXT:    v_mov_b32_e32 v0, v8
5823; SI-NEXT:    v_mov_b32_e32 v1, v9
5824; SI-NEXT:    v_mov_b32_e32 v2, v10
5825; SI-NEXT:    v_mov_b32_e32 v3, v11
5826; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
5827; SI-NEXT:    s_waitcnt vmcnt(0)
5828; SI-NEXT:    buffer_wbinvl1
5829; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
5830; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
5831; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
5832; SI-NEXT:    s_cbranch_execnz .LBB97_1
5833; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5834; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
5835; SI-NEXT:    s_waitcnt expcnt(0)
5836; SI-NEXT:    s_setpc_b64 s[30:31]
5837;
5838; VI-LABEL: global_atomic_umax_i64_ret_offset:
5839; VI:       ; %bb.0:
5840; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5841; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
5842; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
5843; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
5844; VI-NEXT:    s_mov_b64 s[4:5], 0
5845; VI-NEXT:  .LBB97_1: ; %atomicrmw.start
5846; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5847; VI-NEXT:    s_waitcnt vmcnt(0)
5848; VI-NEXT:    v_mov_b32_e32 v9, v1
5849; VI-NEXT:    v_mov_b32_e32 v8, v0
5850; VI-NEXT:    v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
5851; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
5852; VI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
5853; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
5854; VI-NEXT:    s_waitcnt vmcnt(0)
5855; VI-NEXT:    buffer_wbinvl1_vol
5856; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
5857; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5858; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5859; VI-NEXT:    s_cbranch_execnz .LBB97_1
5860; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5861; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
5862; VI-NEXT:    s_setpc_b64 s[30:31]
5863;
5864; GFX9-LABEL: global_atomic_umax_i64_ret_offset:
5865; GFX9:       ; %bb.0:
5866; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5867; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
5868; GFX9-NEXT:    s_mov_b64 s[4:5], 0
5869; GFX9-NEXT:  .LBB97_1: ; %atomicrmw.start
5870; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5871; GFX9-NEXT:    s_waitcnt vmcnt(0)
5872; GFX9-NEXT:    v_mov_b32_e32 v7, v5
5873; GFX9-NEXT:    v_mov_b32_e32 v6, v4
5874; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5875; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
5876; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
5877; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
5878; GFX9-NEXT:    s_waitcnt vmcnt(0)
5879; GFX9-NEXT:    buffer_wbinvl1_vol
5880; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5881; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
5882; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
5883; GFX9-NEXT:    s_cbranch_execnz .LBB97_1
5884; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5885; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
5886; GFX9-NEXT:    v_mov_b32_e32 v0, v4
5887; GFX9-NEXT:    v_mov_b32_e32 v1, v5
5888; GFX9-NEXT:    s_setpc_b64 s[30:31]
5889  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
5890  %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
5891  ret i64 %result
5892}
5893
; Test: seq_cst `atomicrmw umax` on an i64 in global memory (addrspace(1)),
; with both the pointer and the operand passed `inreg` under the amdgpu_gfx
; calling convention. The old value is unused (the function returns void).
;
; For all three check prefixes (SI, VI, GFX9) the expected code is a retry
; loop: load the current value, compare it with %in (v_cmp_lt_u64), select
; the larger one (v_cndmask), attempt a 64-bit cmpswap, and branch back to
; the loop header until the value returned by the cmpswap equals the value
; that was compared against.
;
; In the SI sequence s6/s7 are overwritten to form the s[4:7] buffer
; resource, so the checks show them being saved into lanes of v10 first and
; restored with v_readlane before returning.
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
5894define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
5895; SI-LABEL: global_atomic_umax_i64_noret_scalar:
5896; SI:       ; %bb.0:
5897; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5898; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
5899; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
5900; SI-NEXT:    s_mov_b64 exec, s[34:35]
5901; SI-NEXT:    s_waitcnt expcnt(0)
5902; SI-NEXT:    v_writelane_b32 v10, s6, 0
5903; SI-NEXT:    v_writelane_b32 v10, s7, 1
5904; SI-NEXT:    s_mov_b32 s35, s7
5905; SI-NEXT:    s_mov_b32 s34, s6
5906; SI-NEXT:    s_mov_b32 s7, 0xf000
5907; SI-NEXT:    s_mov_b32 s6, -1
5908; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
5909; SI-NEXT:    s_mov_b64 s[36:37], 0
5910; SI-NEXT:    v_mov_b32_e32 v4, s35
5911; SI-NEXT:    v_mov_b32_e32 v5, s34
5912; SI-NEXT:  .LBB98_1: ; %atomicrmw.start
5913; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
5914; SI-NEXT:    s_waitcnt vmcnt(0)
5915; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
5916; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
5917; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
5918; SI-NEXT:    s_waitcnt expcnt(0)
5919; SI-NEXT:    v_mov_b32_e32 v9, v3
5920; SI-NEXT:    v_mov_b32_e32 v8, v2
5921; SI-NEXT:    v_mov_b32_e32 v7, v1
5922; SI-NEXT:    v_mov_b32_e32 v6, v0
5923; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
5924; SI-NEXT:    s_waitcnt vmcnt(0)
5925; SI-NEXT:    buffer_wbinvl1
5926; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5927; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
5928; SI-NEXT:    v_mov_b32_e32 v2, v6
5929; SI-NEXT:    v_mov_b32_e32 v3, v7
5930; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
5931; SI-NEXT:    s_cbranch_execnz .LBB98_1
5932; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
5933; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
5934; SI-NEXT:    v_readlane_b32 s7, v10, 1
5935; SI-NEXT:    v_readlane_b32 s6, v10, 0
5936; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
5937; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
5938; SI-NEXT:    s_mov_b64 exec, s[34:35]
5939; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5940; SI-NEXT:    s_setpc_b64 s[30:31]
5941;
5942; VI-LABEL: global_atomic_umax_i64_noret_scalar:
5943; VI:       ; %bb.0:
5944; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5945; VI-NEXT:    v_mov_b32_e32 v0, s4
5946; VI-NEXT:    v_mov_b32_e32 v1, s5
5947; VI-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
5948; VI-NEXT:    v_mov_b32_e32 v4, s4
5949; VI-NEXT:    s_mov_b64 s[34:35], 0
5950; VI-NEXT:    v_mov_b32_e32 v6, s7
5951; VI-NEXT:    v_mov_b32_e32 v7, s6
5952; VI-NEXT:    v_mov_b32_e32 v5, s5
5953; VI-NEXT:  .LBB98_1: ; %atomicrmw.start
5954; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
5955; VI-NEXT:    s_waitcnt vmcnt(0)
5956; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
5957; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
5958; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
5959; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5960; VI-NEXT:    s_waitcnt vmcnt(0)
5961; VI-NEXT:    buffer_wbinvl1_vol
5962; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5963; VI-NEXT:    v_mov_b32_e32 v3, v1
5964; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5965; VI-NEXT:    v_mov_b32_e32 v2, v0
5966; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5967; VI-NEXT:    s_cbranch_execnz .LBB98_1
5968; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
5969; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
5970; VI-NEXT:    s_setpc_b64 s[30:31]
5971;
5972; GFX9-LABEL: global_atomic_umax_i64_noret_scalar:
5973; GFX9:       ; %bb.0:
5974; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5975; GFX9-NEXT:    v_mov_b32_e32 v4, 0
5976; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
5977; GFX9-NEXT:    s_mov_b64 s[34:35], 0
5978; GFX9-NEXT:    v_mov_b32_e32 v5, s7
5979; GFX9-NEXT:    v_mov_b32_e32 v6, s6
5980; GFX9-NEXT:  .LBB98_1: ; %atomicrmw.start
5981; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
5982; GFX9-NEXT:    s_waitcnt vmcnt(0)
5983; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
5984; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
5985; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
5986; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
5987; GFX9-NEXT:    s_waitcnt vmcnt(0)
5988; GFX9-NEXT:    buffer_wbinvl1_vol
5989; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5990; GFX9-NEXT:    v_mov_b32_e32 v3, v1
5991; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
5992; GFX9-NEXT:    v_mov_b32_e32 v2, v0
5993; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
5994; GFX9-NEXT:    s_cbranch_execnz .LBB98_1
5995; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
5996; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
5997; GFX9-NEXT:    s_setpc_b64 s[30:31]
; The atomic operates directly on %ptr (no offset); its result %tmp0 is unused.
5998  %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst
5999  ret void
6000}
6001
; Test: same as the no-offset `noret_scalar` case (seq_cst `atomicrmw umax`
; on an i64 in addrspace(1), `inreg` pointer and operand, amdgpu_gfx calling
; convention, result unused), but the address is %out advanced by four i64
; elements, i.e. 32 bytes.
;
; The expected code is again a load / compare / select / 64-bit cmpswap retry
; loop. The 32-byte displacement appears as `offset:32` on the buffer (SI)
; and global (GFX9) instructions, while the VI sequence adds 32 to the
; pointer with s_add_u32/s_addc_u32 before issuing the flat instructions.
;
; As in the no-offset case, the SI sequence saves s6/s7 into lanes of v10
; before overwriting them for the s[4:7] buffer resource, and restores them
; before returning.
;
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6002define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
6003; SI-LABEL: global_atomic_umax_i64_noret_offset_scalar:
6004; SI:       ; %bb.0:
6005; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6006; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6007; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
6008; SI-NEXT:    s_mov_b64 exec, s[34:35]
6009; SI-NEXT:    s_waitcnt expcnt(0)
6010; SI-NEXT:    v_writelane_b32 v10, s6, 0
6011; SI-NEXT:    v_writelane_b32 v10, s7, 1
6012; SI-NEXT:    s_mov_b32 s35, s7
6013; SI-NEXT:    s_mov_b32 s34, s6
6014; SI-NEXT:    s_mov_b32 s7, 0xf000
6015; SI-NEXT:    s_mov_b32 s6, -1
6016; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
6017; SI-NEXT:    s_mov_b64 s[36:37], 0
6018; SI-NEXT:    v_mov_b32_e32 v4, s35
6019; SI-NEXT:    v_mov_b32_e32 v5, s34
6020; SI-NEXT:  .LBB99_1: ; %atomicrmw.start
6021; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6022; SI-NEXT:    s_waitcnt vmcnt(0)
6023; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
6024; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
6025; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
6026; SI-NEXT:    s_waitcnt expcnt(0)
6027; SI-NEXT:    v_mov_b32_e32 v9, v3
6028; SI-NEXT:    v_mov_b32_e32 v8, v2
6029; SI-NEXT:    v_mov_b32_e32 v7, v1
6030; SI-NEXT:    v_mov_b32_e32 v6, v0
6031; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
6032; SI-NEXT:    s_waitcnt vmcnt(0)
6033; SI-NEXT:    buffer_wbinvl1
6034; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
6035; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
6036; SI-NEXT:    v_mov_b32_e32 v2, v6
6037; SI-NEXT:    v_mov_b32_e32 v3, v7
6038; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
6039; SI-NEXT:    s_cbranch_execnz .LBB99_1
6040; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6041; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
6042; SI-NEXT:    v_readlane_b32 s7, v10, 1
6043; SI-NEXT:    v_readlane_b32 s6, v10, 0
6044; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6045; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
6046; SI-NEXT:    s_mov_b64 exec, s[34:35]
6047; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6048; SI-NEXT:    s_setpc_b64 s[30:31]
6049;
6050; VI-LABEL: global_atomic_umax_i64_noret_offset_scalar:
6051; VI:       ; %bb.0:
6052; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6053; VI-NEXT:    s_add_u32 s34, s4, 32
6054; VI-NEXT:    s_addc_u32 s35, s5, 0
6055; VI-NEXT:    v_mov_b32_e32 v4, s34
6056; VI-NEXT:    v_mov_b32_e32 v5, s35
6057; VI-NEXT:    flat_load_dwordx2 v[2:3], v[4:5]
6058; VI-NEXT:    s_mov_b64 s[34:35], 0
6059; VI-NEXT:    v_mov_b32_e32 v6, s7
6060; VI-NEXT:    v_mov_b32_e32 v7, s6
6061; VI-NEXT:  .LBB99_1: ; %atomicrmw.start
6062; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6063; VI-NEXT:    s_waitcnt vmcnt(0)
6064; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
6065; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
6066; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
6067; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6068; VI-NEXT:    s_waitcnt vmcnt(0)
6069; VI-NEXT:    buffer_wbinvl1_vol
6070; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6071; VI-NEXT:    v_mov_b32_e32 v3, v1
6072; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6073; VI-NEXT:    v_mov_b32_e32 v2, v0
6074; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6075; VI-NEXT:    s_cbranch_execnz .LBB99_1
6076; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6077; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
6078; VI-NEXT:    s_setpc_b64 s[30:31]
6079;
6080; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar:
6081; GFX9:       ; %bb.0:
6082; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6083; GFX9-NEXT:    v_mov_b32_e32 v4, 0
6084; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
6085; GFX9-NEXT:    s_mov_b64 s[34:35], 0
6086; GFX9-NEXT:    v_mov_b32_e32 v5, s7
6087; GFX9-NEXT:    v_mov_b32_e32 v6, s6
6088; GFX9-NEXT:  .LBB99_1: ; %atomicrmw.start
6089; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6090; GFX9-NEXT:    s_waitcnt vmcnt(0)
6091; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
6092; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
6093; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
6094; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
6095; GFX9-NEXT:    s_waitcnt vmcnt(0)
6096; GFX9-NEXT:    buffer_wbinvl1_vol
6097; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6098; GFX9-NEXT:    v_mov_b32_e32 v3, v1
6099; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6100; GFX9-NEXT:    v_mov_b32_e32 v2, v0
6101; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6102; GFX9-NEXT:    s_cbranch_execnz .LBB99_1
6103; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6104; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
6105; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Element index 4 of an i64 array is the 32-byte offset seen in the checks.
6106  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
6107  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
6108  ret void
6109}
6110
; Test: seq_cst `atomicrmw umax` on an i64 in global memory (addrspace(1)),
; with `inreg` pointer and operand under the amdgpu_gfx calling convention,
; where the value read by the atomic is returned to the caller.
;
; The expected code for all three check prefixes (SI, VI, GFX9) is a load /
; compare / select / 64-bit cmpswap retry loop. Unlike the void-returning
; variants, each iteration first copies the previously observed value out of
; v[0:1] into a separate register pair, so that the cmpswap can write its
; result back into v[0:1]; that is the pair holding the old value when the
; function returns.
;
; The SI sequence saves s6/s7 into lanes of v10 before overwriting them for
; the s[4:7] buffer resource, and restores them before returning.
;
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6111define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
6112; SI-LABEL: global_atomic_umax_i64_ret_scalar:
6113; SI:       ; %bb.0:
6114; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6115; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6116; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
6117; SI-NEXT:    s_mov_b64 exec, s[34:35]
6118; SI-NEXT:    s_waitcnt expcnt(0)
6119; SI-NEXT:    v_writelane_b32 v10, s6, 0
6120; SI-NEXT:    v_writelane_b32 v10, s7, 1
6121; SI-NEXT:    s_mov_b32 s35, s7
6122; SI-NEXT:    s_mov_b32 s34, s6
6123; SI-NEXT:    s_mov_b32 s7, 0xf000
6124; SI-NEXT:    s_mov_b32 s6, -1
6125; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
6126; SI-NEXT:    s_mov_b64 s[36:37], 0
6127; SI-NEXT:    v_mov_b32_e32 v4, s35
6128; SI-NEXT:    v_mov_b32_e32 v5, s34
6129; SI-NEXT:  .LBB100_1: ; %atomicrmw.start
6130; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6131; SI-NEXT:    s_waitcnt vmcnt(0)
6132; SI-NEXT:    v_mov_b32_e32 v9, v1
6133; SI-NEXT:    v_mov_b32_e32 v8, v0
6134; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
6135; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
6136; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
6137; SI-NEXT:    s_waitcnt expcnt(0)
6138; SI-NEXT:    v_mov_b32_e32 v0, v6
6139; SI-NEXT:    v_mov_b32_e32 v1, v7
6140; SI-NEXT:    v_mov_b32_e32 v2, v8
6141; SI-NEXT:    v_mov_b32_e32 v3, v9
6142; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
6143; SI-NEXT:    s_waitcnt vmcnt(0)
6144; SI-NEXT:    buffer_wbinvl1
6145; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6146; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
6147; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
6148; SI-NEXT:    s_cbranch_execnz .LBB100_1
6149; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6150; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
6151; SI-NEXT:    v_readlane_b32 s7, v10, 1
6152; SI-NEXT:    v_readlane_b32 s6, v10, 0
6153; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6154; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
6155; SI-NEXT:    s_mov_b64 exec, s[34:35]
6156; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6157; SI-NEXT:    s_setpc_b64 s[30:31]
6158;
6159; VI-LABEL: global_atomic_umax_i64_ret_scalar:
6160; VI:       ; %bb.0:
6161; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6162; VI-NEXT:    v_mov_b32_e32 v0, s4
6163; VI-NEXT:    v_mov_b32_e32 v1, s5
6164; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
6165; VI-NEXT:    v_mov_b32_e32 v2, s4
6166; VI-NEXT:    s_mov_b64 s[34:35], 0
6167; VI-NEXT:    v_mov_b32_e32 v4, s7
6168; VI-NEXT:    v_mov_b32_e32 v5, s6
6169; VI-NEXT:    v_mov_b32_e32 v3, s5
6170; VI-NEXT:  .LBB100_1: ; %atomicrmw.start
6171; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6172; VI-NEXT:    s_waitcnt vmcnt(0)
6173; VI-NEXT:    v_mov_b32_e32 v9, v1
6174; VI-NEXT:    v_mov_b32_e32 v8, v0
6175; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
6176; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
6177; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
6178; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
6179; VI-NEXT:    s_waitcnt vmcnt(0)
6180; VI-NEXT:    buffer_wbinvl1_vol
6181; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6182; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6183; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6184; VI-NEXT:    s_cbranch_execnz .LBB100_1
6185; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6186; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
6187; VI-NEXT:    s_setpc_b64 s[30:31]
6188;
6189; GFX9-LABEL: global_atomic_umax_i64_ret_scalar:
6190; GFX9:       ; %bb.0:
6191; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6192; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6193; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
6194; GFX9-NEXT:    s_mov_b64 s[34:35], 0
6195; GFX9-NEXT:    v_mov_b32_e32 v3, s7
6196; GFX9-NEXT:    v_mov_b32_e32 v4, s6
6197; GFX9-NEXT:  .LBB100_1: ; %atomicrmw.start
6198; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6199; GFX9-NEXT:    s_waitcnt vmcnt(0)
6200; GFX9-NEXT:    v_mov_b32_e32 v8, v1
6201; GFX9-NEXT:    v_mov_b32_e32 v7, v0
6202; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8]
6203; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v8, vcc
6204; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v7, vcc
6205; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
6206; GFX9-NEXT:    s_waitcnt vmcnt(0)
6207; GFX9-NEXT:    buffer_wbinvl1_vol
6208; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
6209; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6210; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6211; GFX9-NEXT:    s_cbranch_execnz .LBB100_1
6212; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6213; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
6214; GFX9-NEXT:    s_setpc_b64 s[30:31]
; The atomic operates directly on %ptr; the value it yields is returned.
6215  %result = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst
6216  ret i64 %result
6217}
6218
; Test: seq_cst `atomicrmw umax` on an i64 in global memory (addrspace(1)),
; with `inreg` pointer and operand under the amdgpu_gfx calling convention,
; at an address 32 bytes (four i64 elements) past %out, with the value read
; by the atomic returned to the caller.
;
; The expected code is a load / compare / select / 64-bit cmpswap retry loop
; for all three check prefixes (SI, VI, GFX9). The displacement appears as
; `offset:32` on the buffer (SI) and global (GFX9) instructions, while the
; VI sequence adds 32 to the pointer with s_add_u32/s_addc_u32 up front.
; The cmpswap writes its result into v[0:1], which holds the old value when
; the function returns.
;
; The SI sequence saves s6/s7 into lanes of v10 before overwriting them for
; the s[4:7] buffer resource, and restores them before returning.
;
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6219define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
6220; SI-LABEL: global_atomic_umax_i64_ret_offset_scalar:
6221; SI:       ; %bb.0:
6222; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6223; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6224; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
6225; SI-NEXT:    s_mov_b64 exec, s[34:35]
6226; SI-NEXT:    s_waitcnt expcnt(0)
6227; SI-NEXT:    v_writelane_b32 v10, s6, 0
6228; SI-NEXT:    v_writelane_b32 v10, s7, 1
6229; SI-NEXT:    s_mov_b32 s35, s7
6230; SI-NEXT:    s_mov_b32 s34, s6
6231; SI-NEXT:    s_mov_b32 s7, 0xf000
6232; SI-NEXT:    s_mov_b32 s6, -1
6233; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
6234; SI-NEXT:    s_mov_b64 s[36:37], 0
6235; SI-NEXT:    v_mov_b32_e32 v4, s35
6236; SI-NEXT:    v_mov_b32_e32 v5, s34
6237; SI-NEXT:  .LBB101_1: ; %atomicrmw.start
6238; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6239; SI-NEXT:    s_waitcnt vmcnt(0)
6240; SI-NEXT:    v_mov_b32_e32 v9, v1
6241; SI-NEXT:    v_mov_b32_e32 v8, v0
6242; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
6243; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
6244; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
6245; SI-NEXT:    s_waitcnt expcnt(0)
6246; SI-NEXT:    v_mov_b32_e32 v0, v6
6247; SI-NEXT:    v_mov_b32_e32 v1, v7
6248; SI-NEXT:    v_mov_b32_e32 v2, v8
6249; SI-NEXT:    v_mov_b32_e32 v3, v9
6250; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
6251; SI-NEXT:    s_waitcnt vmcnt(0)
6252; SI-NEXT:    buffer_wbinvl1
6253; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6254; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
6255; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
6256; SI-NEXT:    s_cbranch_execnz .LBB101_1
6257; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6258; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
6259; SI-NEXT:    v_readlane_b32 s7, v10, 1
6260; SI-NEXT:    v_readlane_b32 s6, v10, 0
6261; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
6262; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
6263; SI-NEXT:    s_mov_b64 exec, s[34:35]
6264; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6265; SI-NEXT:    s_setpc_b64 s[30:31]
6266;
6267; VI-LABEL: global_atomic_umax_i64_ret_offset_scalar:
6268; VI:       ; %bb.0:
6269; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6270; VI-NEXT:    s_add_u32 s34, s4, 32
6271; VI-NEXT:    s_addc_u32 s35, s5, 0
6272; VI-NEXT:    v_mov_b32_e32 v2, s34
6273; VI-NEXT:    v_mov_b32_e32 v3, s35
6274; VI-NEXT:    flat_load_dwordx2 v[0:1], v[2:3]
6275; VI-NEXT:    s_mov_b64 s[34:35], 0
6276; VI-NEXT:    v_mov_b32_e32 v4, s7
6277; VI-NEXT:    v_mov_b32_e32 v5, s6
6278; VI-NEXT:  .LBB101_1: ; %atomicrmw.start
6279; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6280; VI-NEXT:    s_waitcnt vmcnt(0)
6281; VI-NEXT:    v_mov_b32_e32 v9, v1
6282; VI-NEXT:    v_mov_b32_e32 v8, v0
6283; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
6284; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
6285; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
6286; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
6287; VI-NEXT:    s_waitcnt vmcnt(0)
6288; VI-NEXT:    buffer_wbinvl1_vol
6289; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6290; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6291; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6292; VI-NEXT:    s_cbranch_execnz .LBB101_1
6293; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6294; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
6295; VI-NEXT:    s_setpc_b64 s[30:31]
6296;
6297; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar:
6298; GFX9:       ; %bb.0:
6299; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6300; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6301; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
6302; GFX9-NEXT:    s_mov_b64 s[34:35], 0
6303; GFX9-NEXT:    v_mov_b32_e32 v3, s7
6304; GFX9-NEXT:    v_mov_b32_e32 v4, s6
6305; GFX9-NEXT:  .LBB101_1: ; %atomicrmw.start
6306; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6307; GFX9-NEXT:    s_waitcnt vmcnt(0)
6308; GFX9-NEXT:    v_mov_b32_e32 v8, v1
6309; GFX9-NEXT:    v_mov_b32_e32 v7, v0
6310; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8]
6311; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v8, vcc
6312; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v7, vcc
6313; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
6314; GFX9-NEXT:    s_waitcnt vmcnt(0)
6315; GFX9-NEXT:    buffer_wbinvl1_vol
6316; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
6317; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
6318; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
6319; GFX9-NEXT:    s_cbranch_execnz .LBB101_1
6320; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6321; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
6322; GFX9-NEXT:    s_setpc_b64 s[30:31]
; Element index 4 of an i64 array is the 32-byte offset seen in the checks.
6323  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
6324  %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
6325  ret i64 %result
6326}
6327
; Test: seq_cst `atomicrmw umax` on an i64 in global memory (addrspace(1))
; inside an amdgpu_kernel, at the address %out + %index * 8 + 32; the value
; read by the atomic is unused.
;
; In the expected code the kernel arguments are fetched with scalar loads
; relative to s[4:5], %index is scaled with s_lshl_b64 by 3 and added to
; %out with scalar adds, and the initial value is read with a scalar
; s_load_dwordx2. The scalar load's immediate is 0x8 for the SI prefix and
; 0x20 for the VI and GFX9 prefixes. The rest is the compare / select /
; 64-bit cmpswap retry loop, and the kernel ends with s_endpgm right after
; the loop.
;
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6328define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
6329; SI-LABEL: atomic_umax_i64_addr64_offset:
6330; SI:       ; %bb.0: ; %entry
6331; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
6332; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6333; SI-NEXT:    s_waitcnt lgkmcnt(0)
6334; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
6335; SI-NEXT:    s_add_u32 s4, s0, s4
6336; SI-NEXT:    s_addc_u32 s5, s1, s5
6337; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
6338; SI-NEXT:    s_mov_b64 s[0:1], 0
6339; SI-NEXT:    s_mov_b32 s7, 0xf000
6340; SI-NEXT:    v_mov_b32_e32 v4, s3
6341; SI-NEXT:    v_mov_b32_e32 v5, s2
6342; SI-NEXT:    s_waitcnt lgkmcnt(0)
6343; SI-NEXT:    v_mov_b32_e32 v2, s8
6344; SI-NEXT:    v_mov_b32_e32 v3, s9
6345; SI-NEXT:    s_mov_b32 s6, -1
6346; SI-NEXT:  .LBB102_1: ; %atomicrmw.start
6347; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6348; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
6349; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
6350; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
6351; SI-NEXT:    s_waitcnt expcnt(0)
6352; SI-NEXT:    v_mov_b32_e32 v9, v3
6353; SI-NEXT:    v_mov_b32_e32 v8, v2
6354; SI-NEXT:    v_mov_b32_e32 v7, v1
6355; SI-NEXT:    v_mov_b32_e32 v6, v0
6356; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
6357; SI-NEXT:    s_waitcnt vmcnt(0)
6358; SI-NEXT:    buffer_wbinvl1
6359; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
6360; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6361; SI-NEXT:    v_mov_b32_e32 v2, v6
6362; SI-NEXT:    v_mov_b32_e32 v3, v7
6363; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6364; SI-NEXT:    s_cbranch_execnz .LBB102_1
6365; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6366; SI-NEXT:    s_endpgm
6367;
6368; VI-LABEL: atomic_umax_i64_addr64_offset:
6369; VI:       ; %bb.0: ; %entry
6370; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6371; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6372; VI-NEXT:    s_mov_b64 s[4:5], 0
6373; VI-NEXT:    s_waitcnt lgkmcnt(0)
6374; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
6375; VI-NEXT:    s_add_u32 s0, s0, s6
6376; VI-NEXT:    s_addc_u32 s1, s1, s7
6377; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
6378; VI-NEXT:    s_add_u32 s0, s0, 32
6379; VI-NEXT:    s_addc_u32 s1, s1, 0
6380; VI-NEXT:    v_mov_b32_e32 v5, s1
6381; VI-NEXT:    v_mov_b32_e32 v6, s3
6382; VI-NEXT:    s_waitcnt lgkmcnt(0)
6383; VI-NEXT:    v_mov_b32_e32 v2, s6
6384; VI-NEXT:    v_mov_b32_e32 v7, s2
6385; VI-NEXT:    v_mov_b32_e32 v3, s7
6386; VI-NEXT:    v_mov_b32_e32 v4, s0
6387; VI-NEXT:  .LBB102_1: ; %atomicrmw.start
6388; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6389; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
6390; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
6391; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
6392; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6393; VI-NEXT:    s_waitcnt vmcnt(0)
6394; VI-NEXT:    buffer_wbinvl1_vol
6395; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6396; VI-NEXT:    v_mov_b32_e32 v3, v1
6397; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6398; VI-NEXT:    v_mov_b32_e32 v2, v0
6399; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6400; VI-NEXT:    s_cbranch_execnz .LBB102_1
6401; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6402; VI-NEXT:    s_endpgm
6403;
6404; GFX9-LABEL: atomic_umax_i64_addr64_offset:
6405; GFX9:       ; %bb.0: ; %entry
6406; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6407; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6408; GFX9-NEXT:    v_mov_b32_e32 v6, 0
6409; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6410; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
6411; GFX9-NEXT:    s_add_u32 s0, s0, s4
6412; GFX9-NEXT:    s_addc_u32 s1, s1, s5
6413; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
6414; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6415; GFX9-NEXT:    v_mov_b32_e32 v4, s3
6416; GFX9-NEXT:    v_mov_b32_e32 v5, s2
6417; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6418; GFX9-NEXT:    v_mov_b32_e32 v2, s6
6419; GFX9-NEXT:    v_mov_b32_e32 v3, s7
6420; GFX9-NEXT:  .LBB102_1: ; %atomicrmw.start
6421; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6422; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
6423; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
6424; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
6425; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc
6426; GFX9-NEXT:    s_waitcnt vmcnt(0)
6427; GFX9-NEXT:    buffer_wbinvl1_vol
6428; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6429; GFX9-NEXT:    v_mov_b32_e32 v3, v1
6430; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6431; GFX9-NEXT:    v_mov_b32_e32 v2, v0
6432; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6433; GFX9-NEXT:    s_cbranch_execnz .LBB102_1
6434; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6435; GFX9-NEXT:    s_endpgm
; The `entry` label name is echoed in the expected `%bb.0` comment lines above.
6436entry:
; Address = %out + %index * 8 (variable part) + 4 * 8 (constant 32 bytes).
6437  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
6438  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
6439  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
6440  ret void
6441}
6442
; Test: seq_cst `atomicrmw umax` on an i64 in global memory (addrspace(1))
; inside an amdgpu_kernel, at the address %out + %index * 8 + 32, where the
; value read by the atomic is then stored to %out2.
;
; In the expected code all four kernel arguments are fetched with a single
; s_load_dwordx8 relative to s[4:5], %index is scaled with s_lshl_b64 by 3
; and added to %out with scalar adds, and the initial value is read with a
; scalar s_load_dwordx2. The rest is the compare / select / 64-bit cmpswap
; retry loop. After the loop, exec is restored with s_or_b64 and the final
; cmpswap result is written out with a 64-bit store before s_endpgm.
;
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6443define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
6444; SI-LABEL: atomic_umax_i64_ret_addr64_offset:
6445; SI:       ; %bb.0: ; %entry
6446; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
6447; SI-NEXT:    s_waitcnt lgkmcnt(0)
6448; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
6449; SI-NEXT:    s_add_u32 s8, s0, s6
6450; SI-NEXT:    s_addc_u32 s9, s1, s7
6451; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
6452; SI-NEXT:    s_mov_b64 s[0:1], 0
6453; SI-NEXT:    s_mov_b32 s11, 0xf000
6454; SI-NEXT:    v_mov_b32_e32 v8, s5
6455; SI-NEXT:    v_mov_b32_e32 v9, s4
6456; SI-NEXT:    s_waitcnt lgkmcnt(0)
6457; SI-NEXT:    v_mov_b32_e32 v2, s6
6458; SI-NEXT:    v_mov_b32_e32 v3, s7
6459; SI-NEXT:    s_mov_b32 s10, -1
6460; SI-NEXT:  .LBB103_1: ; %atomicrmw.start
6461; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6462; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
6463; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
6464; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
6465; SI-NEXT:    s_waitcnt expcnt(0)
6466; SI-NEXT:    v_mov_b32_e32 v7, v3
6467; SI-NEXT:    v_mov_b32_e32 v6, v2
6468; SI-NEXT:    v_mov_b32_e32 v5, v1
6469; SI-NEXT:    v_mov_b32_e32 v4, v0
6470; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
6471; SI-NEXT:    s_waitcnt vmcnt(0)
6472; SI-NEXT:    buffer_wbinvl1
6473; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
6474; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6475; SI-NEXT:    v_mov_b32_e32 v2, v4
6476; SI-NEXT:    v_mov_b32_e32 v3, v5
6477; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6478; SI-NEXT:    s_cbranch_execnz .LBB103_1
6479; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6480; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
6481; SI-NEXT:    s_mov_b32 s7, 0xf000
6482; SI-NEXT:    s_mov_b32 s6, -1
6483; SI-NEXT:    s_mov_b32 s4, s2
6484; SI-NEXT:    s_mov_b32 s5, s3
6485; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
6486; SI-NEXT:    s_endpgm
6487;
6488; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
6489; VI:       ; %bb.0: ; %entry
6490; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
6491; VI-NEXT:    s_mov_b64 s[8:9], 0
6492; VI-NEXT:    s_waitcnt lgkmcnt(0)
6493; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
6494; VI-NEXT:    s_add_u32 s0, s0, s6
6495; VI-NEXT:    s_addc_u32 s1, s1, s7
6496; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
6497; VI-NEXT:    s_add_u32 s0, s0, 32
6498; VI-NEXT:    s_addc_u32 s1, s1, 0
6499; VI-NEXT:    v_mov_b32_e32 v0, s0
6500; VI-NEXT:    v_mov_b32_e32 v4, s5
6501; VI-NEXT:    s_waitcnt lgkmcnt(0)
6502; VI-NEXT:    v_mov_b32_e32 v2, s6
6503; VI-NEXT:    v_mov_b32_e32 v5, s4
6504; VI-NEXT:    v_mov_b32_e32 v3, s7
6505; VI-NEXT:    v_mov_b32_e32 v1, s1
6506; VI-NEXT:  .LBB103_1: ; %atomicrmw.start
6507; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6508; VI-NEXT:    v_mov_b32_e32 v9, v3
6509; VI-NEXT:    v_mov_b32_e32 v8, v2
6510; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
6511; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
6512; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
6513; VI-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
6514; VI-NEXT:    s_waitcnt vmcnt(0)
6515; VI-NEXT:    buffer_wbinvl1_vol
6516; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
6517; VI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6518; VI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6519; VI-NEXT:    s_cbranch_execnz .LBB103_1
6520; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6521; VI-NEXT:    s_or_b64 exec, exec, s[8:9]
6522; VI-NEXT:    v_mov_b32_e32 v0, s2
6523; VI-NEXT:    v_mov_b32_e32 v1, s3
6524; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
6525; VI-NEXT:    s_endpgm
6526;
6527; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
6528; GFX9:       ; %bb.0: ; %entry
6529; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
6530; GFX9-NEXT:    s_mov_b64 s[2:3], 0
6531; GFX9-NEXT:    v_mov_b32_e32 v4, 0
6532; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6533; GFX9-NEXT:    s_lshl_b64 s[0:1], s[14:15], 3
6534; GFX9-NEXT:    s_add_u32 s0, s8, s0
6535; GFX9-NEXT:    s_addc_u32 s1, s9, s1
6536; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x20
6537; GFX9-NEXT:    v_mov_b32_e32 v2, s13
6538; GFX9-NEXT:    v_mov_b32_e32 v3, s12
6539; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6540; GFX9-NEXT:    v_mov_b32_e32 v0, s4
6541; GFX9-NEXT:    v_mov_b32_e32 v1, s5
6542; GFX9-NEXT:  .LBB103_1: ; %atomicrmw.start
6543; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6544; GFX9-NEXT:    v_mov_b32_e32 v8, v1
6545; GFX9-NEXT:    v_mov_b32_e32 v7, v0
6546; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8]
6547; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
6548; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
6549; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
6550; GFX9-NEXT:    s_waitcnt vmcnt(0)
6551; GFX9-NEXT:    buffer_wbinvl1_vol
6552; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
6553; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
6554; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
6555; GFX9-NEXT:    s_cbranch_execnz .LBB103_1
6556; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6557; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6558; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6559; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[10:11]
6560; GFX9-NEXT:    s_endpgm
; The `entry` label name is echoed in the expected `%bb.0` comment lines above.
6561entry:
; Address = %out + %index * 8 (variable part) + 4 * 8 (constant 32 bytes).
6562  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
6563  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
6564  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
; Storing the atomic's result keeps it live, so the "returning" form is tested.
6565  store i64 %tmp0, ptr addrspace(1) %out2
6566  ret void
6567}
6568
; Kernel test: seq_cst `atomicrmw umax` on the i64 element %out[%index], with
; no syncscope given on the instruction. The value returned by the atomic is
; stored to %out2.
; The autogenerated checks below expect all three check prefixes to expand the
; operation into a compare-and-swap retry loop (%atomicrmw.start ..
; %atomicrmw.end) built from a 64-bit unsigned compare, two v_cndmask selects
; and a *_atomic_cmpswap_x2 with glc.
6569define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
6570; SI-LABEL: atomic_umax_i64_ret_addr64:
6571; SI:       ; %bb.0: ; %entry
6572; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
6573; SI-NEXT:    s_waitcnt lgkmcnt(0)
6574; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
6575; SI-NEXT:    s_add_u32 s8, s0, s6
6576; SI-NEXT:    s_addc_u32 s9, s1, s7
6577; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
6578; SI-NEXT:    s_mov_b64 s[0:1], 0
6579; SI-NEXT:    s_mov_b32 s11, 0xf000
6580; SI-NEXT:    v_mov_b32_e32 v8, s5
6581; SI-NEXT:    v_mov_b32_e32 v9, s4
6582; SI-NEXT:    s_waitcnt lgkmcnt(0)
6583; SI-NEXT:    v_mov_b32_e32 v2, s6
6584; SI-NEXT:    v_mov_b32_e32 v3, s7
6585; SI-NEXT:    s_mov_b32 s10, -1
6586; SI-NEXT:  .LBB104_1: ; %atomicrmw.start
6587; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6588; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
6589; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
6590; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
6591; SI-NEXT:    s_waitcnt expcnt(0)
6592; SI-NEXT:    v_mov_b32_e32 v7, v3
6593; SI-NEXT:    v_mov_b32_e32 v6, v2
6594; SI-NEXT:    v_mov_b32_e32 v5, v1
6595; SI-NEXT:    v_mov_b32_e32 v4, v0
6596; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
6597; SI-NEXT:    s_waitcnt vmcnt(0)
6598; SI-NEXT:    buffer_wbinvl1
6599; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
6600; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6601; SI-NEXT:    v_mov_b32_e32 v2, v4
6602; SI-NEXT:    v_mov_b32_e32 v3, v5
6603; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6604; SI-NEXT:    s_cbranch_execnz .LBB104_1
6605; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6606; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
6607; SI-NEXT:    s_mov_b32 s7, 0xf000
6608; SI-NEXT:    s_mov_b32 s6, -1
6609; SI-NEXT:    s_mov_b32 s4, s2
6610; SI-NEXT:    s_mov_b32 s5, s3
6611; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
6612; SI-NEXT:    s_endpgm
6613;
6614; VI-LABEL: atomic_umax_i64_ret_addr64:
6615; VI:       ; %bb.0: ; %entry
6616; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
6617; VI-NEXT:    s_waitcnt lgkmcnt(0)
6618; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
6619; VI-NEXT:    s_add_u32 s6, s0, s6
6620; VI-NEXT:    s_addc_u32 s7, s1, s7
6621; VI-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
6622; VI-NEXT:    v_mov_b32_e32 v0, s6
6623; VI-NEXT:    s_mov_b64 s[0:1], 0
6624; VI-NEXT:    v_mov_b32_e32 v4, s5
6625; VI-NEXT:    v_mov_b32_e32 v5, s4
6626; VI-NEXT:    s_waitcnt lgkmcnt(0)
6627; VI-NEXT:    v_mov_b32_e32 v2, s8
6628; VI-NEXT:    v_mov_b32_e32 v3, s9
6629; VI-NEXT:    v_mov_b32_e32 v1, s7
6630; VI-NEXT:  .LBB104_1: ; %atomicrmw.start
6631; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6632; VI-NEXT:    v_mov_b32_e32 v9, v3
6633; VI-NEXT:    v_mov_b32_e32 v8, v2
6634; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
6635; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
6636; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
6637; VI-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
6638; VI-NEXT:    s_waitcnt vmcnt(0)
6639; VI-NEXT:    buffer_wbinvl1_vol
6640; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
6641; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6642; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6643; VI-NEXT:    s_cbranch_execnz .LBB104_1
6644; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6645; VI-NEXT:    s_or_b64 exec, exec, s[0:1]
6646; VI-NEXT:    v_mov_b32_e32 v0, s2
6647; VI-NEXT:    v_mov_b32_e32 v1, s3
6648; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
6649; VI-NEXT:    s_endpgm
6650;
6651; GFX9-LABEL: atomic_umax_i64_ret_addr64:
6652; GFX9:       ; %bb.0: ; %entry
6653; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
6654; GFX9-NEXT:    s_mov_b64 s[2:3], 0
6655; GFX9-NEXT:    v_mov_b32_e32 v4, 0
6656; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6657; GFX9-NEXT:    s_lshl_b64 s[0:1], s[14:15], 3
6658; GFX9-NEXT:    s_add_u32 s0, s8, s0
6659; GFX9-NEXT:    s_addc_u32 s1, s9, s1
6660; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
6661; GFX9-NEXT:    v_mov_b32_e32 v2, s13
6662; GFX9-NEXT:    v_mov_b32_e32 v3, s12
6663; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6664; GFX9-NEXT:    v_mov_b32_e32 v0, s4
6665; GFX9-NEXT:    v_mov_b32_e32 v1, s5
6666; GFX9-NEXT:  .LBB104_1: ; %atomicrmw.start
6667; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6668; GFX9-NEXT:    v_mov_b32_e32 v8, v1
6669; GFX9-NEXT:    v_mov_b32_e32 v7, v0
6670; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8]
6671; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
6672; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
6673; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
6674; GFX9-NEXT:    s_waitcnt vmcnt(0)
6675; GFX9-NEXT:    buffer_wbinvl1_vol
6676; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
6677; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
6678; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
6679; GFX9-NEXT:    s_cbranch_execnz .LBB104_1
6680; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6681; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
6682; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6683; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[10:11]
6684; GFX9-NEXT:    s_endpgm
6685entry:
  ; Address of element %index of %out; this variant adds no constant offset.
6686  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
  ; Unsigned 64-bit max against %in; %tmp0 is the value the atomic returns.
6687  %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst
  ; Keep the returned value live by writing it to %out2.
6688  store i64 %tmp0, ptr addrspace(1) %out2
6689  ret void
6690}
6691
; Function test: seq_cst `atomicrmw umax` on the i64 located 4 elements
; (32 bytes) past %out, with the result unused and the instruction tagged with
; !amdgpu.no.remote.memory.
; The autogenerated checks below still expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end) for all three check prefixes; the 32-byte
; offset shows up as `offset:32` or as an explicit 64-bit add of 32.
6692define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
6693; SI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
6694; SI:       ; %bb.0:
6695; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6696; SI-NEXT:    s_mov_b32 s6, 0
6697; SI-NEXT:    s_mov_b32 s7, 0xf000
6698; SI-NEXT:    s_mov_b32 s4, s6
6699; SI-NEXT:    s_mov_b32 s5, s6
6700; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
6701; SI-NEXT:    s_mov_b64 s[8:9], 0
6702; SI-NEXT:  .LBB105_1: ; %atomicrmw.start
6703; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6704; SI-NEXT:    s_waitcnt vmcnt(0)
6705; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
6706; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
6707; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
6708; SI-NEXT:    s_waitcnt expcnt(0)
6709; SI-NEXT:    v_mov_b32_e32 v11, v7
6710; SI-NEXT:    v_mov_b32_e32 v10, v6
6711; SI-NEXT:    v_mov_b32_e32 v9, v5
6712; SI-NEXT:    v_mov_b32_e32 v8, v4
6713; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
6714; SI-NEXT:    s_waitcnt vmcnt(0)
6715; SI-NEXT:    buffer_wbinvl1
6716; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
6717; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6718; SI-NEXT:    v_mov_b32_e32 v6, v8
6719; SI-NEXT:    v_mov_b32_e32 v7, v9
6720; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6721; SI-NEXT:    s_cbranch_execnz .LBB105_1
6722; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6723; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
6724; SI-NEXT:    s_waitcnt expcnt(0)
6725; SI-NEXT:    s_setpc_b64 s[30:31]
6726;
6727; VI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
6728; VI:       ; %bb.0:
6729; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6730; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
6731; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6732; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
6733; VI-NEXT:    s_mov_b64 s[4:5], 0
6734; VI-NEXT:  .LBB105_1: ; %atomicrmw.start
6735; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6736; VI-NEXT:    s_waitcnt vmcnt(0)
6737; VI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
6738; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
6739; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
6740; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6741; VI-NEXT:    s_waitcnt vmcnt(0)
6742; VI-NEXT:    buffer_wbinvl1_vol
6743; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6744; VI-NEXT:    v_mov_b32_e32 v7, v5
6745; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6746; VI-NEXT:    v_mov_b32_e32 v6, v4
6747; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6748; VI-NEXT:    s_cbranch_execnz .LBB105_1
6749; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6750; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
6751; VI-NEXT:    s_setpc_b64 s[30:31]
6752;
6753; GFX9-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
6754; GFX9:       ; %bb.0:
6755; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6756; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
6757; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6758; GFX9-NEXT:  .LBB105_1: ; %atomicrmw.start
6759; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6760; GFX9-NEXT:    s_waitcnt vmcnt(0)
6761; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
6762; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
6763; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
6764; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
6765; GFX9-NEXT:    s_waitcnt vmcnt(0)
6766; GFX9-NEXT:    buffer_wbinvl1_vol
6767; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6768; GFX9-NEXT:    v_mov_b32_e32 v7, v5
6769; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6770; GFX9-NEXT:    v_mov_b32_e32 v6, v4
6771; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6772; GFX9-NEXT:    s_cbranch_execnz .LBB105_1
6773; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6774; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6775; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; 4 x i64 past %out, i.e. a 32-byte constant offset.
6776  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
  ; Result intentionally unused (the "noret" form of the test).
6777  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
6778  ret void
6779}
6780
; Function test: seq_cst `atomicrmw umax` on the i64 located 4 elements
; (32 bytes) past %out, tagged with !amdgpu.no.remote.memory; the value
; returned by the atomic is the function's return value.
; The autogenerated checks below still expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end) for all three check prefixes.
6781define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
6782; SI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
6783; SI:       ; %bb.0:
6784; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6785; SI-NEXT:    v_mov_b32_e32 v5, v3
6786; SI-NEXT:    v_mov_b32_e32 v4, v2
6787; SI-NEXT:    v_mov_b32_e32 v7, v1
6788; SI-NEXT:    v_mov_b32_e32 v6, v0
6789; SI-NEXT:    s_mov_b32 s6, 0
6790; SI-NEXT:    s_mov_b32 s7, 0xf000
6791; SI-NEXT:    s_mov_b32 s4, s6
6792; SI-NEXT:    s_mov_b32 s5, s6
6793; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
6794; SI-NEXT:    s_mov_b64 s[8:9], 0
6795; SI-NEXT:  .LBB106_1: ; %atomicrmw.start
6796; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6797; SI-NEXT:    s_waitcnt vmcnt(0)
6798; SI-NEXT:    v_mov_b32_e32 v11, v1
6799; SI-NEXT:    v_mov_b32_e32 v10, v0
6800; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5]
6801; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
6802; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
6803; SI-NEXT:    s_waitcnt expcnt(0)
6804; SI-NEXT:    v_mov_b32_e32 v0, v8
6805; SI-NEXT:    v_mov_b32_e32 v1, v9
6806; SI-NEXT:    v_mov_b32_e32 v2, v10
6807; SI-NEXT:    v_mov_b32_e32 v3, v11
6808; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
6809; SI-NEXT:    s_waitcnt vmcnt(0)
6810; SI-NEXT:    buffer_wbinvl1
6811; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
6812; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6813; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6814; SI-NEXT:    s_cbranch_execnz .LBB106_1
6815; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6816; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
6817; SI-NEXT:    s_waitcnt expcnt(0)
6818; SI-NEXT:    s_setpc_b64 s[30:31]
6819;
6820; VI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
6821; VI:       ; %bb.0:
6822; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6823; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
6824; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
6825; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
6826; VI-NEXT:    s_mov_b64 s[4:5], 0
6827; VI-NEXT:  .LBB106_1: ; %atomicrmw.start
6828; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6829; VI-NEXT:    s_waitcnt vmcnt(0)
6830; VI-NEXT:    v_mov_b32_e32 v9, v1
6831; VI-NEXT:    v_mov_b32_e32 v8, v0
6832; VI-NEXT:    v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
6833; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
6834; VI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
6835; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6836; VI-NEXT:    s_waitcnt vmcnt(0)
6837; VI-NEXT:    buffer_wbinvl1_vol
6838; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6839; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6840; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6841; VI-NEXT:    s_cbranch_execnz .LBB106_1
6842; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6843; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
6844; VI-NEXT:    s_setpc_b64 s[30:31]
6845;
6846; GFX9-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
6847; GFX9:       ; %bb.0:
6848; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6849; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
6850; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6851; GFX9-NEXT:  .LBB106_1: ; %atomicrmw.start
6852; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6853; GFX9-NEXT:    s_waitcnt vmcnt(0)
6854; GFX9-NEXT:    v_mov_b32_e32 v7, v5
6855; GFX9-NEXT:    v_mov_b32_e32 v6, v4
6856; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
6857; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
6858; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
6859; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
6860; GFX9-NEXT:    s_waitcnt vmcnt(0)
6861; GFX9-NEXT:    buffer_wbinvl1_vol
6862; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6863; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6864; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6865; GFX9-NEXT:    s_cbranch_execnz .LBB106_1
6866; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6867; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6868; GFX9-NEXT:    v_mov_b32_e32 v0, v4
6869; GFX9-NEXT:    v_mov_b32_e32 v1, v5
6870; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; 4 x i64 past %out, i.e. a 32-byte constant offset.
6871  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
  ; %result is the value the atomic returns; it is returned to the caller.
6872  %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
6873  ret i64 %result
6874}
6875
6876; ---------------------------------------------------------------------
6877; atomicrmw umin
6878; ---------------------------------------------------------------------
6879
; Function test: seq_cst `atomicrmw umin` directly on the i64 at %ptr (no
; offset), with the result unused and no syncscope given on the instruction.
; The autogenerated checks below expect all three check prefixes to expand the
; operation into a compare-and-swap retry loop (%atomicrmw.start ..
; %atomicrmw.end) that selects the new value with v_cmp_le_u64 + v_cndmask.
6880define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
6881; SI-LABEL: global_atomic_umin_i64_noret:
6882; SI:       ; %bb.0:
6883; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6884; SI-NEXT:    s_mov_b32 s6, 0
6885; SI-NEXT:    s_mov_b32 s7, 0xf000
6886; SI-NEXT:    s_mov_b32 s4, s6
6887; SI-NEXT:    s_mov_b32 s5, s6
6888; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
6889; SI-NEXT:    s_mov_b64 s[8:9], 0
6890; SI-NEXT:  .LBB107_1: ; %atomicrmw.start
6891; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6892; SI-NEXT:    s_waitcnt vmcnt(0)
6893; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6894; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
6895; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
6896; SI-NEXT:    s_waitcnt expcnt(0)
6897; SI-NEXT:    v_mov_b32_e32 v11, v7
6898; SI-NEXT:    v_mov_b32_e32 v10, v6
6899; SI-NEXT:    v_mov_b32_e32 v9, v5
6900; SI-NEXT:    v_mov_b32_e32 v8, v4
6901; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
6902; SI-NEXT:    s_waitcnt vmcnt(0)
6903; SI-NEXT:    buffer_wbinvl1
6904; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
6905; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6906; SI-NEXT:    v_mov_b32_e32 v6, v8
6907; SI-NEXT:    v_mov_b32_e32 v7, v9
6908; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6909; SI-NEXT:    s_cbranch_execnz .LBB107_1
6910; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6911; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
6912; SI-NEXT:    s_waitcnt expcnt(0)
6913; SI-NEXT:    s_setpc_b64 s[30:31]
6914;
6915; VI-LABEL: global_atomic_umin_i64_noret:
6916; VI:       ; %bb.0:
6917; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6918; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
6919; VI-NEXT:    s_mov_b64 s[4:5], 0
6920; VI-NEXT:  .LBB107_1: ; %atomicrmw.start
6921; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
6922; VI-NEXT:    s_waitcnt vmcnt(0)
6923; VI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6924; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
6925; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
6926; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6927; VI-NEXT:    s_waitcnt vmcnt(0)
6928; VI-NEXT:    buffer_wbinvl1_vol
6929; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6930; VI-NEXT:    v_mov_b32_e32 v7, v5
6931; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6932; VI-NEXT:    v_mov_b32_e32 v6, v4
6933; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6934; VI-NEXT:    s_cbranch_execnz .LBB107_1
6935; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
6936; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
6937; VI-NEXT:    s_setpc_b64 s[30:31]
6938;
6939; GFX9-LABEL: global_atomic_umin_i64_noret:
6940; GFX9:       ; %bb.0:
6941; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6942; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
6943; GFX9-NEXT:    s_mov_b64 s[4:5], 0
6944; GFX9-NEXT:  .LBB107_1: ; %atomicrmw.start
6945; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
6946; GFX9-NEXT:    s_waitcnt vmcnt(0)
6947; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6948; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
6949; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
6950; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
6951; GFX9-NEXT:    s_waitcnt vmcnt(0)
6952; GFX9-NEXT:    buffer_wbinvl1_vol
6953; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6954; GFX9-NEXT:    v_mov_b32_e32 v7, v5
6955; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6956; GFX9-NEXT:    v_mov_b32_e32 v6, v4
6957; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6958; GFX9-NEXT:    s_cbranch_execnz .LBB107_1
6959; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
6960; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
6961; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Result intentionally unused (the "noret" form of the test).
6962  %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst
6963  ret void
6964}
6965
; Function test: seq_cst `atomicrmw umin` on the i64 located 4 elements
; (32 bytes) past %out, with the result unused and no syncscope given.
; The autogenerated checks below expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end) for all three check prefixes; the 32-byte
; offset shows up as `offset:32` or as an explicit 64-bit add of 32.
6966define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
6967; SI-LABEL: global_atomic_umin_i64_noret_offset:
6968; SI:       ; %bb.0:
6969; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6970; SI-NEXT:    s_mov_b32 s6, 0
6971; SI-NEXT:    s_mov_b32 s7, 0xf000
6972; SI-NEXT:    s_mov_b32 s4, s6
6973; SI-NEXT:    s_mov_b32 s5, s6
6974; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
6975; SI-NEXT:    s_mov_b64 s[8:9], 0
6976; SI-NEXT:  .LBB108_1: ; %atomicrmw.start
6977; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
6978; SI-NEXT:    s_waitcnt vmcnt(0)
6979; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6980; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
6981; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
6982; SI-NEXT:    s_waitcnt expcnt(0)
6983; SI-NEXT:    v_mov_b32_e32 v11, v7
6984; SI-NEXT:    v_mov_b32_e32 v10, v6
6985; SI-NEXT:    v_mov_b32_e32 v9, v5
6986; SI-NEXT:    v_mov_b32_e32 v8, v4
6987; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
6988; SI-NEXT:    s_waitcnt vmcnt(0)
6989; SI-NEXT:    buffer_wbinvl1
6990; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
6991; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
6992; SI-NEXT:    v_mov_b32_e32 v6, v8
6993; SI-NEXT:    v_mov_b32_e32 v7, v9
6994; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
6995; SI-NEXT:    s_cbranch_execnz .LBB108_1
6996; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
6997; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
6998; SI-NEXT:    s_waitcnt expcnt(0)
6999; SI-NEXT:    s_setpc_b64 s[30:31]
7000;
7001; VI-LABEL: global_atomic_umin_i64_noret_offset:
7002; VI:       ; %bb.0:
7003; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7004; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
7005; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7006; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
7007; VI-NEXT:    s_mov_b64 s[4:5], 0
7008; VI-NEXT:  .LBB108_1: ; %atomicrmw.start
7009; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7010; VI-NEXT:    s_waitcnt vmcnt(0)
7011; VI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
7012; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7013; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7014; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7015; VI-NEXT:    s_waitcnt vmcnt(0)
7016; VI-NEXT:    buffer_wbinvl1_vol
7017; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7018; VI-NEXT:    v_mov_b32_e32 v7, v5
7019; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7020; VI-NEXT:    v_mov_b32_e32 v6, v4
7021; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7022; VI-NEXT:    s_cbranch_execnz .LBB108_1
7023; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7024; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7025; VI-NEXT:    s_setpc_b64 s[30:31]
7026;
7027; GFX9-LABEL: global_atomic_umin_i64_noret_offset:
7028; GFX9:       ; %bb.0:
7029; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7030; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
7031; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7032; GFX9-NEXT:  .LBB108_1: ; %atomicrmw.start
7033; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7034; GFX9-NEXT:    s_waitcnt vmcnt(0)
7035; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
7036; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7037; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7038; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
7039; GFX9-NEXT:    s_waitcnt vmcnt(0)
7040; GFX9-NEXT:    buffer_wbinvl1_vol
7041; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7042; GFX9-NEXT:    v_mov_b32_e32 v7, v5
7043; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7044; GFX9-NEXT:    v_mov_b32_e32 v6, v4
7045; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7046; GFX9-NEXT:    s_cbranch_execnz .LBB108_1
7047; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7048; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7049; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; 4 x i64 past %out, i.e. a 32-byte constant offset.
7050  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
  ; Result intentionally unused (the "noret" form of the test).
7051  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst
7052  ret void
7053}
7054
; Function test: seq_cst `atomicrmw umin` directly on the i64 at %ptr (no
; offset, no syncscope given); the value returned by the atomic is the
; function's return value.
; The autogenerated checks below expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end) for all three check prefixes.
7055define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
7056; SI-LABEL: global_atomic_umin_i64_ret:
7057; SI:       ; %bb.0:
7058; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7059; SI-NEXT:    v_mov_b32_e32 v5, v3
7060; SI-NEXT:    v_mov_b32_e32 v4, v2
7061; SI-NEXT:    v_mov_b32_e32 v7, v1
7062; SI-NEXT:    v_mov_b32_e32 v6, v0
7063; SI-NEXT:    s_mov_b32 s6, 0
7064; SI-NEXT:    s_mov_b32 s7, 0xf000
7065; SI-NEXT:    s_mov_b32 s4, s6
7066; SI-NEXT:    s_mov_b32 s5, s6
7067; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
7068; SI-NEXT:    s_mov_b64 s[8:9], 0
7069; SI-NEXT:  .LBB109_1: ; %atomicrmw.start
7070; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7071; SI-NEXT:    s_waitcnt vmcnt(0)
7072; SI-NEXT:    v_mov_b32_e32 v11, v1
7073; SI-NEXT:    v_mov_b32_e32 v10, v0
7074; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[10:11], v[4:5]
7075; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
7076; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
7077; SI-NEXT:    s_waitcnt expcnt(0)
7078; SI-NEXT:    v_mov_b32_e32 v0, v8
7079; SI-NEXT:    v_mov_b32_e32 v1, v9
7080; SI-NEXT:    v_mov_b32_e32 v2, v10
7081; SI-NEXT:    v_mov_b32_e32 v3, v11
7082; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
7083; SI-NEXT:    s_waitcnt vmcnt(0)
7084; SI-NEXT:    buffer_wbinvl1
7085; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
7086; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7087; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7088; SI-NEXT:    s_cbranch_execnz .LBB109_1
7089; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7090; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7091; SI-NEXT:    s_waitcnt expcnt(0)
7092; SI-NEXT:    s_setpc_b64 s[30:31]
7093;
7094; VI-LABEL: global_atomic_umin_i64_ret:
7095; VI:       ; %bb.0:
7096; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7097; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
7098; VI-NEXT:    s_mov_b64 s[4:5], 0
7099; VI-NEXT:  .LBB109_1: ; %atomicrmw.start
7100; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7101; VI-NEXT:    s_waitcnt vmcnt(0)
7102; VI-NEXT:    v_mov_b32_e32 v7, v5
7103; VI-NEXT:    v_mov_b32_e32 v6, v4
7104; VI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
7105; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7106; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7107; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7108; VI-NEXT:    s_waitcnt vmcnt(0)
7109; VI-NEXT:    buffer_wbinvl1_vol
7110; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7111; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7112; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7113; VI-NEXT:    s_cbranch_execnz .LBB109_1
7114; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7115; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7116; VI-NEXT:    v_mov_b32_e32 v0, v4
7117; VI-NEXT:    v_mov_b32_e32 v1, v5
7118; VI-NEXT:    s_setpc_b64 s[30:31]
7119;
7120; GFX9-LABEL: global_atomic_umin_i64_ret:
7121; GFX9:       ; %bb.0:
7122; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7123; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
7124; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7125; GFX9-NEXT:  .LBB109_1: ; %atomicrmw.start
7126; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7127; GFX9-NEXT:    s_waitcnt vmcnt(0)
7128; GFX9-NEXT:    v_mov_b32_e32 v7, v5
7129; GFX9-NEXT:    v_mov_b32_e32 v6, v4
7130; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
7131; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7132; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7133; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
7134; GFX9-NEXT:    s_waitcnt vmcnt(0)
7135; GFX9-NEXT:    buffer_wbinvl1_vol
7136; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7137; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7138; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7139; GFX9-NEXT:    s_cbranch_execnz .LBB109_1
7140; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7141; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7142; GFX9-NEXT:    v_mov_b32_e32 v0, v4
7143; GFX9-NEXT:    v_mov_b32_e32 v1, v5
7144; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; %result is the value the atomic returns; it is returned to the caller.
7145  %result = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst
7146  ret i64 %result
7147}
7148
; Function test: seq_cst `atomicrmw umin` on the i64 located 4 elements
; (32 bytes) past %out (no syncscope given); the value returned by the atomic
; is the function's return value.
; The autogenerated checks below expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end) for all three check prefixes; the 32-byte
; offset shows up as `offset:32` or as an explicit 64-bit add of 32.
7149define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
7150; SI-LABEL: global_atomic_umin_i64_ret_offset:
7151; SI:       ; %bb.0:
7152; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7153; SI-NEXT:    v_mov_b32_e32 v5, v3
7154; SI-NEXT:    v_mov_b32_e32 v4, v2
7155; SI-NEXT:    v_mov_b32_e32 v7, v1
7156; SI-NEXT:    v_mov_b32_e32 v6, v0
7157; SI-NEXT:    s_mov_b32 s6, 0
7158; SI-NEXT:    s_mov_b32 s7, 0xf000
7159; SI-NEXT:    s_mov_b32 s4, s6
7160; SI-NEXT:    s_mov_b32 s5, s6
7161; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
7162; SI-NEXT:    s_mov_b64 s[8:9], 0
7163; SI-NEXT:  .LBB110_1: ; %atomicrmw.start
7164; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7165; SI-NEXT:    s_waitcnt vmcnt(0)
7166; SI-NEXT:    v_mov_b32_e32 v11, v1
7167; SI-NEXT:    v_mov_b32_e32 v10, v0
7168; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[10:11], v[4:5]
7169; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
7170; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
7171; SI-NEXT:    s_waitcnt expcnt(0)
7172; SI-NEXT:    v_mov_b32_e32 v0, v8
7173; SI-NEXT:    v_mov_b32_e32 v1, v9
7174; SI-NEXT:    v_mov_b32_e32 v2, v10
7175; SI-NEXT:    v_mov_b32_e32 v3, v11
7176; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
7177; SI-NEXT:    s_waitcnt vmcnt(0)
7178; SI-NEXT:    buffer_wbinvl1
7179; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
7180; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7181; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7182; SI-NEXT:    s_cbranch_execnz .LBB110_1
7183; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7184; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7185; SI-NEXT:    s_waitcnt expcnt(0)
7186; SI-NEXT:    s_setpc_b64 s[30:31]
7187;
7188; VI-LABEL: global_atomic_umin_i64_ret_offset:
7189; VI:       ; %bb.0:
7190; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7191; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
7192; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
7193; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
7194; VI-NEXT:    s_mov_b64 s[4:5], 0
7195; VI-NEXT:  .LBB110_1: ; %atomicrmw.start
7196; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7197; VI-NEXT:    s_waitcnt vmcnt(0)
7198; VI-NEXT:    v_mov_b32_e32 v9, v1
7199; VI-NEXT:    v_mov_b32_e32 v8, v0
7200; VI-NEXT:    v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
7201; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
7202; VI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
7203; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
7204; VI-NEXT:    s_waitcnt vmcnt(0)
7205; VI-NEXT:    buffer_wbinvl1_vol
7206; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
7207; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7208; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7209; VI-NEXT:    s_cbranch_execnz .LBB110_1
7210; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7211; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7212; VI-NEXT:    s_setpc_b64 s[30:31]
7213;
7214; GFX9-LABEL: global_atomic_umin_i64_ret_offset:
7215; GFX9:       ; %bb.0:
7216; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7217; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
7218; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7219; GFX9-NEXT:  .LBB110_1: ; %atomicrmw.start
7220; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7221; GFX9-NEXT:    s_waitcnt vmcnt(0)
7222; GFX9-NEXT:    v_mov_b32_e32 v7, v5
7223; GFX9-NEXT:    v_mov_b32_e32 v6, v4
7224; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
7225; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7226; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7227; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
7228; GFX9-NEXT:    s_waitcnt vmcnt(0)
7229; GFX9-NEXT:    buffer_wbinvl1_vol
7230; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7231; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7232; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7233; GFX9-NEXT:    s_cbranch_execnz .LBB110_1
7234; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7235; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7236; GFX9-NEXT:    v_mov_b32_e32 v0, v4
7237; GFX9-NEXT:    v_mov_b32_e32 v1, v5
7238; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; 4 x i64 past %out, i.e. a 32-byte constant offset.
7239  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
  ; %result is the value the atomic returns; it is returned to the caller.
7240  %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst
7241  ret i64 %result
7242}
7243
7244define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
7245; SI-LABEL: global_atomic_umin_i64_noret_scalar:
7246; SI:       ; %bb.0:
7247; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7248; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7249; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
7250; SI-NEXT:    s_mov_b64 exec, s[34:35]
7251; SI-NEXT:    s_waitcnt expcnt(0)
7252; SI-NEXT:    v_writelane_b32 v10, s6, 0
7253; SI-NEXT:    v_writelane_b32 v10, s7, 1
7254; SI-NEXT:    s_mov_b32 s35, s7
7255; SI-NEXT:    s_mov_b32 s34, s6
7256; SI-NEXT:    s_mov_b32 s7, 0xf000
7257; SI-NEXT:    s_mov_b32 s6, -1
7258; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
7259; SI-NEXT:    s_mov_b64 s[36:37], 0
7260; SI-NEXT:    v_mov_b32_e32 v4, s35
7261; SI-NEXT:    v_mov_b32_e32 v5, s34
7262; SI-NEXT:  .LBB111_1: ; %atomicrmw.start
7263; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7264; SI-NEXT:    s_waitcnt vmcnt(0)
7265; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
7266; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
7267; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
7268; SI-NEXT:    s_waitcnt expcnt(0)
7269; SI-NEXT:    v_mov_b32_e32 v9, v3
7270; SI-NEXT:    v_mov_b32_e32 v8, v2
7271; SI-NEXT:    v_mov_b32_e32 v7, v1
7272; SI-NEXT:    v_mov_b32_e32 v6, v0
7273; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
7274; SI-NEXT:    s_waitcnt vmcnt(0)
7275; SI-NEXT:    buffer_wbinvl1
7276; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
7277; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
7278; SI-NEXT:    v_mov_b32_e32 v2, v6
7279; SI-NEXT:    v_mov_b32_e32 v3, v7
7280; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
7281; SI-NEXT:    s_cbranch_execnz .LBB111_1
7282; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7283; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
7284; SI-NEXT:    v_readlane_b32 s7, v10, 1
7285; SI-NEXT:    v_readlane_b32 s6, v10, 0
7286; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7287; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
7288; SI-NEXT:    s_mov_b64 exec, s[34:35]
7289; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
7290; SI-NEXT:    s_setpc_b64 s[30:31]
7291;
7292; VI-LABEL: global_atomic_umin_i64_noret_scalar:
7293; VI:       ; %bb.0:
7294; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7295; VI-NEXT:    v_mov_b32_e32 v0, s4
7296; VI-NEXT:    v_mov_b32_e32 v1, s5
7297; VI-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
7298; VI-NEXT:    v_mov_b32_e32 v4, s4
7299; VI-NEXT:    s_mov_b64 s[34:35], 0
7300; VI-NEXT:    v_mov_b32_e32 v6, s7
7301; VI-NEXT:    v_mov_b32_e32 v7, s6
7302; VI-NEXT:    v_mov_b32_e32 v5, s5
7303; VI-NEXT:  .LBB111_1: ; %atomicrmw.start
7304; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7305; VI-NEXT:    s_waitcnt vmcnt(0)
7306; VI-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
7307; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
7308; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
7309; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7310; VI-NEXT:    s_waitcnt vmcnt(0)
7311; VI-NEXT:    buffer_wbinvl1_vol
7312; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7313; VI-NEXT:    v_mov_b32_e32 v3, v1
7314; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7315; VI-NEXT:    v_mov_b32_e32 v2, v0
7316; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7317; VI-NEXT:    s_cbranch_execnz .LBB111_1
7318; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7319; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
7320; VI-NEXT:    s_setpc_b64 s[30:31]
7321;
7322; GFX9-LABEL: global_atomic_umin_i64_noret_scalar:
7323; GFX9:       ; %bb.0:
7324; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7325; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7326; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
7327; GFX9-NEXT:    s_mov_b64 s[34:35], 0
7328; GFX9-NEXT:    v_mov_b32_e32 v5, s7
7329; GFX9-NEXT:    v_mov_b32_e32 v6, s6
7330; GFX9-NEXT:  .LBB111_1: ; %atomicrmw.start
7331; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7332; GFX9-NEXT:    s_waitcnt vmcnt(0)
7333; GFX9-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
7334; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
7335; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
7336; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
7337; GFX9-NEXT:    s_waitcnt vmcnt(0)
7338; GFX9-NEXT:    buffer_wbinvl1_vol
7339; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7340; GFX9-NEXT:    v_mov_b32_e32 v3, v1
7341; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7342; GFX9-NEXT:    v_mov_b32_e32 v2, v0
7343; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7344; GFX9-NEXT:    s_cbranch_execnz .LBB111_1
7345; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7346; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
7347; GFX9-NEXT:    s_setpc_b64 s[30:31]
7348  %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst
7349  ret void
7350}
7351
; Unsigned-min atomicrmw whose result is unused, on a pointer advanced by
; 4 x i64 (32 bytes). Uses the amdgpu_gfx calling convention with both
; arguments marked inreg; the checks read the pointer from s[4:5] and the
; operand from s[6:7].
; All three prefixes expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end) instead of a single min instruction.
; In the SI checks s6/s7 are overwritten to build the s[4:7] buffer
; descriptor, so they are saved to lanes of v10 (v_writelane/v_readlane) and
; v10 itself is spilled and reloaded around the body.
; The SI/VI/GFX9 lines are autogenerated: regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
7352define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
7353; SI-LABEL: global_atomic_umin_i64_noret_offset_scalar:
7354; SI:       ; %bb.0:
7355; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7356; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7357; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
7358; SI-NEXT:    s_mov_b64 exec, s[34:35]
7359; SI-NEXT:    s_waitcnt expcnt(0)
7360; SI-NEXT:    v_writelane_b32 v10, s6, 0
7361; SI-NEXT:    v_writelane_b32 v10, s7, 1
7362; SI-NEXT:    s_mov_b32 s35, s7
7363; SI-NEXT:    s_mov_b32 s34, s6
7364; SI-NEXT:    s_mov_b32 s7, 0xf000
7365; SI-NEXT:    s_mov_b32 s6, -1
7366; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
7367; SI-NEXT:    s_mov_b64 s[36:37], 0
7368; SI-NEXT:    v_mov_b32_e32 v4, s35
7369; SI-NEXT:    v_mov_b32_e32 v5, s34
7370; SI-NEXT:  .LBB112_1: ; %atomicrmw.start
7371; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7372; SI-NEXT:    s_waitcnt vmcnt(0)
7373; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
7374; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
7375; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
7376; SI-NEXT:    s_waitcnt expcnt(0)
7377; SI-NEXT:    v_mov_b32_e32 v9, v3
7378; SI-NEXT:    v_mov_b32_e32 v8, v2
7379; SI-NEXT:    v_mov_b32_e32 v7, v1
7380; SI-NEXT:    v_mov_b32_e32 v6, v0
7381; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
7382; SI-NEXT:    s_waitcnt vmcnt(0)
7383; SI-NEXT:    buffer_wbinvl1
7384; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
7385; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
7386; SI-NEXT:    v_mov_b32_e32 v2, v6
7387; SI-NEXT:    v_mov_b32_e32 v3, v7
7388; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
7389; SI-NEXT:    s_cbranch_execnz .LBB112_1
7390; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7391; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
7392; SI-NEXT:    v_readlane_b32 s7, v10, 1
7393; SI-NEXT:    v_readlane_b32 s6, v10, 0
7394; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7395; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
7396; SI-NEXT:    s_mov_b64 exec, s[34:35]
7397; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
7398; SI-NEXT:    s_setpc_b64 s[30:31]
7399;
7400; VI-LABEL: global_atomic_umin_i64_noret_offset_scalar:
7401; VI:       ; %bb.0:
7402; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7403; VI-NEXT:    s_add_u32 s34, s4, 32
7404; VI-NEXT:    s_addc_u32 s35, s5, 0
7405; VI-NEXT:    v_mov_b32_e32 v4, s34
7406; VI-NEXT:    v_mov_b32_e32 v5, s35
7407; VI-NEXT:    flat_load_dwordx2 v[2:3], v[4:5]
7408; VI-NEXT:    s_mov_b64 s[34:35], 0
7409; VI-NEXT:    v_mov_b32_e32 v6, s7
7410; VI-NEXT:    v_mov_b32_e32 v7, s6
7411; VI-NEXT:  .LBB112_1: ; %atomicrmw.start
7412; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7413; VI-NEXT:    s_waitcnt vmcnt(0)
7414; VI-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
7415; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
7416; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
7417; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7418; VI-NEXT:    s_waitcnt vmcnt(0)
7419; VI-NEXT:    buffer_wbinvl1_vol
7420; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7421; VI-NEXT:    v_mov_b32_e32 v3, v1
7422; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7423; VI-NEXT:    v_mov_b32_e32 v2, v0
7424; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7425; VI-NEXT:    s_cbranch_execnz .LBB112_1
7426; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7427; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
7428; VI-NEXT:    s_setpc_b64 s[30:31]
7429;
7430; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar:
7431; GFX9:       ; %bb.0:
7432; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7433; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7434; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
7435; GFX9-NEXT:    s_mov_b64 s[34:35], 0
7436; GFX9-NEXT:    v_mov_b32_e32 v5, s7
7437; GFX9-NEXT:    v_mov_b32_e32 v6, s6
7438; GFX9-NEXT:  .LBB112_1: ; %atomicrmw.start
7439; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7440; GFX9-NEXT:    s_waitcnt vmcnt(0)
7441; GFX9-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
7442; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
7443; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
7444; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
7445; GFX9-NEXT:    s_waitcnt vmcnt(0)
7446; GFX9-NEXT:    buffer_wbinvl1_vol
7447; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7448; GFX9-NEXT:    v_mov_b32_e32 v3, v1
7449; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7450; GFX9-NEXT:    v_mov_b32_e32 v2, v0
7451; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7452; GFX9-NEXT:    s_cbranch_execnz .LBB112_1
7453; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7454; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
7455; GFX9-NEXT:    s_setpc_b64 s[30:31]
; i64 element index 4 is byte offset 32: the "offset:32" (SI/GFX9) and the
; explicit s_add_u32 ..., 32 (VI) in the checks above.
7456  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
; %tmp0 has no users; the function returns void.
7457  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst
7458  ret void
7459}
7460
; Unsigned-min atomicrmw that returns the value produced by the atomic,
; applied directly to %ptr (no offset). Uses the amdgpu_gfx calling
; convention with both arguments marked inreg; the checks read the pointer
; from s[4:5] and the operand from s[6:7].
; All three prefixes expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end) whose cmpswap result lands in v[0:1];
; no extra copy precedes s_setpc_b64.
; In the SI checks s6/s7 are overwritten to build the s[4:7] buffer
; descriptor, so they are saved to lanes of v10 and restored after the loop.
; The SI/VI/GFX9 lines are autogenerated: regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
7461define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
7462; SI-LABEL: global_atomic_umin_i64_ret_scalar:
7463; SI:       ; %bb.0:
7464; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7465; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7466; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
7467; SI-NEXT:    s_mov_b64 exec, s[34:35]
7468; SI-NEXT:    s_waitcnt expcnt(0)
7469; SI-NEXT:    v_writelane_b32 v10, s6, 0
7470; SI-NEXT:    v_writelane_b32 v10, s7, 1
7471; SI-NEXT:    s_mov_b32 s35, s7
7472; SI-NEXT:    s_mov_b32 s34, s6
7473; SI-NEXT:    s_mov_b32 s7, 0xf000
7474; SI-NEXT:    s_mov_b32 s6, -1
7475; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
7476; SI-NEXT:    s_mov_b64 s[36:37], 0
7477; SI-NEXT:    v_mov_b32_e32 v4, s35
7478; SI-NEXT:    v_mov_b32_e32 v5, s34
7479; SI-NEXT:  .LBB113_1: ; %atomicrmw.start
7480; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7481; SI-NEXT:    s_waitcnt vmcnt(0)
7482; SI-NEXT:    v_mov_b32_e32 v9, v1
7483; SI-NEXT:    v_mov_b32_e32 v8, v0
7484; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
7485; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
7486; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
7487; SI-NEXT:    s_waitcnt expcnt(0)
7488; SI-NEXT:    v_mov_b32_e32 v0, v6
7489; SI-NEXT:    v_mov_b32_e32 v1, v7
7490; SI-NEXT:    v_mov_b32_e32 v2, v8
7491; SI-NEXT:    v_mov_b32_e32 v3, v9
7492; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
7493; SI-NEXT:    s_waitcnt vmcnt(0)
7494; SI-NEXT:    buffer_wbinvl1
7495; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
7496; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
7497; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
7498; SI-NEXT:    s_cbranch_execnz .LBB113_1
7499; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7500; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
7501; SI-NEXT:    v_readlane_b32 s7, v10, 1
7502; SI-NEXT:    v_readlane_b32 s6, v10, 0
7503; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7504; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
7505; SI-NEXT:    s_mov_b64 exec, s[34:35]
7506; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
7507; SI-NEXT:    s_setpc_b64 s[30:31]
7508;
7509; VI-LABEL: global_atomic_umin_i64_ret_scalar:
7510; VI:       ; %bb.0:
7511; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7512; VI-NEXT:    v_mov_b32_e32 v0, s4
7513; VI-NEXT:    v_mov_b32_e32 v1, s5
7514; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
7515; VI-NEXT:    v_mov_b32_e32 v2, s4
7516; VI-NEXT:    s_mov_b64 s[34:35], 0
7517; VI-NEXT:    v_mov_b32_e32 v4, s7
7518; VI-NEXT:    v_mov_b32_e32 v5, s6
7519; VI-NEXT:    v_mov_b32_e32 v3, s5
7520; VI-NEXT:  .LBB113_1: ; %atomicrmw.start
7521; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7522; VI-NEXT:    s_waitcnt vmcnt(0)
7523; VI-NEXT:    v_mov_b32_e32 v9, v1
7524; VI-NEXT:    v_mov_b32_e32 v8, v0
7525; VI-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
7526; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
7527; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
7528; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
7529; VI-NEXT:    s_waitcnt vmcnt(0)
7530; VI-NEXT:    buffer_wbinvl1_vol
7531; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
7532; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7533; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7534; VI-NEXT:    s_cbranch_execnz .LBB113_1
7535; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7536; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
7537; VI-NEXT:    s_setpc_b64 s[30:31]
7538;
7539; GFX9-LABEL: global_atomic_umin_i64_ret_scalar:
7540; GFX9:       ; %bb.0:
7541; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7542; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7543; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
7544; GFX9-NEXT:    s_mov_b64 s[34:35], 0
7545; GFX9-NEXT:    v_mov_b32_e32 v3, s7
7546; GFX9-NEXT:    v_mov_b32_e32 v4, s6
7547; GFX9-NEXT:  .LBB113_1: ; %atomicrmw.start
7548; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7549; GFX9-NEXT:    s_waitcnt vmcnt(0)
7550; GFX9-NEXT:    v_mov_b32_e32 v8, v1
7551; GFX9-NEXT:    v_mov_b32_e32 v7, v0
7552; GFX9-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8]
7553; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v8, vcc
7554; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v7, vcc
7555; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
7556; GFX9-NEXT:    s_waitcnt vmcnt(0)
7557; GFX9-NEXT:    buffer_wbinvl1_vol
7558; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
7559; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7560; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7561; GFX9-NEXT:    s_cbranch_execnz .LBB113_1
7562; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7563; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
7564; GFX9-NEXT:    s_setpc_b64 s[30:31]
; The atomicrmw result is the function's return value.
7565  %result = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst
7566  ret i64 %result
7567}
7568
; Unsigned-min atomicrmw that returns the value produced by the atomic, on a
; pointer advanced by 4 x i64 (32 bytes). Uses the amdgpu_gfx calling
; convention with both arguments marked inreg; the checks read the pointer
; from s[4:5] and the operand from s[6:7].
; All three prefixes expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end) whose cmpswap result lands in v[0:1].
; In the SI checks s6/s7 are overwritten to build the s[4:7] buffer
; descriptor, so they are saved to lanes of v10 and restored after the loop.
; The SI/VI/GFX9 lines are autogenerated: regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
7569define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
7570; SI-LABEL: global_atomic_umin_i64_ret_offset_scalar:
7571; SI:       ; %bb.0:
7572; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7573; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7574; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
7575; SI-NEXT:    s_mov_b64 exec, s[34:35]
7576; SI-NEXT:    s_waitcnt expcnt(0)
7577; SI-NEXT:    v_writelane_b32 v10, s6, 0
7578; SI-NEXT:    v_writelane_b32 v10, s7, 1
7579; SI-NEXT:    s_mov_b32 s35, s7
7580; SI-NEXT:    s_mov_b32 s34, s6
7581; SI-NEXT:    s_mov_b32 s7, 0xf000
7582; SI-NEXT:    s_mov_b32 s6, -1
7583; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
7584; SI-NEXT:    s_mov_b64 s[36:37], 0
7585; SI-NEXT:    v_mov_b32_e32 v4, s35
7586; SI-NEXT:    v_mov_b32_e32 v5, s34
7587; SI-NEXT:  .LBB114_1: ; %atomicrmw.start
7588; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7589; SI-NEXT:    s_waitcnt vmcnt(0)
7590; SI-NEXT:    v_mov_b32_e32 v9, v1
7591; SI-NEXT:    v_mov_b32_e32 v8, v0
7592; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
7593; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
7594; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
7595; SI-NEXT:    s_waitcnt expcnt(0)
7596; SI-NEXT:    v_mov_b32_e32 v0, v6
7597; SI-NEXT:    v_mov_b32_e32 v1, v7
7598; SI-NEXT:    v_mov_b32_e32 v2, v8
7599; SI-NEXT:    v_mov_b32_e32 v3, v9
7600; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
7601; SI-NEXT:    s_waitcnt vmcnt(0)
7602; SI-NEXT:    buffer_wbinvl1
7603; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
7604; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
7605; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
7606; SI-NEXT:    s_cbranch_execnz .LBB114_1
7607; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7608; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
7609; SI-NEXT:    v_readlane_b32 s7, v10, 1
7610; SI-NEXT:    v_readlane_b32 s6, v10, 0
7611; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
7612; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
7613; SI-NEXT:    s_mov_b64 exec, s[34:35]
7614; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
7615; SI-NEXT:    s_setpc_b64 s[30:31]
7616;
7617; VI-LABEL: global_atomic_umin_i64_ret_offset_scalar:
7618; VI:       ; %bb.0:
7619; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7620; VI-NEXT:    s_add_u32 s34, s4, 32
7621; VI-NEXT:    s_addc_u32 s35, s5, 0
7622; VI-NEXT:    v_mov_b32_e32 v2, s34
7623; VI-NEXT:    v_mov_b32_e32 v3, s35
7624; VI-NEXT:    flat_load_dwordx2 v[0:1], v[2:3]
7625; VI-NEXT:    s_mov_b64 s[34:35], 0
7626; VI-NEXT:    v_mov_b32_e32 v4, s7
7627; VI-NEXT:    v_mov_b32_e32 v5, s6
7628; VI-NEXT:  .LBB114_1: ; %atomicrmw.start
7629; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7630; VI-NEXT:    s_waitcnt vmcnt(0)
7631; VI-NEXT:    v_mov_b32_e32 v9, v1
7632; VI-NEXT:    v_mov_b32_e32 v8, v0
7633; VI-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
7634; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
7635; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
7636; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
7637; VI-NEXT:    s_waitcnt vmcnt(0)
7638; VI-NEXT:    buffer_wbinvl1_vol
7639; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
7640; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7641; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7642; VI-NEXT:    s_cbranch_execnz .LBB114_1
7643; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7644; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
7645; VI-NEXT:    s_setpc_b64 s[30:31]
7646;
7647; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar:
7648; GFX9:       ; %bb.0:
7649; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7650; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7651; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
7652; GFX9-NEXT:    s_mov_b64 s[34:35], 0
7653; GFX9-NEXT:    v_mov_b32_e32 v3, s7
7654; GFX9-NEXT:    v_mov_b32_e32 v4, s6
7655; GFX9-NEXT:  .LBB114_1: ; %atomicrmw.start
7656; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7657; GFX9-NEXT:    s_waitcnt vmcnt(0)
7658; GFX9-NEXT:    v_mov_b32_e32 v8, v1
7659; GFX9-NEXT:    v_mov_b32_e32 v7, v0
7660; GFX9-NEXT:    v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8]
7661; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v8, vcc
7662; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v7, vcc
7663; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
7664; GFX9-NEXT:    s_waitcnt vmcnt(0)
7665; GFX9-NEXT:    buffer_wbinvl1_vol
7666; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
7667; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
7668; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
7669; GFX9-NEXT:    s_cbranch_execnz .LBB114_1
7670; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7671; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
7672; GFX9-NEXT:    s_setpc_b64 s[30:31]
; i64 element index 4 is byte offset 32: the "offset:32" (SI/GFX9) and the
; explicit s_add_u32 ..., 32 (VI) in the checks above.
7673  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
; The atomicrmw result is the function's return value.
7674  %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst
7675  ret i64 %result
7676}
7677
; Unsigned-min atomicrmw whose result is unused, on a pointer advanced by
; 4 x i64 (32 bytes), with !amdgpu.no.remote.memory metadata attached to the
; atomic. Default calling convention: the checks take the pointer from v[0:1]
; and the operand from v[2:3].
; Even with the metadata, all three prefixes still expect a compare-and-swap
; retry loop (%atomicrmw.start .. %atomicrmw.end).
; NOTE(review): !0 is defined outside this part of the file - confirm its
; contents there.
; The SI/VI/GFX9 lines are autogenerated: regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
7678define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
7679; SI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
7680; SI:       ; %bb.0:
7681; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7682; SI-NEXT:    s_mov_b32 s6, 0
7683; SI-NEXT:    s_mov_b32 s7, 0xf000
7684; SI-NEXT:    s_mov_b32 s4, s6
7685; SI-NEXT:    s_mov_b32 s5, s6
7686; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
7687; SI-NEXT:    s_mov_b64 s[8:9], 0
7688; SI-NEXT:  .LBB115_1: ; %atomicrmw.start
7689; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7690; SI-NEXT:    s_waitcnt vmcnt(0)
7691; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
7692; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7693; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7694; SI-NEXT:    s_waitcnt expcnt(0)
7695; SI-NEXT:    v_mov_b32_e32 v11, v7
7696; SI-NEXT:    v_mov_b32_e32 v10, v6
7697; SI-NEXT:    v_mov_b32_e32 v9, v5
7698; SI-NEXT:    v_mov_b32_e32 v8, v4
7699; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
7700; SI-NEXT:    s_waitcnt vmcnt(0)
7701; SI-NEXT:    buffer_wbinvl1
7702; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
7703; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7704; SI-NEXT:    v_mov_b32_e32 v6, v8
7705; SI-NEXT:    v_mov_b32_e32 v7, v9
7706; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7707; SI-NEXT:    s_cbranch_execnz .LBB115_1
7708; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7709; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7710; SI-NEXT:    s_waitcnt expcnt(0)
7711; SI-NEXT:    s_setpc_b64 s[30:31]
7712;
7713; VI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
7714; VI:       ; %bb.0:
7715; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7716; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
7717; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7718; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
7719; VI-NEXT:    s_mov_b64 s[4:5], 0
7720; VI-NEXT:  .LBB115_1: ; %atomicrmw.start
7721; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7722; VI-NEXT:    s_waitcnt vmcnt(0)
7723; VI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
7724; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7725; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7726; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7727; VI-NEXT:    s_waitcnt vmcnt(0)
7728; VI-NEXT:    buffer_wbinvl1_vol
7729; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7730; VI-NEXT:    v_mov_b32_e32 v7, v5
7731; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7732; VI-NEXT:    v_mov_b32_e32 v6, v4
7733; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7734; VI-NEXT:    s_cbranch_execnz .LBB115_1
7735; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7736; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7737; VI-NEXT:    s_setpc_b64 s[30:31]
7738;
7739; GFX9-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
7740; GFX9:       ; %bb.0:
7741; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7742; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
7743; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7744; GFX9-NEXT:  .LBB115_1: ; %atomicrmw.start
7745; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7746; GFX9-NEXT:    s_waitcnt vmcnt(0)
7747; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
7748; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7749; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7750; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
7751; GFX9-NEXT:    s_waitcnt vmcnt(0)
7752; GFX9-NEXT:    buffer_wbinvl1_vol
7753; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7754; GFX9-NEXT:    v_mov_b32_e32 v7, v5
7755; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7756; GFX9-NEXT:    v_mov_b32_e32 v6, v4
7757; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7758; GFX9-NEXT:    s_cbranch_execnz .LBB115_1
7759; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7760; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7761; GFX9-NEXT:    s_setpc_b64 s[30:31]
; i64 element index 4 is byte offset 32: the "offset:32" (SI/GFX9) and the
; explicit v_add_u32 ..., 32 (VI) in the checks above.
7762  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
; %tmp0 has no users; the function returns void.
7763  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
7764  ret void
7765}
7766
; Unsigned-min atomicrmw that returns the value produced by the atomic, on a
; pointer advanced by 4 x i64 (32 bytes), with !amdgpu.no.remote.memory
; metadata attached to the atomic. Default calling convention: pointer
; arrives in v[0:1], operand in v[2:3].
; Even with the metadata, all three prefixes still expect a compare-and-swap
; retry loop (%atomicrmw.start .. %atomicrmw.end). SI and VI leave the
; cmpswap result in v[0:1]; GFX9 produces it in v[4:5] and copies it to
; v[0:1] after the loop.
; NOTE(review): !0 is defined outside this part of the file - confirm its
; contents there.
; The SI/VI/GFX9 lines are autogenerated: regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
7767define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
7768; SI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
7769; SI:       ; %bb.0:
7770; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7771; SI-NEXT:    v_mov_b32_e32 v5, v3
7772; SI-NEXT:    v_mov_b32_e32 v4, v2
7773; SI-NEXT:    v_mov_b32_e32 v7, v1
7774; SI-NEXT:    v_mov_b32_e32 v6, v0
7775; SI-NEXT:    s_mov_b32 s6, 0
7776; SI-NEXT:    s_mov_b32 s7, 0xf000
7777; SI-NEXT:    s_mov_b32 s4, s6
7778; SI-NEXT:    s_mov_b32 s5, s6
7779; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
7780; SI-NEXT:    s_mov_b64 s[8:9], 0
7781; SI-NEXT:  .LBB116_1: ; %atomicrmw.start
7782; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7783; SI-NEXT:    s_waitcnt vmcnt(0)
7784; SI-NEXT:    v_mov_b32_e32 v11, v1
7785; SI-NEXT:    v_mov_b32_e32 v10, v0
7786; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[10:11], v[4:5]
7787; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
7788; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
7789; SI-NEXT:    s_waitcnt expcnt(0)
7790; SI-NEXT:    v_mov_b32_e32 v0, v8
7791; SI-NEXT:    v_mov_b32_e32 v1, v9
7792; SI-NEXT:    v_mov_b32_e32 v2, v10
7793; SI-NEXT:    v_mov_b32_e32 v3, v11
7794; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
7795; SI-NEXT:    s_waitcnt vmcnt(0)
7796; SI-NEXT:    buffer_wbinvl1
7797; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
7798; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7799; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7800; SI-NEXT:    s_cbranch_execnz .LBB116_1
7801; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7802; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7803; SI-NEXT:    s_waitcnt expcnt(0)
7804; SI-NEXT:    s_setpc_b64 s[30:31]
7805;
7806; VI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
7807; VI:       ; %bb.0:
7808; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7809; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
7810; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
7811; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
7812; VI-NEXT:    s_mov_b64 s[4:5], 0
7813; VI-NEXT:  .LBB116_1: ; %atomicrmw.start
7814; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7815; VI-NEXT:    s_waitcnt vmcnt(0)
7816; VI-NEXT:    v_mov_b32_e32 v9, v1
7817; VI-NEXT:    v_mov_b32_e32 v8, v0
7818; VI-NEXT:    v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
7819; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
7820; VI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
7821; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
7822; VI-NEXT:    s_waitcnt vmcnt(0)
7823; VI-NEXT:    buffer_wbinvl1_vol
7824; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
7825; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7826; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7827; VI-NEXT:    s_cbranch_execnz .LBB116_1
7828; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7829; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7830; VI-NEXT:    s_setpc_b64 s[30:31]
7831;
7832; GFX9-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
7833; GFX9:       ; %bb.0:
7834; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7835; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
7836; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7837; GFX9-NEXT:  .LBB116_1: ; %atomicrmw.start
7838; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7839; GFX9-NEXT:    s_waitcnt vmcnt(0)
7840; GFX9-NEXT:    v_mov_b32_e32 v7, v5
7841; GFX9-NEXT:    v_mov_b32_e32 v6, v4
7842; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
7843; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7844; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7845; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
7846; GFX9-NEXT:    s_waitcnt vmcnt(0)
7847; GFX9-NEXT:    buffer_wbinvl1_vol
7848; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7849; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7850; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7851; GFX9-NEXT:    s_cbranch_execnz .LBB116_1
7852; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7853; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7854; GFX9-NEXT:    v_mov_b32_e32 v0, v4
7855; GFX9-NEXT:    v_mov_b32_e32 v1, v5
7856; GFX9-NEXT:    s_setpc_b64 s[30:31]
; i64 element index 4 is byte offset 32: the "offset:32" (SI/GFX9) and the
; explicit v_add_u32 ..., 32 (VI) in the checks above.
7857  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
; The atomicrmw result is the function's return value.
7858  %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
7859  ret i64 %result
7860}
7861
7862; ---------------------------------------------------------------------
7863; atomicrmw min
7864; ---------------------------------------------------------------------
7865
; Signed-min atomicrmw whose result is unused, applied directly to %ptr (no
; offset). Default calling convention: the checks take the pointer from
; v[0:1] and the operand from v[2:3].
; All three prefixes expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end). The value selection uses a signed
; compare (v_cmp_le_i64), while the loop-exit test compares the cmpswap
; result with the expected value using v_cmp_eq_u64.
; The SI/VI/GFX9 lines are autogenerated: regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
7866define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
7867; SI-LABEL: global_atomic_min_i64_noret:
7868; SI:       ; %bb.0:
7869; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7870; SI-NEXT:    s_mov_b32 s6, 0
7871; SI-NEXT:    s_mov_b32 s7, 0xf000
7872; SI-NEXT:    s_mov_b32 s4, s6
7873; SI-NEXT:    s_mov_b32 s5, s6
7874; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
7875; SI-NEXT:    s_mov_b64 s[8:9], 0
7876; SI-NEXT:  .LBB117_1: ; %atomicrmw.start
7877; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7878; SI-NEXT:    s_waitcnt vmcnt(0)
7879; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7880; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7881; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7882; SI-NEXT:    s_waitcnt expcnt(0)
7883; SI-NEXT:    v_mov_b32_e32 v11, v7
7884; SI-NEXT:    v_mov_b32_e32 v10, v6
7885; SI-NEXT:    v_mov_b32_e32 v9, v5
7886; SI-NEXT:    v_mov_b32_e32 v8, v4
7887; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
7888; SI-NEXT:    s_waitcnt vmcnt(0)
7889; SI-NEXT:    buffer_wbinvl1
7890; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
7891; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7892; SI-NEXT:    v_mov_b32_e32 v6, v8
7893; SI-NEXT:    v_mov_b32_e32 v7, v9
7894; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7895; SI-NEXT:    s_cbranch_execnz .LBB117_1
7896; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7897; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7898; SI-NEXT:    s_waitcnt expcnt(0)
7899; SI-NEXT:    s_setpc_b64 s[30:31]
7900;
7901; VI-LABEL: global_atomic_min_i64_noret:
7902; VI:       ; %bb.0:
7903; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7904; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
7905; VI-NEXT:    s_mov_b64 s[4:5], 0
7906; VI-NEXT:  .LBB117_1: ; %atomicrmw.start
7907; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7908; VI-NEXT:    s_waitcnt vmcnt(0)
7909; VI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7910; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7911; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7912; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7913; VI-NEXT:    s_waitcnt vmcnt(0)
7914; VI-NEXT:    buffer_wbinvl1_vol
7915; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7916; VI-NEXT:    v_mov_b32_e32 v7, v5
7917; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7918; VI-NEXT:    v_mov_b32_e32 v6, v4
7919; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7920; VI-NEXT:    s_cbranch_execnz .LBB117_1
7921; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
7922; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
7923; VI-NEXT:    s_setpc_b64 s[30:31]
7924;
7925; GFX9-LABEL: global_atomic_min_i64_noret:
7926; GFX9:       ; %bb.0:
7927; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7928; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
7929; GFX9-NEXT:    s_mov_b64 s[4:5], 0
7930; GFX9-NEXT:  .LBB117_1: ; %atomicrmw.start
7931; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
7932; GFX9-NEXT:    s_waitcnt vmcnt(0)
7933; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7934; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7935; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7936; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
7937; GFX9-NEXT:    s_waitcnt vmcnt(0)
7938; GFX9-NEXT:    buffer_wbinvl1_vol
7939; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7940; GFX9-NEXT:    v_mov_b32_e32 v7, v5
7941; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7942; GFX9-NEXT:    v_mov_b32_e32 v6, v4
7943; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7944; GFX9-NEXT:    s_cbranch_execnz .LBB117_1
7945; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
7946; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
7947; GFX9-NEXT:    s_setpc_b64 s[30:31]
; %tmp0 has no users; the function returns void.
7948  %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst
7949  ret void
7950}
7951
; Signed-min atomicrmw whose result is unused, on a pointer advanced by
; 4 x i64 (32 bytes). Default calling convention: the checks take the
; pointer from v[0:1] and the operand from v[2:3].
; All three prefixes expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end). The value selection uses a signed
; compare (v_cmp_le_i64), while the loop-exit test compares the cmpswap
; result with the expected value using v_cmp_eq_u64.
; The SI/VI/GFX9 lines are autogenerated: regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
7952define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
7953; SI-LABEL: global_atomic_min_i64_noret_offset:
7954; SI:       ; %bb.0:
7955; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7956; SI-NEXT:    s_mov_b32 s6, 0
7957; SI-NEXT:    s_mov_b32 s7, 0xf000
7958; SI-NEXT:    s_mov_b32 s4, s6
7959; SI-NEXT:    s_mov_b32 s5, s6
7960; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
7961; SI-NEXT:    s_mov_b64 s[8:9], 0
7962; SI-NEXT:  .LBB118_1: ; %atomicrmw.start
7963; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
7964; SI-NEXT:    s_waitcnt vmcnt(0)
7965; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7966; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7967; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
7968; SI-NEXT:    s_waitcnt expcnt(0)
7969; SI-NEXT:    v_mov_b32_e32 v11, v7
7970; SI-NEXT:    v_mov_b32_e32 v10, v6
7971; SI-NEXT:    v_mov_b32_e32 v9, v5
7972; SI-NEXT:    v_mov_b32_e32 v8, v4
7973; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
7974; SI-NEXT:    s_waitcnt vmcnt(0)
7975; SI-NEXT:    buffer_wbinvl1
7976; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
7977; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
7978; SI-NEXT:    v_mov_b32_e32 v6, v8
7979; SI-NEXT:    v_mov_b32_e32 v7, v9
7980; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
7981; SI-NEXT:    s_cbranch_execnz .LBB118_1
7982; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
7983; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
7984; SI-NEXT:    s_waitcnt expcnt(0)
7985; SI-NEXT:    s_setpc_b64 s[30:31]
7986;
7987; VI-LABEL: global_atomic_min_i64_noret_offset:
7988; VI:       ; %bb.0:
7989; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7990; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
7991; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7992; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
7993; VI-NEXT:    s_mov_b64 s[4:5], 0
7994; VI-NEXT:  .LBB118_1: ; %atomicrmw.start
7995; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
7996; VI-NEXT:    s_waitcnt vmcnt(0)
7997; VI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7998; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
7999; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
8000; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
8001; VI-NEXT:    s_waitcnt vmcnt(0)
8002; VI-NEXT:    buffer_wbinvl1_vol
8003; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
8004; VI-NEXT:    v_mov_b32_e32 v7, v5
8005; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8006; VI-NEXT:    v_mov_b32_e32 v6, v4
8007; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8008; VI-NEXT:    s_cbranch_execnz .LBB118_1
8009; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8010; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
8011; VI-NEXT:    s_setpc_b64 s[30:31]
8012;
8013; GFX9-LABEL: global_atomic_min_i64_noret_offset:
8014; GFX9:       ; %bb.0:
8015; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8016; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
8017; GFX9-NEXT:    s_mov_b64 s[4:5], 0
8018; GFX9-NEXT:  .LBB118_1: ; %atomicrmw.start
8019; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8020; GFX9-NEXT:    s_waitcnt vmcnt(0)
8021; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
8022; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
8023; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
8024; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
8025; GFX9-NEXT:    s_waitcnt vmcnt(0)
8026; GFX9-NEXT:    buffer_wbinvl1_vol
8027; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
8028; GFX9-NEXT:    v_mov_b32_e32 v7, v5
8029; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8030; GFX9-NEXT:    v_mov_b32_e32 v6, v4
8031; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8032; GFX9-NEXT:    s_cbranch_execnz .LBB118_1
8033; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8034; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
8035; GFX9-NEXT:    s_setpc_b64 s[30:31]
; i64 element index 4 is byte offset 32: the "offset:32" (SI/GFX9) and the
; explicit v_add_u32 ..., 32 (VI) in the checks above.
8036  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
; %tmp0 has no users; the function returns void.
8037  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
8038  ret void
8039}
8040
; Signed i64 `atomicrmw min` with seq_cst ordering on a global (addrspace(1))
; pointer; the value returned by the atomicrmw is returned to the caller.
; For every check prefix the expected code is a retry loop (%atomicrmw.start):
; pick the smaller of the previously loaded value and %in with v_cmp_le_i64 +
; v_cndmask, issue a 64-bit cmpswap with glc, and branch back while exec is
; still non-zero after masking off the lanes whose cmpswap result matched.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
8041define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
8042; SI-LABEL: global_atomic_min_i64_ret:
8043; SI:       ; %bb.0:
8044; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8045; SI-NEXT:    v_mov_b32_e32 v5, v3
8046; SI-NEXT:    v_mov_b32_e32 v4, v2
8047; SI-NEXT:    v_mov_b32_e32 v7, v1
8048; SI-NEXT:    v_mov_b32_e32 v6, v0
8049; SI-NEXT:    s_mov_b32 s6, 0
8050; SI-NEXT:    s_mov_b32 s7, 0xf000
8051; SI-NEXT:    s_mov_b32 s4, s6
8052; SI-NEXT:    s_mov_b32 s5, s6
8053; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
8054; SI-NEXT:    s_mov_b64 s[8:9], 0
8055; SI-NEXT:  .LBB119_1: ; %atomicrmw.start
8056; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8057; SI-NEXT:    s_waitcnt vmcnt(0)
8058; SI-NEXT:    v_mov_b32_e32 v11, v1
8059; SI-NEXT:    v_mov_b32_e32 v10, v0
8060; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[10:11], v[4:5]
8061; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
8062; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
8063; SI-NEXT:    s_waitcnt expcnt(0)
8064; SI-NEXT:    v_mov_b32_e32 v0, v8
8065; SI-NEXT:    v_mov_b32_e32 v1, v9
8066; SI-NEXT:    v_mov_b32_e32 v2, v10
8067; SI-NEXT:    v_mov_b32_e32 v3, v11
8068; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
8069; SI-NEXT:    s_waitcnt vmcnt(0)
8070; SI-NEXT:    buffer_wbinvl1
8071; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
8072; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
8073; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
8074; SI-NEXT:    s_cbranch_execnz .LBB119_1
8075; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8076; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
8077; SI-NEXT:    s_waitcnt expcnt(0)
8078; SI-NEXT:    s_setpc_b64 s[30:31]
8079;
8080; VI-LABEL: global_atomic_min_i64_ret:
8081; VI:       ; %bb.0:
8082; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8083; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
8084; VI-NEXT:    s_mov_b64 s[4:5], 0
8085; VI-NEXT:  .LBB119_1: ; %atomicrmw.start
8086; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8087; VI-NEXT:    s_waitcnt vmcnt(0)
8088; VI-NEXT:    v_mov_b32_e32 v7, v5
8089; VI-NEXT:    v_mov_b32_e32 v6, v4
8090; VI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
8091; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
8092; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
8093; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
8094; VI-NEXT:    s_waitcnt vmcnt(0)
8095; VI-NEXT:    buffer_wbinvl1_vol
8096; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
8097; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8098; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8099; VI-NEXT:    s_cbranch_execnz .LBB119_1
8100; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8101; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
8102; VI-NEXT:    v_mov_b32_e32 v0, v4
8103; VI-NEXT:    v_mov_b32_e32 v1, v5
8104; VI-NEXT:    s_setpc_b64 s[30:31]
8105;
8106; GFX9-LABEL: global_atomic_min_i64_ret:
8107; GFX9:       ; %bb.0:
8108; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8109; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
8110; GFX9-NEXT:    s_mov_b64 s[4:5], 0
8111; GFX9-NEXT:  .LBB119_1: ; %atomicrmw.start
8112; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8113; GFX9-NEXT:    s_waitcnt vmcnt(0)
8114; GFX9-NEXT:    v_mov_b32_e32 v7, v5
8115; GFX9-NEXT:    v_mov_b32_e32 v6, v4
8116; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
8117; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
8118; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
8119; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
8120; GFX9-NEXT:    s_waitcnt vmcnt(0)
8121; GFX9-NEXT:    buffer_wbinvl1_vol
8122; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
8123; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8124; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8125; GFX9-NEXT:    s_cbranch_execnz .LBB119_1
8126; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8127; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
8128; GFX9-NEXT:    v_mov_b32_e32 v0, v4
8129; GFX9-NEXT:    v_mov_b32_e32 v1, v5
8130; GFX9-NEXT:    s_setpc_b64 s[30:31]
8131  %result = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst
8132  ret i64 %result
8133}
8134
; Same operation as a returning signed i64 `atomicrmw min` (seq_cst), but the
; address is %out advanced by 4 i64 elements, i.e. a 32-byte displacement.
; In the SI and GFX9 assertions that displacement appears as `offset:32` on
; both the initial load and the cmpswap; in the VI assertions the address is
; computed up front with v_add_u32 / v_addc_u32 and the flat instructions carry
; no offset.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
8135define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
8136; SI-LABEL: global_atomic_min_i64_ret_offset:
8137; SI:       ; %bb.0:
8138; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8139; SI-NEXT:    v_mov_b32_e32 v5, v3
8140; SI-NEXT:    v_mov_b32_e32 v4, v2
8141; SI-NEXT:    v_mov_b32_e32 v7, v1
8142; SI-NEXT:    v_mov_b32_e32 v6, v0
8143; SI-NEXT:    s_mov_b32 s6, 0
8144; SI-NEXT:    s_mov_b32 s7, 0xf000
8145; SI-NEXT:    s_mov_b32 s4, s6
8146; SI-NEXT:    s_mov_b32 s5, s6
8147; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
8148; SI-NEXT:    s_mov_b64 s[8:9], 0
8149; SI-NEXT:  .LBB120_1: ; %atomicrmw.start
8150; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8151; SI-NEXT:    s_waitcnt vmcnt(0)
8152; SI-NEXT:    v_mov_b32_e32 v11, v1
8153; SI-NEXT:    v_mov_b32_e32 v10, v0
8154; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[10:11], v[4:5]
8155; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
8156; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
8157; SI-NEXT:    s_waitcnt expcnt(0)
8158; SI-NEXT:    v_mov_b32_e32 v0, v8
8159; SI-NEXT:    v_mov_b32_e32 v1, v9
8160; SI-NEXT:    v_mov_b32_e32 v2, v10
8161; SI-NEXT:    v_mov_b32_e32 v3, v11
8162; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
8163; SI-NEXT:    s_waitcnt vmcnt(0)
8164; SI-NEXT:    buffer_wbinvl1
8165; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
8166; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
8167; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
8168; SI-NEXT:    s_cbranch_execnz .LBB120_1
8169; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8170; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
8171; SI-NEXT:    s_waitcnt expcnt(0)
8172; SI-NEXT:    s_setpc_b64 s[30:31]
8173;
8174; VI-LABEL: global_atomic_min_i64_ret_offset:
8175; VI:       ; %bb.0:
8176; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8177; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
8178; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
8179; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
8180; VI-NEXT:    s_mov_b64 s[4:5], 0
8181; VI-NEXT:  .LBB120_1: ; %atomicrmw.start
8182; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8183; VI-NEXT:    s_waitcnt vmcnt(0)
8184; VI-NEXT:    v_mov_b32_e32 v9, v1
8185; VI-NEXT:    v_mov_b32_e32 v8, v0
8186; VI-NEXT:    v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
8187; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
8188; VI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
8189; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
8190; VI-NEXT:    s_waitcnt vmcnt(0)
8191; VI-NEXT:    buffer_wbinvl1_vol
8192; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
8193; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8194; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8195; VI-NEXT:    s_cbranch_execnz .LBB120_1
8196; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8197; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
8198; VI-NEXT:    s_setpc_b64 s[30:31]
8199;
8200; GFX9-LABEL: global_atomic_min_i64_ret_offset:
8201; GFX9:       ; %bb.0:
8202; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8203; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
8204; GFX9-NEXT:    s_mov_b64 s[4:5], 0
8205; GFX9-NEXT:  .LBB120_1: ; %atomicrmw.start
8206; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8207; GFX9-NEXT:    s_waitcnt vmcnt(0)
8208; GFX9-NEXT:    v_mov_b32_e32 v7, v5
8209; GFX9-NEXT:    v_mov_b32_e32 v6, v4
8210; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
8211; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
8212; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
8213; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
8214; GFX9-NEXT:    s_waitcnt vmcnt(0)
8215; GFX9-NEXT:    buffer_wbinvl1_vol
8216; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
8217; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8218; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8219; GFX9-NEXT:    s_cbranch_execnz .LBB120_1
8220; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8221; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
8222; GFX9-NEXT:    v_mov_b32_e32 v0, v4
8223; GFX9-NEXT:    v_mov_b32_e32 v1, v5
8224; GFX9-NEXT:    s_setpc_b64 s[30:31]
8225  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
8226  %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
8227  ret i64 %result
8228}
8229
; Signed i64 `atomicrmw min` (seq_cst) on a global pointer, using the
; amdgpu_gfx calling convention with both arguments marked inreg; the result of
; the atomicrmw is unused and the function returns void.
; In the assertions the pointer is read from s[4:5] and %in from s[6:7]; %in is
; copied into VGPRs before the loop so v_cndmask can select it, and the
; comparison is v_cmp_ge_i64 with the scalar pair as the first operand.
; In the SI assertions s6/s7 are additionally written to lanes of v10 and
; copied to s[34:35], because s6/s7 are then overwritten (-1 / 0xf000) and
; s[4:7] is passed as the resource operand of the buffer instructions; they are
; read back with v_readlane before returning.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
8230define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
8231; SI-LABEL: global_atomic_min_i64_noret_scalar:
8232; SI:       ; %bb.0:
8233; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8234; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8235; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
8236; SI-NEXT:    s_mov_b64 exec, s[34:35]
8237; SI-NEXT:    s_waitcnt expcnt(0)
8238; SI-NEXT:    v_writelane_b32 v10, s6, 0
8239; SI-NEXT:    v_writelane_b32 v10, s7, 1
8240; SI-NEXT:    s_mov_b32 s35, s7
8241; SI-NEXT:    s_mov_b32 s34, s6
8242; SI-NEXT:    s_mov_b32 s7, 0xf000
8243; SI-NEXT:    s_mov_b32 s6, -1
8244; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
8245; SI-NEXT:    s_mov_b64 s[36:37], 0
8246; SI-NEXT:    v_mov_b32_e32 v4, s35
8247; SI-NEXT:    v_mov_b32_e32 v5, s34
8248; SI-NEXT:  .LBB121_1: ; %atomicrmw.start
8249; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8250; SI-NEXT:    s_waitcnt vmcnt(0)
8251; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3]
8252; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
8253; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
8254; SI-NEXT:    s_waitcnt expcnt(0)
8255; SI-NEXT:    v_mov_b32_e32 v9, v3
8256; SI-NEXT:    v_mov_b32_e32 v8, v2
8257; SI-NEXT:    v_mov_b32_e32 v7, v1
8258; SI-NEXT:    v_mov_b32_e32 v6, v0
8259; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
8260; SI-NEXT:    s_waitcnt vmcnt(0)
8261; SI-NEXT:    buffer_wbinvl1
8262; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
8263; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
8264; SI-NEXT:    v_mov_b32_e32 v2, v6
8265; SI-NEXT:    v_mov_b32_e32 v3, v7
8266; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
8267; SI-NEXT:    s_cbranch_execnz .LBB121_1
8268; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8269; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
8270; SI-NEXT:    v_readlane_b32 s7, v10, 1
8271; SI-NEXT:    v_readlane_b32 s6, v10, 0
8272; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8273; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
8274; SI-NEXT:    s_mov_b64 exec, s[34:35]
8275; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
8276; SI-NEXT:    s_setpc_b64 s[30:31]
8277;
8278; VI-LABEL: global_atomic_min_i64_noret_scalar:
8279; VI:       ; %bb.0:
8280; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8281; VI-NEXT:    v_mov_b32_e32 v0, s4
8282; VI-NEXT:    v_mov_b32_e32 v1, s5
8283; VI-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
8284; VI-NEXT:    v_mov_b32_e32 v4, s4
8285; VI-NEXT:    s_mov_b64 s[34:35], 0
8286; VI-NEXT:    v_mov_b32_e32 v6, s7
8287; VI-NEXT:    v_mov_b32_e32 v7, s6
8288; VI-NEXT:    v_mov_b32_e32 v5, s5
8289; VI-NEXT:  .LBB121_1: ; %atomicrmw.start
8290; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8291; VI-NEXT:    s_waitcnt vmcnt(0)
8292; VI-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
8293; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
8294; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
8295; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
8296; VI-NEXT:    s_waitcnt vmcnt(0)
8297; VI-NEXT:    buffer_wbinvl1_vol
8298; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8299; VI-NEXT:    v_mov_b32_e32 v3, v1
8300; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
8301; VI-NEXT:    v_mov_b32_e32 v2, v0
8302; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
8303; VI-NEXT:    s_cbranch_execnz .LBB121_1
8304; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8305; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
8306; VI-NEXT:    s_setpc_b64 s[30:31]
8307;
8308; GFX9-LABEL: global_atomic_min_i64_noret_scalar:
8309; GFX9:       ; %bb.0:
8310; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8311; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8312; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
8313; GFX9-NEXT:    s_mov_b64 s[34:35], 0
8314; GFX9-NEXT:    v_mov_b32_e32 v5, s7
8315; GFX9-NEXT:    v_mov_b32_e32 v6, s6
8316; GFX9-NEXT:  .LBB121_1: ; %atomicrmw.start
8317; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8318; GFX9-NEXT:    s_waitcnt vmcnt(0)
8319; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
8320; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
8321; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
8322; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
8323; GFX9-NEXT:    s_waitcnt vmcnt(0)
8324; GFX9-NEXT:    buffer_wbinvl1_vol
8325; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8326; GFX9-NEXT:    v_mov_b32_e32 v3, v1
8327; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
8328; GFX9-NEXT:    v_mov_b32_e32 v2, v0
8329; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
8330; GFX9-NEXT:    s_cbranch_execnz .LBB121_1
8331; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8332; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
8333; GFX9-NEXT:    s_setpc_b64 s[30:31]
8334  %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst
8335  ret void
8336}
8337
; Signed i64 `atomicrmw min` (seq_cst) with inreg arguments under the
; amdgpu_gfx calling convention; the address is %out advanced by 4 i64 elements
; (32 bytes) and the atomicrmw result is unused.
; In the SI and GFX9 assertions the displacement is carried as `offset:32` on
; the load and the cmpswap; in the VI assertions it is added to s[4:5] with
; s_add_u32 / s_addc_u32 and the sum is moved into v[4:5] as the flat address.
; As in the other SI inreg variants visible in the assertions, s6/s7 are kept
; in lanes of v10 while s[4:7] serves as the buffer resource operand.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
8338define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
8339; SI-LABEL: global_atomic_min_i64_noret_offset_scalar:
8340; SI:       ; %bb.0:
8341; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8342; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8343; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
8344; SI-NEXT:    s_mov_b64 exec, s[34:35]
8345; SI-NEXT:    s_waitcnt expcnt(0)
8346; SI-NEXT:    v_writelane_b32 v10, s6, 0
8347; SI-NEXT:    v_writelane_b32 v10, s7, 1
8348; SI-NEXT:    s_mov_b32 s35, s7
8349; SI-NEXT:    s_mov_b32 s34, s6
8350; SI-NEXT:    s_mov_b32 s7, 0xf000
8351; SI-NEXT:    s_mov_b32 s6, -1
8352; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
8353; SI-NEXT:    s_mov_b64 s[36:37], 0
8354; SI-NEXT:    v_mov_b32_e32 v4, s35
8355; SI-NEXT:    v_mov_b32_e32 v5, s34
8356; SI-NEXT:  .LBB122_1: ; %atomicrmw.start
8357; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8358; SI-NEXT:    s_waitcnt vmcnt(0)
8359; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3]
8360; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
8361; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
8362; SI-NEXT:    s_waitcnt expcnt(0)
8363; SI-NEXT:    v_mov_b32_e32 v9, v3
8364; SI-NEXT:    v_mov_b32_e32 v8, v2
8365; SI-NEXT:    v_mov_b32_e32 v7, v1
8366; SI-NEXT:    v_mov_b32_e32 v6, v0
8367; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
8368; SI-NEXT:    s_waitcnt vmcnt(0)
8369; SI-NEXT:    buffer_wbinvl1
8370; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
8371; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
8372; SI-NEXT:    v_mov_b32_e32 v2, v6
8373; SI-NEXT:    v_mov_b32_e32 v3, v7
8374; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
8375; SI-NEXT:    s_cbranch_execnz .LBB122_1
8376; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8377; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
8378; SI-NEXT:    v_readlane_b32 s7, v10, 1
8379; SI-NEXT:    v_readlane_b32 s6, v10, 0
8380; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8381; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
8382; SI-NEXT:    s_mov_b64 exec, s[34:35]
8383; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
8384; SI-NEXT:    s_setpc_b64 s[30:31]
8385;
8386; VI-LABEL: global_atomic_min_i64_noret_offset_scalar:
8387; VI:       ; %bb.0:
8388; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8389; VI-NEXT:    s_add_u32 s34, s4, 32
8390; VI-NEXT:    s_addc_u32 s35, s5, 0
8391; VI-NEXT:    v_mov_b32_e32 v4, s34
8392; VI-NEXT:    v_mov_b32_e32 v5, s35
8393; VI-NEXT:    flat_load_dwordx2 v[2:3], v[4:5]
8394; VI-NEXT:    s_mov_b64 s[34:35], 0
8395; VI-NEXT:    v_mov_b32_e32 v6, s7
8396; VI-NEXT:    v_mov_b32_e32 v7, s6
8397; VI-NEXT:  .LBB122_1: ; %atomicrmw.start
8398; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8399; VI-NEXT:    s_waitcnt vmcnt(0)
8400; VI-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
8401; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
8402; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
8403; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
8404; VI-NEXT:    s_waitcnt vmcnt(0)
8405; VI-NEXT:    buffer_wbinvl1_vol
8406; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8407; VI-NEXT:    v_mov_b32_e32 v3, v1
8408; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
8409; VI-NEXT:    v_mov_b32_e32 v2, v0
8410; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
8411; VI-NEXT:    s_cbranch_execnz .LBB122_1
8412; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8413; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
8414; VI-NEXT:    s_setpc_b64 s[30:31]
8415;
8416; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar:
8417; GFX9:       ; %bb.0:
8418; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8419; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8420; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
8421; GFX9-NEXT:    s_mov_b64 s[34:35], 0
8422; GFX9-NEXT:    v_mov_b32_e32 v5, s7
8423; GFX9-NEXT:    v_mov_b32_e32 v6, s6
8424; GFX9-NEXT:  .LBB122_1: ; %atomicrmw.start
8425; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8426; GFX9-NEXT:    s_waitcnt vmcnt(0)
8427; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
8428; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
8429; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
8430; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
8431; GFX9-NEXT:    s_waitcnt vmcnt(0)
8432; GFX9-NEXT:    buffer_wbinvl1_vol
8433; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8434; GFX9-NEXT:    v_mov_b32_e32 v3, v1
8435; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
8436; GFX9-NEXT:    v_mov_b32_e32 v2, v0
8437; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
8438; GFX9-NEXT:    s_cbranch_execnz .LBB122_1
8439; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8440; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
8441; GFX9-NEXT:    s_setpc_b64 s[30:31]
8442  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
8443  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
8444  ret void
8445}
8446
; Signed i64 `atomicrmw min` (seq_cst) on a global pointer with inreg arguments
; under the amdgpu_gfx calling convention; the value returned by the atomicrmw
; is returned to the caller.
; In the assertions each loop iteration first copies the last observed value
; out of v[0:1], so the cmpswap can write its result straight into v[0:1]; no
; extra moves follow the loop before s_setpc_b64 in the VI and GFX9 output.
; The SI assertions additionally keep s6/s7 in lanes of v10 around the loop and
; read them back with v_readlane before returning.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
8447define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
8448; SI-LABEL: global_atomic_min_i64_ret_scalar:
8449; SI:       ; %bb.0:
8450; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8451; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8452; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
8453; SI-NEXT:    s_mov_b64 exec, s[34:35]
8454; SI-NEXT:    s_waitcnt expcnt(0)
8455; SI-NEXT:    v_writelane_b32 v10, s6, 0
8456; SI-NEXT:    v_writelane_b32 v10, s7, 1
8457; SI-NEXT:    s_mov_b32 s35, s7
8458; SI-NEXT:    s_mov_b32 s34, s6
8459; SI-NEXT:    s_mov_b32 s7, 0xf000
8460; SI-NEXT:    s_mov_b32 s6, -1
8461; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
8462; SI-NEXT:    s_mov_b64 s[36:37], 0
8463; SI-NEXT:    v_mov_b32_e32 v4, s35
8464; SI-NEXT:    v_mov_b32_e32 v5, s34
8465; SI-NEXT:  .LBB123_1: ; %atomicrmw.start
8466; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8467; SI-NEXT:    s_waitcnt vmcnt(0)
8468; SI-NEXT:    v_mov_b32_e32 v9, v1
8469; SI-NEXT:    v_mov_b32_e32 v8, v0
8470; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9]
8471; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
8472; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
8473; SI-NEXT:    s_waitcnt expcnt(0)
8474; SI-NEXT:    v_mov_b32_e32 v0, v6
8475; SI-NEXT:    v_mov_b32_e32 v1, v7
8476; SI-NEXT:    v_mov_b32_e32 v2, v8
8477; SI-NEXT:    v_mov_b32_e32 v3, v9
8478; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
8479; SI-NEXT:    s_waitcnt vmcnt(0)
8480; SI-NEXT:    buffer_wbinvl1
8481; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
8482; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
8483; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
8484; SI-NEXT:    s_cbranch_execnz .LBB123_1
8485; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8486; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
8487; SI-NEXT:    v_readlane_b32 s7, v10, 1
8488; SI-NEXT:    v_readlane_b32 s6, v10, 0
8489; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8490; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
8491; SI-NEXT:    s_mov_b64 exec, s[34:35]
8492; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
8493; SI-NEXT:    s_setpc_b64 s[30:31]
8494;
8495; VI-LABEL: global_atomic_min_i64_ret_scalar:
8496; VI:       ; %bb.0:
8497; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8498; VI-NEXT:    v_mov_b32_e32 v0, s4
8499; VI-NEXT:    v_mov_b32_e32 v1, s5
8500; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
8501; VI-NEXT:    v_mov_b32_e32 v2, s4
8502; VI-NEXT:    s_mov_b64 s[34:35], 0
8503; VI-NEXT:    v_mov_b32_e32 v4, s7
8504; VI-NEXT:    v_mov_b32_e32 v5, s6
8505; VI-NEXT:    v_mov_b32_e32 v3, s5
8506; VI-NEXT:  .LBB123_1: ; %atomicrmw.start
8507; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8508; VI-NEXT:    s_waitcnt vmcnt(0)
8509; VI-NEXT:    v_mov_b32_e32 v9, v1
8510; VI-NEXT:    v_mov_b32_e32 v8, v0
8511; VI-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
8512; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
8513; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
8514; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
8515; VI-NEXT:    s_waitcnt vmcnt(0)
8516; VI-NEXT:    buffer_wbinvl1_vol
8517; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
8518; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
8519; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
8520; VI-NEXT:    s_cbranch_execnz .LBB123_1
8521; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8522; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
8523; VI-NEXT:    s_setpc_b64 s[30:31]
8524;
8525; GFX9-LABEL: global_atomic_min_i64_ret_scalar:
8526; GFX9:       ; %bb.0:
8527; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8528; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8529; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
8530; GFX9-NEXT:    s_mov_b64 s[34:35], 0
8531; GFX9-NEXT:    v_mov_b32_e32 v3, s7
8532; GFX9-NEXT:    v_mov_b32_e32 v4, s6
8533; GFX9-NEXT:  .LBB123_1: ; %atomicrmw.start
8534; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8535; GFX9-NEXT:    s_waitcnt vmcnt(0)
8536; GFX9-NEXT:    v_mov_b32_e32 v8, v1
8537; GFX9-NEXT:    v_mov_b32_e32 v7, v0
8538; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8]
8539; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v8, vcc
8540; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v7, vcc
8541; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
8542; GFX9-NEXT:    s_waitcnt vmcnt(0)
8543; GFX9-NEXT:    buffer_wbinvl1_vol
8544; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
8545; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
8546; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
8547; GFX9-NEXT:    s_cbranch_execnz .LBB123_1
8548; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8549; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
8550; GFX9-NEXT:    s_setpc_b64 s[30:31]
8551  %result = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst
8552  ret i64 %result
8553}
8554
; Signed i64 `atomicrmw min` (seq_cst) with inreg arguments under the
; amdgpu_gfx calling convention; the address is %out advanced by 4 i64 elements
; (32 bytes) and the value returned by the atomicrmw is returned to the caller.
; In the SI and GFX9 assertions the displacement is carried as `offset:32` on
; the load and the cmpswap; in the VI assertions it is added to s[4:5] with
; s_add_u32 / s_addc_u32 and the sum is moved into v[2:3] as the flat address.
; In all three variants the cmpswap result is written to v[0:1].
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
8555define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
8556; SI-LABEL: global_atomic_min_i64_ret_offset_scalar:
8557; SI:       ; %bb.0:
8558; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8559; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8560; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
8561; SI-NEXT:    s_mov_b64 exec, s[34:35]
8562; SI-NEXT:    s_waitcnt expcnt(0)
8563; SI-NEXT:    v_writelane_b32 v10, s6, 0
8564; SI-NEXT:    v_writelane_b32 v10, s7, 1
8565; SI-NEXT:    s_mov_b32 s35, s7
8566; SI-NEXT:    s_mov_b32 s34, s6
8567; SI-NEXT:    s_mov_b32 s7, 0xf000
8568; SI-NEXT:    s_mov_b32 s6, -1
8569; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
8570; SI-NEXT:    s_mov_b64 s[36:37], 0
8571; SI-NEXT:    v_mov_b32_e32 v4, s35
8572; SI-NEXT:    v_mov_b32_e32 v5, s34
8573; SI-NEXT:  .LBB124_1: ; %atomicrmw.start
8574; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8575; SI-NEXT:    s_waitcnt vmcnt(0)
8576; SI-NEXT:    v_mov_b32_e32 v9, v1
8577; SI-NEXT:    v_mov_b32_e32 v8, v0
8578; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9]
8579; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
8580; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
8581; SI-NEXT:    s_waitcnt expcnt(0)
8582; SI-NEXT:    v_mov_b32_e32 v0, v6
8583; SI-NEXT:    v_mov_b32_e32 v1, v7
8584; SI-NEXT:    v_mov_b32_e32 v2, v8
8585; SI-NEXT:    v_mov_b32_e32 v3, v9
8586; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
8587; SI-NEXT:    s_waitcnt vmcnt(0)
8588; SI-NEXT:    buffer_wbinvl1
8589; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
8590; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
8591; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
8592; SI-NEXT:    s_cbranch_execnz .LBB124_1
8593; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8594; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
8595; SI-NEXT:    v_readlane_b32 s7, v10, 1
8596; SI-NEXT:    v_readlane_b32 s6, v10, 0
8597; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
8598; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
8599; SI-NEXT:    s_mov_b64 exec, s[34:35]
8600; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
8601; SI-NEXT:    s_setpc_b64 s[30:31]
8602;
8603; VI-LABEL: global_atomic_min_i64_ret_offset_scalar:
8604; VI:       ; %bb.0:
8605; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8606; VI-NEXT:    s_add_u32 s34, s4, 32
8607; VI-NEXT:    s_addc_u32 s35, s5, 0
8608; VI-NEXT:    v_mov_b32_e32 v2, s34
8609; VI-NEXT:    v_mov_b32_e32 v3, s35
8610; VI-NEXT:    flat_load_dwordx2 v[0:1], v[2:3]
8611; VI-NEXT:    s_mov_b64 s[34:35], 0
8612; VI-NEXT:    v_mov_b32_e32 v4, s7
8613; VI-NEXT:    v_mov_b32_e32 v5, s6
8614; VI-NEXT:  .LBB124_1: ; %atomicrmw.start
8615; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8616; VI-NEXT:    s_waitcnt vmcnt(0)
8617; VI-NEXT:    v_mov_b32_e32 v9, v1
8618; VI-NEXT:    v_mov_b32_e32 v8, v0
8619; VI-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
8620; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
8621; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
8622; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
8623; VI-NEXT:    s_waitcnt vmcnt(0)
8624; VI-NEXT:    buffer_wbinvl1_vol
8625; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
8626; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
8627; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
8628; VI-NEXT:    s_cbranch_execnz .LBB124_1
8629; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8630; VI-NEXT:    s_or_b64 exec, exec, s[34:35]
8631; VI-NEXT:    s_setpc_b64 s[30:31]
8632;
8633; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar:
8634; GFX9:       ; %bb.0:
8635; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8636; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8637; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
8638; GFX9-NEXT:    s_mov_b64 s[34:35], 0
8639; GFX9-NEXT:    v_mov_b32_e32 v3, s7
8640; GFX9-NEXT:    v_mov_b32_e32 v4, s6
8641; GFX9-NEXT:  .LBB124_1: ; %atomicrmw.start
8642; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8643; GFX9-NEXT:    s_waitcnt vmcnt(0)
8644; GFX9-NEXT:    v_mov_b32_e32 v8, v1
8645; GFX9-NEXT:    v_mov_b32_e32 v7, v0
8646; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8]
8647; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v8, vcc
8648; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v7, vcc
8649; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
8650; GFX9-NEXT:    s_waitcnt vmcnt(0)
8651; GFX9-NEXT:    buffer_wbinvl1_vol
8652; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
8653; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
8654; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
8655; GFX9-NEXT:    s_cbranch_execnz .LBB124_1
8656; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8657; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
8658; GFX9-NEXT:    s_setpc_b64 s[30:31]
8659  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
8660  %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
8661  ret i64 %result
8662}
8663
; Kernel test: seq_cst signed-min atomicrmw on the i64 at out[index + 4] in
; addrspace(1); the atomicrmw result is unused. The SI/VI/GFX9 checks below
; expect a compare-and-swap retry loop (%atomicrmw.start .. %atomicrmw.end)
; instead of a single min atomic instruction.
8664define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
8665; SI-LABEL: atomic_min_i64_addr64_offset:
8666; SI:       ; %bb.0: ; %entry
8667; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
8668; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
8669; SI-NEXT:    s_waitcnt lgkmcnt(0)
8670; SI-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
8671; SI-NEXT:    s_add_u32 s4, s0, s4
8672; SI-NEXT:    s_addc_u32 s5, s1, s5
8673; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
8674; SI-NEXT:    s_mov_b64 s[0:1], 0
8675; SI-NEXT:    s_mov_b32 s7, 0xf000
8676; SI-NEXT:    v_mov_b32_e32 v4, s3
8677; SI-NEXT:    v_mov_b32_e32 v5, s2
8678; SI-NEXT:    s_waitcnt lgkmcnt(0)
8679; SI-NEXT:    v_mov_b32_e32 v2, s8
8680; SI-NEXT:    v_mov_b32_e32 v3, s9
8681; SI-NEXT:    s_mov_b32 s6, -1
8682; SI-NEXT:  .LBB125_1: ; %atomicrmw.start
8683; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8684; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
8685; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
8686; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
8687; SI-NEXT:    s_waitcnt expcnt(0)
8688; SI-NEXT:    v_mov_b32_e32 v9, v3
8689; SI-NEXT:    v_mov_b32_e32 v8, v2
8690; SI-NEXT:    v_mov_b32_e32 v7, v1
8691; SI-NEXT:    v_mov_b32_e32 v6, v0
8692; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
8693; SI-NEXT:    s_waitcnt vmcnt(0)
8694; SI-NEXT:    buffer_wbinvl1
8695; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
8696; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8697; SI-NEXT:    v_mov_b32_e32 v2, v6
8698; SI-NEXT:    v_mov_b32_e32 v3, v7
8699; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8700; SI-NEXT:    s_cbranch_execnz .LBB125_1
8701; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8702; SI-NEXT:    s_endpgm
8703;
8704; VI-LABEL: atomic_min_i64_addr64_offset:
8705; VI:       ; %bb.0: ; %entry
8706; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
8707; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8708; VI-NEXT:    s_mov_b64 s[4:5], 0
8709; VI-NEXT:    s_waitcnt lgkmcnt(0)
8710; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
8711; VI-NEXT:    s_add_u32 s0, s0, s6
8712; VI-NEXT:    s_addc_u32 s1, s1, s7
8713; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
8714; VI-NEXT:    s_add_u32 s0, s0, 32
8715; VI-NEXT:    s_addc_u32 s1, s1, 0
8716; VI-NEXT:    v_mov_b32_e32 v5, s1
8717; VI-NEXT:    v_mov_b32_e32 v6, s3
8718; VI-NEXT:    s_waitcnt lgkmcnt(0)
8719; VI-NEXT:    v_mov_b32_e32 v2, s6
8720; VI-NEXT:    v_mov_b32_e32 v7, s2
8721; VI-NEXT:    v_mov_b32_e32 v3, s7
8722; VI-NEXT:    v_mov_b32_e32 v4, s0
8723; VI-NEXT:  .LBB125_1: ; %atomicrmw.start
8724; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8725; VI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
8726; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
8727; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
8728; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
8729; VI-NEXT:    s_waitcnt vmcnt(0)
8730; VI-NEXT:    buffer_wbinvl1_vol
8731; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8732; VI-NEXT:    v_mov_b32_e32 v3, v1
8733; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8734; VI-NEXT:    v_mov_b32_e32 v2, v0
8735; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8736; VI-NEXT:    s_cbranch_execnz .LBB125_1
8737; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8738; VI-NEXT:    s_endpgm
8739;
8740; GFX9-LABEL: atomic_min_i64_addr64_offset:
8741; GFX9:       ; %bb.0: ; %entry
8742; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
8743; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8744; GFX9-NEXT:    v_mov_b32_e32 v6, 0
8745; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8746; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
8747; GFX9-NEXT:    s_add_u32 s0, s0, s4
8748; GFX9-NEXT:    s_addc_u32 s1, s1, s5
8749; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
8750; GFX9-NEXT:    s_mov_b64 s[4:5], 0
8751; GFX9-NEXT:    v_mov_b32_e32 v4, s3
8752; GFX9-NEXT:    v_mov_b32_e32 v5, s2
8753; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8754; GFX9-NEXT:    v_mov_b32_e32 v2, s6
8755; GFX9-NEXT:    v_mov_b32_e32 v3, s7
8756; GFX9-NEXT:  .LBB125_1: ; %atomicrmw.start
8757; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8758; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
8759; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
8760; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
8761; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc
8762; GFX9-NEXT:    s_waitcnt vmcnt(0)
8763; GFX9-NEXT:    buffer_wbinvl1_vol
8764; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8765; GFX9-NEXT:    v_mov_b32_e32 v3, v1
8766; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8767; GFX9-NEXT:    v_mov_b32_e32 v2, v0
8768; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8769; GFX9-NEXT:    s_cbranch_execnz .LBB125_1
8770; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8771; GFX9-NEXT:    s_endpgm
8772entry:
; Index by %index, then step 4 more i64 elements (the 32-byte / 0x20 offset in the checks).
8773  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
8774  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
8775  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
8776  ret void
8777}
8778
; Kernel test: seq_cst signed-min atomicrmw on the i64 at out[index + 4] in
; addrspace(1), with the atomicrmw result stored to %out2. The SI/VI/GFX9
; checks below expect a compare-and-swap retry loop followed by a store of the
; value produced by the final cmpswap.
8779define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
8780; SI-LABEL: atomic_min_i64_ret_addr64_offset:
8781; SI:       ; %bb.0: ; %entry
8782; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
8783; SI-NEXT:    s_waitcnt lgkmcnt(0)
8784; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
8785; SI-NEXT:    s_add_u32 s8, s0, s6
8786; SI-NEXT:    s_addc_u32 s9, s1, s7
8787; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
8788; SI-NEXT:    s_mov_b64 s[0:1], 0
8789; SI-NEXT:    s_mov_b32 s11, 0xf000
8790; SI-NEXT:    v_mov_b32_e32 v8, s5
8791; SI-NEXT:    v_mov_b32_e32 v9, s4
8792; SI-NEXT:    s_waitcnt lgkmcnt(0)
8793; SI-NEXT:    v_mov_b32_e32 v2, s6
8794; SI-NEXT:    v_mov_b32_e32 v3, s7
8795; SI-NEXT:    s_mov_b32 s10, -1
8796; SI-NEXT:  .LBB126_1: ; %atomicrmw.start
8797; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8798; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
8799; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
8800; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
8801; SI-NEXT:    s_waitcnt expcnt(0)
8802; SI-NEXT:    v_mov_b32_e32 v7, v3
8803; SI-NEXT:    v_mov_b32_e32 v6, v2
8804; SI-NEXT:    v_mov_b32_e32 v5, v1
8805; SI-NEXT:    v_mov_b32_e32 v4, v0
8806; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
8807; SI-NEXT:    s_waitcnt vmcnt(0)
8808; SI-NEXT:    buffer_wbinvl1
8809; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
8810; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8811; SI-NEXT:    v_mov_b32_e32 v2, v4
8812; SI-NEXT:    v_mov_b32_e32 v3, v5
8813; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8814; SI-NEXT:    s_cbranch_execnz .LBB126_1
8815; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8816; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
8817; SI-NEXT:    s_mov_b32 s7, 0xf000
8818; SI-NEXT:    s_mov_b32 s6, -1
8819; SI-NEXT:    s_mov_b32 s4, s2
8820; SI-NEXT:    s_mov_b32 s5, s3
8821; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
8822; SI-NEXT:    s_endpgm
8823;
8824; VI-LABEL: atomic_min_i64_ret_addr64_offset:
8825; VI:       ; %bb.0: ; %entry
8826; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
8827; VI-NEXT:    s_mov_b64 s[8:9], 0
8828; VI-NEXT:    s_waitcnt lgkmcnt(0)
8829; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
8830; VI-NEXT:    s_add_u32 s0, s0, s6
8831; VI-NEXT:    s_addc_u32 s1, s1, s7
8832; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
8833; VI-NEXT:    s_add_u32 s0, s0, 32
8834; VI-NEXT:    s_addc_u32 s1, s1, 0
8835; VI-NEXT:    v_mov_b32_e32 v0, s0
8836; VI-NEXT:    v_mov_b32_e32 v4, s5
8837; VI-NEXT:    s_waitcnt lgkmcnt(0)
8838; VI-NEXT:    v_mov_b32_e32 v2, s6
8839; VI-NEXT:    v_mov_b32_e32 v5, s4
8840; VI-NEXT:    v_mov_b32_e32 v3, s7
8841; VI-NEXT:    v_mov_b32_e32 v1, s1
8842; VI-NEXT:  .LBB126_1: ; %atomicrmw.start
8843; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8844; VI-NEXT:    v_mov_b32_e32 v9, v3
8845; VI-NEXT:    v_mov_b32_e32 v8, v2
8846; VI-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
8847; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
8848; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
8849; VI-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
8850; VI-NEXT:    s_waitcnt vmcnt(0)
8851; VI-NEXT:    buffer_wbinvl1_vol
8852; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
8853; VI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
8854; VI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
8855; VI-NEXT:    s_cbranch_execnz .LBB126_1
8856; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8857; VI-NEXT:    s_or_b64 exec, exec, s[8:9]
8858; VI-NEXT:    v_mov_b32_e32 v0, s2
8859; VI-NEXT:    v_mov_b32_e32 v1, s3
8860; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
8861; VI-NEXT:    s_endpgm
8862;
8863; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
8864; GFX9:       ; %bb.0: ; %entry
8865; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
8866; GFX9-NEXT:    s_mov_b64 s[2:3], 0
8867; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8868; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8869; GFX9-NEXT:    s_lshl_b64 s[0:1], s[14:15], 3
8870; GFX9-NEXT:    s_add_u32 s0, s8, s0
8871; GFX9-NEXT:    s_addc_u32 s1, s9, s1
8872; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x20
8873; GFX9-NEXT:    v_mov_b32_e32 v2, s13
8874; GFX9-NEXT:    v_mov_b32_e32 v3, s12
8875; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8876; GFX9-NEXT:    v_mov_b32_e32 v0, s4
8877; GFX9-NEXT:    v_mov_b32_e32 v1, s5
8878; GFX9-NEXT:  .LBB126_1: ; %atomicrmw.start
8879; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8880; GFX9-NEXT:    v_mov_b32_e32 v8, v1
8881; GFX9-NEXT:    v_mov_b32_e32 v7, v0
8882; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8]
8883; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
8884; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
8885; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
8886; GFX9-NEXT:    s_waitcnt vmcnt(0)
8887; GFX9-NEXT:    buffer_wbinvl1_vol
8888; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
8889; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
8890; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
8891; GFX9-NEXT:    s_cbranch_execnz .LBB126_1
8892; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
8893; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
8894; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8895; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[10:11]
8896; GFX9-NEXT:    s_endpgm
8897entry:
; Index by %index, then step 4 more i64 elements (the 32-byte / 0x20 offset in the checks).
8898  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
8899  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
8900  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
; Using the result is what distinguishes this "ret" variant from the no-return one.
8901  store i64 %tmp0, ptr addrspace(1) %out2
8902  ret void
8903}
8904
; Kernel test: seq_cst signed-min atomicrmw directly on %out (addrspace(1)),
; with no index and no constant offset; the atomicrmw result is unused. The
; SI/VI/GFX9 checks below expect a compare-and-swap retry loop with no offset
; on the cmpswap instruction.
8905define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
8906; SI-LABEL: atomic_min_i64:
8907; SI:       ; %bb.0: ; %entry
8908; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
8909; SI-NEXT:    s_waitcnt lgkmcnt(0)
8910; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
8911; SI-NEXT:    s_mov_b64 s[8:9], 0
8912; SI-NEXT:    s_mov_b32 s7, 0xf000
8913; SI-NEXT:    v_mov_b32_e32 v4, s3
8914; SI-NEXT:    v_mov_b32_e32 v5, s2
8915; SI-NEXT:    s_waitcnt lgkmcnt(0)
8916; SI-NEXT:    v_mov_b32_e32 v2, s4
8917; SI-NEXT:    v_mov_b32_e32 v3, s5
8918; SI-NEXT:    s_mov_b32 s6, -1
8919; SI-NEXT:    s_mov_b32 s4, s0
8920; SI-NEXT:    s_mov_b32 s5, s1
8921; SI-NEXT:  .LBB127_1: ; %atomicrmw.start
8922; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
8923; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
8924; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
8925; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
8926; SI-NEXT:    s_waitcnt expcnt(0)
8927; SI-NEXT:    v_mov_b32_e32 v9, v3
8928; SI-NEXT:    v_mov_b32_e32 v8, v2
8929; SI-NEXT:    v_mov_b32_e32 v7, v1
8930; SI-NEXT:    v_mov_b32_e32 v6, v0
8931; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
8932; SI-NEXT:    s_waitcnt vmcnt(0)
8933; SI-NEXT:    buffer_wbinvl1
8934; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
8935; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
8936; SI-NEXT:    v_mov_b32_e32 v2, v6
8937; SI-NEXT:    v_mov_b32_e32 v3, v7
8938; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
8939; SI-NEXT:    s_cbranch_execnz .LBB127_1
8940; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
8941; SI-NEXT:    s_endpgm
8942;
8943; VI-LABEL: atomic_min_i64:
8944; VI:       ; %bb.0: ; %entry
8945; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8946; VI-NEXT:    s_mov_b64 s[4:5], 0
8947; VI-NEXT:    s_waitcnt lgkmcnt(0)
8948; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
8949; VI-NEXT:    v_mov_b32_e32 v5, s1
8950; VI-NEXT:    v_mov_b32_e32 v6, s3
8951; VI-NEXT:    v_mov_b32_e32 v7, s2
8952; VI-NEXT:    v_mov_b32_e32 v4, s0
8953; VI-NEXT:    s_waitcnt lgkmcnt(0)
8954; VI-NEXT:    v_mov_b32_e32 v2, s6
8955; VI-NEXT:    v_mov_b32_e32 v3, s7
8956; VI-NEXT:  .LBB127_1: ; %atomicrmw.start
8957; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
8958; VI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
8959; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
8960; VI-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
8961; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
8962; VI-NEXT:    s_waitcnt vmcnt(0)
8963; VI-NEXT:    buffer_wbinvl1_vol
8964; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8965; VI-NEXT:    v_mov_b32_e32 v3, v1
8966; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8967; VI-NEXT:    v_mov_b32_e32 v2, v0
8968; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8969; VI-NEXT:    s_cbranch_execnz .LBB127_1
8970; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
8971; VI-NEXT:    s_endpgm
8972;
8973; GFX9-LABEL: atomic_min_i64:
8974; GFX9:       ; %bb.0: ; %entry
8975; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8976; GFX9-NEXT:    s_mov_b64 s[4:5], 0
8977; GFX9-NEXT:    v_mov_b32_e32 v6, 0
8978; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8979; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
8980; GFX9-NEXT:    v_mov_b32_e32 v4, s3
8981; GFX9-NEXT:    v_mov_b32_e32 v5, s2
8982; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8983; GFX9-NEXT:    v_mov_b32_e32 v2, s6
8984; GFX9-NEXT:    v_mov_b32_e32 v3, s7
8985; GFX9-NEXT:  .LBB127_1: ; %atomicrmw.start
8986; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
8987; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
8988; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
8989; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
8990; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
8991; GFX9-NEXT:    s_waitcnt vmcnt(0)
8992; GFX9-NEXT:    buffer_wbinvl1_vol
8993; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8994; GFX9-NEXT:    v_mov_b32_e32 v3, v1
8995; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8996; GFX9-NEXT:    v_mov_b32_e32 v2, v0
8997; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8998; GFX9-NEXT:    s_cbranch_execnz .LBB127_1
8999; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
9000; GFX9-NEXT:    s_endpgm
9001entry:
9002  %tmp0 = atomicrmw min ptr addrspace(1) %out, i64 %in seq_cst
9003  ret void
9004}
9005
; Kernel test: seq_cst signed-min atomicrmw on the i64 at out[index] in
; addrspace(1) (no extra constant offset), with the atomicrmw result stored to
; %out2. The SI/VI/GFX9 checks below expect a compare-and-swap retry loop
; followed by a store of the value produced by the final cmpswap.
9006define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
9007; SI-LABEL: atomic_min_i64_ret_addr64:
9008; SI:       ; %bb.0: ; %entry
9009; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
9010; SI-NEXT:    s_waitcnt lgkmcnt(0)
9011; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
9012; SI-NEXT:    s_add_u32 s8, s0, s6
9013; SI-NEXT:    s_addc_u32 s9, s1, s7
9014; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
9015; SI-NEXT:    s_mov_b64 s[0:1], 0
9016; SI-NEXT:    s_mov_b32 s11, 0xf000
9017; SI-NEXT:    v_mov_b32_e32 v8, s5
9018; SI-NEXT:    v_mov_b32_e32 v9, s4
9019; SI-NEXT:    s_waitcnt lgkmcnt(0)
9020; SI-NEXT:    v_mov_b32_e32 v2, s6
9021; SI-NEXT:    v_mov_b32_e32 v3, s7
9022; SI-NEXT:    s_mov_b32 s10, -1
9023; SI-NEXT:  .LBB128_1: ; %atomicrmw.start
9024; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
9025; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
9026; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
9027; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
9028; SI-NEXT:    s_waitcnt expcnt(0)
9029; SI-NEXT:    v_mov_b32_e32 v7, v3
9030; SI-NEXT:    v_mov_b32_e32 v6, v2
9031; SI-NEXT:    v_mov_b32_e32 v5, v1
9032; SI-NEXT:    v_mov_b32_e32 v4, v0
9033; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
9034; SI-NEXT:    s_waitcnt vmcnt(0)
9035; SI-NEXT:    buffer_wbinvl1
9036; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
9037; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9038; SI-NEXT:    v_mov_b32_e32 v2, v4
9039; SI-NEXT:    v_mov_b32_e32 v3, v5
9040; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9041; SI-NEXT:    s_cbranch_execnz .LBB128_1
9042; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
9043; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
9044; SI-NEXT:    s_mov_b32 s7, 0xf000
9045; SI-NEXT:    s_mov_b32 s6, -1
9046; SI-NEXT:    s_mov_b32 s4, s2
9047; SI-NEXT:    s_mov_b32 s5, s3
9048; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
9049; SI-NEXT:    s_endpgm
9050;
9051; VI-LABEL: atomic_min_i64_ret_addr64:
9052; VI:       ; %bb.0: ; %entry
9053; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
9054; VI-NEXT:    s_waitcnt lgkmcnt(0)
9055; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
9056; VI-NEXT:    s_add_u32 s6, s0, s6
9057; VI-NEXT:    s_addc_u32 s7, s1, s7
9058; VI-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
9059; VI-NEXT:    v_mov_b32_e32 v0, s6
9060; VI-NEXT:    s_mov_b64 s[0:1], 0
9061; VI-NEXT:    v_mov_b32_e32 v4, s5
9062; VI-NEXT:    v_mov_b32_e32 v5, s4
9063; VI-NEXT:    s_waitcnt lgkmcnt(0)
9064; VI-NEXT:    v_mov_b32_e32 v2, s8
9065; VI-NEXT:    v_mov_b32_e32 v3, s9
9066; VI-NEXT:    v_mov_b32_e32 v1, s7
9067; VI-NEXT:  .LBB128_1: ; %atomicrmw.start
9068; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
9069; VI-NEXT:    v_mov_b32_e32 v9, v3
9070; VI-NEXT:    v_mov_b32_e32 v8, v2
9071; VI-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
9072; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
9073; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
9074; VI-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
9075; VI-NEXT:    s_waitcnt vmcnt(0)
9076; VI-NEXT:    buffer_wbinvl1_vol
9077; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
9078; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9079; VI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9080; VI-NEXT:    s_cbranch_execnz .LBB128_1
9081; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
9082; VI-NEXT:    s_or_b64 exec, exec, s[0:1]
9083; VI-NEXT:    v_mov_b32_e32 v0, s2
9084; VI-NEXT:    v_mov_b32_e32 v1, s3
9085; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
9086; VI-NEXT:    s_endpgm
9087;
9088; GFX9-LABEL: atomic_min_i64_ret_addr64:
9089; GFX9:       ; %bb.0: ; %entry
9090; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
9091; GFX9-NEXT:    s_mov_b64 s[2:3], 0
9092; GFX9-NEXT:    v_mov_b32_e32 v4, 0
9093; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9094; GFX9-NEXT:    s_lshl_b64 s[0:1], s[14:15], 3
9095; GFX9-NEXT:    s_add_u32 s0, s8, s0
9096; GFX9-NEXT:    s_addc_u32 s1, s9, s1
9097; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
9098; GFX9-NEXT:    v_mov_b32_e32 v2, s13
9099; GFX9-NEXT:    v_mov_b32_e32 v3, s12
9100; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9101; GFX9-NEXT:    v_mov_b32_e32 v0, s4
9102; GFX9-NEXT:    v_mov_b32_e32 v1, s5
9103; GFX9-NEXT:  .LBB128_1: ; %atomicrmw.start
9104; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
9105; GFX9-NEXT:    v_mov_b32_e32 v8, v1
9106; GFX9-NEXT:    v_mov_b32_e32 v7, v0
9107; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8]
9108; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
9109; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
9110; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
9111; GFX9-NEXT:    s_waitcnt vmcnt(0)
9112; GFX9-NEXT:    buffer_wbinvl1_vol
9113; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
9114; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
9115; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
9116; GFX9-NEXT:    s_cbranch_execnz .LBB128_1
9117; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
9118; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
9119; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9120; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[10:11]
9121; GFX9-NEXT:    s_endpgm
9122entry:
9123  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
9124  %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst
; Using the result is what distinguishes this "ret" variant from the no-return one.
9125  store i64 %tmp0, ptr addrspace(1) %out2
9126  ret void
9127}
9128
; Function test: seq_cst signed-min atomicrmw on the i64 at out[4] in
; addrspace(1), tagged with !amdgpu.no.remote.memory metadata; the atomicrmw
; result is unused. The SI/VI/GFX9 checks below still expect a
; compare-and-swap retry loop at a 32-byte offset with the metadata present.
9129define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
9130; SI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
9131; SI:       ; %bb.0:
9132; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9133; SI-NEXT:    s_mov_b32 s6, 0
9134; SI-NEXT:    s_mov_b32 s7, 0xf000
9135; SI-NEXT:    s_mov_b32 s4, s6
9136; SI-NEXT:    s_mov_b32 s5, s6
9137; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
9138; SI-NEXT:    s_mov_b64 s[8:9], 0
9139; SI-NEXT:  .LBB129_1: ; %atomicrmw.start
9140; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
9141; SI-NEXT:    s_waitcnt vmcnt(0)
9142; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
9143; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
9144; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
9145; SI-NEXT:    s_waitcnt expcnt(0)
9146; SI-NEXT:    v_mov_b32_e32 v11, v7
9147; SI-NEXT:    v_mov_b32_e32 v10, v6
9148; SI-NEXT:    v_mov_b32_e32 v9, v5
9149; SI-NEXT:    v_mov_b32_e32 v8, v4
9150; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
9151; SI-NEXT:    s_waitcnt vmcnt(0)
9152; SI-NEXT:    buffer_wbinvl1
9153; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
9154; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
9155; SI-NEXT:    v_mov_b32_e32 v6, v8
9156; SI-NEXT:    v_mov_b32_e32 v7, v9
9157; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
9158; SI-NEXT:    s_cbranch_execnz .LBB129_1
9159; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
9160; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
9161; SI-NEXT:    s_waitcnt expcnt(0)
9162; SI-NEXT:    s_setpc_b64 s[30:31]
9163;
9164; VI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
9165; VI:       ; %bb.0:
9166; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9167; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
9168; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9169; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
9170; VI-NEXT:    s_mov_b64 s[4:5], 0
9171; VI-NEXT:  .LBB129_1: ; %atomicrmw.start
9172; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
9173; VI-NEXT:    s_waitcnt vmcnt(0)
9174; VI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
9175; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
9176; VI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
9177; VI-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
9178; VI-NEXT:    s_waitcnt vmcnt(0)
9179; VI-NEXT:    buffer_wbinvl1_vol
9180; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
9181; VI-NEXT:    v_mov_b32_e32 v7, v5
9182; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9183; VI-NEXT:    v_mov_b32_e32 v6, v4
9184; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9185; VI-NEXT:    s_cbranch_execnz .LBB129_1
9186; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
9187; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
9188; VI-NEXT:    s_setpc_b64 s[30:31]
9189;
9190; GFX9-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
9191; GFX9:       ; %bb.0:
9192; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9193; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
9194; GFX9-NEXT:    s_mov_b64 s[4:5], 0
9195; GFX9-NEXT:  .LBB129_1: ; %atomicrmw.start
9196; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
9197; GFX9-NEXT:    s_waitcnt vmcnt(0)
9198; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
9199; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
9200; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
9201; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
9202; GFX9-NEXT:    s_waitcnt vmcnt(0)
9203; GFX9-NEXT:    buffer_wbinvl1_vol
9204; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
9205; GFX9-NEXT:    v_mov_b32_e32 v7, v5
9206; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9207; GFX9-NEXT:    v_mov_b32_e32 v6, v4
9208; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9209; GFX9-NEXT:    s_cbranch_execnz .LBB129_1
9210; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
9211; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
9212; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 i64 elements past %out, i.e. the 32-byte offset seen in the checks.
9213  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
; NOTE(review): !0 is defined outside this part of the file; presumably an empty metadata node - confirm.
9214  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
9215  ret void
9216}
9217
; Function test: seq_cst signed-min atomicrmw on the i64 at out[4] in
; addrspace(1), tagged with !amdgpu.no.remote.memory metadata; the atomicrmw
; result is returned. The SI/VI/GFX9 checks below still expect a
; compare-and-swap retry loop at a 32-byte offset with the metadata present.
9218define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
9219; SI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
9220; SI:       ; %bb.0:
9221; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9222; SI-NEXT:    v_mov_b32_e32 v5, v3
9223; SI-NEXT:    v_mov_b32_e32 v4, v2
9224; SI-NEXT:    v_mov_b32_e32 v7, v1
9225; SI-NEXT:    v_mov_b32_e32 v6, v0
9226; SI-NEXT:    s_mov_b32 s6, 0
9227; SI-NEXT:    s_mov_b32 s7, 0xf000
9228; SI-NEXT:    s_mov_b32 s4, s6
9229; SI-NEXT:    s_mov_b32 s5, s6
9230; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
9231; SI-NEXT:    s_mov_b64 s[8:9], 0
9232; SI-NEXT:  .LBB130_1: ; %atomicrmw.start
9233; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
9234; SI-NEXT:    s_waitcnt vmcnt(0)
9235; SI-NEXT:    v_mov_b32_e32 v11, v1
9236; SI-NEXT:    v_mov_b32_e32 v10, v0
9237; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[10:11], v[4:5]
9238; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
9239; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
9240; SI-NEXT:    s_waitcnt expcnt(0)
9241; SI-NEXT:    v_mov_b32_e32 v0, v8
9242; SI-NEXT:    v_mov_b32_e32 v1, v9
9243; SI-NEXT:    v_mov_b32_e32 v2, v10
9244; SI-NEXT:    v_mov_b32_e32 v3, v11
9245; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
9246; SI-NEXT:    s_waitcnt vmcnt(0)
9247; SI-NEXT:    buffer_wbinvl1
9248; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
9249; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
9250; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
9251; SI-NEXT:    s_cbranch_execnz .LBB130_1
9252; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
9253; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
9254; SI-NEXT:    s_waitcnt expcnt(0)
9255; SI-NEXT:    s_setpc_b64 s[30:31]
9256;
9257; VI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
9258; VI:       ; %bb.0:
9259; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9260; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
9261; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
9262; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
9263; VI-NEXT:    s_mov_b64 s[4:5], 0
9264; VI-NEXT:  .LBB130_1: ; %atomicrmw.start
9265; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
9266; VI-NEXT:    s_waitcnt vmcnt(0)
9267; VI-NEXT:    v_mov_b32_e32 v9, v1
9268; VI-NEXT:    v_mov_b32_e32 v8, v0
9269; VI-NEXT:    v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
9270; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
9271; VI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
9272; VI-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
9273; VI-NEXT:    s_waitcnt vmcnt(0)
9274; VI-NEXT:    buffer_wbinvl1_vol
9275; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
9276; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9277; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9278; VI-NEXT:    s_cbranch_execnz .LBB130_1
9279; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
9280; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
9281; VI-NEXT:    s_setpc_b64 s[30:31]
9282;
9283; GFX9-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
9284; GFX9:       ; %bb.0:
9285; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9286; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
9287; GFX9-NEXT:    s_mov_b64 s[4:5], 0
9288; GFX9-NEXT:  .LBB130_1: ; %atomicrmw.start
9289; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
9290; GFX9-NEXT:    s_waitcnt vmcnt(0)
9291; GFX9-NEXT:    v_mov_b32_e32 v7, v5
9292; GFX9-NEXT:    v_mov_b32_e32 v6, v4
9293; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
9294; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
9295; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
9296; GFX9-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
9297; GFX9-NEXT:    s_waitcnt vmcnt(0)
9298; GFX9-NEXT:    buffer_wbinvl1_vol
9299; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
9300; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9301; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9302; GFX9-NEXT:    s_cbranch_execnz .LBB130_1
9303; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
9304; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
9305; GFX9-NEXT:    v_mov_b32_e32 v0, v4
9306; GFX9-NEXT:    v_mov_b32_e32 v1, v5
9307; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 i64 elements past %out, i.e. the 32-byte offset seen in the checks.
9308  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
; NOTE(review): !0 is defined outside this part of the file; presumably an empty metadata node - confirm.
9309  %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
9310  ret i64 %result
9311}
9312
9313; ---------------------------------------------------------------------
9314; atomicrmw uinc_wrap
9315; ---------------------------------------------------------------------
9316
; Function test: seq_cst uinc_wrap atomicrmw on an i64 directly at %ptr
; (addrspace(1)); the atomicrmw result is unused. The SI/VI/GFX9 checks below
; expect a single *_atomic_inc_x2 instruction without glc and no retry loop.
9317define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
9318; SI-LABEL: global_atomic_uinc_wrap_i64_noret:
9319; SI:       ; %bb.0:
9320; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9321; SI-NEXT:    s_mov_b32 s6, 0
9322; SI-NEXT:    s_mov_b32 s7, 0xf000
9323; SI-NEXT:    s_mov_b32 s4, s6
9324; SI-NEXT:    s_mov_b32 s5, s6
9325; SI-NEXT:    buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64
9326; SI-NEXT:    s_waitcnt vmcnt(0)
9327; SI-NEXT:    buffer_wbinvl1
9328; SI-NEXT:    s_waitcnt expcnt(0)
9329; SI-NEXT:    s_setpc_b64 s[30:31]
9330;
9331; VI-LABEL: global_atomic_uinc_wrap_i64_noret:
9332; VI:       ; %bb.0:
9333; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9334; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
9335; VI-NEXT:    s_waitcnt vmcnt(0)
9336; VI-NEXT:    buffer_wbinvl1_vol
9337; VI-NEXT:    s_setpc_b64 s[30:31]
9338;
9339; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret:
9340; GFX9:       ; %bb.0:
9341; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9342; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[2:3], off
9343; GFX9-NEXT:    s_waitcnt vmcnt(0)
9344; GFX9-NEXT:    buffer_wbinvl1_vol
9345; GFX9-NEXT:    s_setpc_b64 s[30:31]
9346  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
9347  ret void
9348}
9349
; Function test: seq_cst uinc_wrap atomicrmw on the i64 at out[4] in
; addrspace(1); the atomicrmw result is unused. The checks below expect a
; single *_atomic_inc_x2 instruction; SI and GFX9 encode the 32-byte offset in
; the instruction, while VI adds 32 to the address registers first.
9350define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
9351; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset:
9352; SI:       ; %bb.0:
9353; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9354; SI-NEXT:    s_mov_b32 s6, 0
9355; SI-NEXT:    s_mov_b32 s7, 0xf000
9356; SI-NEXT:    s_mov_b32 s4, s6
9357; SI-NEXT:    s_mov_b32 s5, s6
9358; SI-NEXT:    buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
9359; SI-NEXT:    s_waitcnt vmcnt(0)
9360; SI-NEXT:    buffer_wbinvl1
9361; SI-NEXT:    s_waitcnt expcnt(0)
9362; SI-NEXT:    s_setpc_b64 s[30:31]
9363;
9364; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset:
9365; VI:       ; %bb.0:
9366; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9367; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
9368; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9369; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
9370; VI-NEXT:    s_waitcnt vmcnt(0)
9371; VI-NEXT:    buffer_wbinvl1_vol
9372; VI-NEXT:    s_setpc_b64 s[30:31]
9373;
9374; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset:
9375; GFX9:       ; %bb.0:
9376; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9377; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[2:3], off offset:32
9378; GFX9-NEXT:    s_waitcnt vmcnt(0)
9379; GFX9-NEXT:    buffer_wbinvl1_vol
9380; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 i64 elements past %out, i.e. the 32-byte offset seen in the checks.
9381  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9382  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
9383  ret void
9384}
9385
; Function test: seq_cst uinc_wrap atomicrmw on an i64 directly at %ptr
; (addrspace(1)); the atomicrmw result is returned. The SI/VI/GFX9 checks
; below expect a single *_atomic_inc_x2 instruction with the glc bit set.
9386define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
9387; SI-LABEL: global_atomic_uinc_wrap_i64_ret:
9388; SI:       ; %bb.0:
9389; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9390; SI-NEXT:    s_mov_b32 s6, 0
9391; SI-NEXT:    s_mov_b32 s7, 0xf000
9392; SI-NEXT:    s_mov_b32 s4, s6
9393; SI-NEXT:    s_mov_b32 s5, s6
9394; SI-NEXT:    buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
9395; SI-NEXT:    s_waitcnt vmcnt(0)
9396; SI-NEXT:    buffer_wbinvl1
9397; SI-NEXT:    v_mov_b32_e32 v0, v2
9398; SI-NEXT:    v_mov_b32_e32 v1, v3
9399; SI-NEXT:    s_waitcnt expcnt(0)
9400; SI-NEXT:    s_setpc_b64 s[30:31]
9401;
9402; VI-LABEL: global_atomic_uinc_wrap_i64_ret:
9403; VI:       ; %bb.0:
9404; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9405; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
9406; VI-NEXT:    s_waitcnt vmcnt(0)
9407; VI-NEXT:    buffer_wbinvl1_vol
9408; VI-NEXT:    s_setpc_b64 s[30:31]
9409;
9410; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret:
9411; GFX9:       ; %bb.0:
9412; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9413; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
9414; GFX9-NEXT:    s_waitcnt vmcnt(0)
9415; GFX9-NEXT:    buffer_wbinvl1_vol
9416; GFX9-NEXT:    s_setpc_b64 s[30:31]
9417  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
9418  ret i64 %result
9419}
9420
; Function test: seq_cst uinc_wrap atomicrmw on the i64 at out[4] in
; addrspace(1); the atomicrmw result is returned. The checks below expect a
; single *_atomic_inc_x2 instruction with glc; SI and GFX9 encode the 32-byte
; offset in the instruction, while VI adds 32 to the address registers first.
9421define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
9422; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
9423; SI:       ; %bb.0:
9424; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9425; SI-NEXT:    s_mov_b32 s6, 0
9426; SI-NEXT:    s_mov_b32 s7, 0xf000
9427; SI-NEXT:    s_mov_b32 s4, s6
9428; SI-NEXT:    s_mov_b32 s5, s6
9429; SI-NEXT:    buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
9430; SI-NEXT:    s_waitcnt vmcnt(0)
9431; SI-NEXT:    buffer_wbinvl1
9432; SI-NEXT:    v_mov_b32_e32 v0, v2
9433; SI-NEXT:    v_mov_b32_e32 v1, v3
9434; SI-NEXT:    s_waitcnt expcnt(0)
9435; SI-NEXT:    s_setpc_b64 s[30:31]
9436;
9437; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
9438; VI:       ; %bb.0:
9439; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9440; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
9441; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9442; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
9443; VI-NEXT:    s_waitcnt vmcnt(0)
9444; VI-NEXT:    buffer_wbinvl1_vol
9445; VI-NEXT:    s_setpc_b64 s[30:31]
9446;
9447; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
9448; GFX9:       ; %bb.0:
9449; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9450; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
9451; GFX9-NEXT:    s_waitcnt vmcnt(0)
9452; GFX9-NEXT:    buffer_wbinvl1_vol
9453; GFX9-NEXT:    s_setpc_b64 s[30:31]
; 4 i64 elements past %out, i.e. the 32-byte offset seen in the checks.
9454  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9455  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
9456  ret i64 %result
9457}
9458
; Non-returning i64 `atomicrmw uinc_wrap` (seq_cst) in an amdgpu_gfx function
; whose pointer and operand are both `inreg`; the checks show the pointer in
; s[4:5] and the operand in s[6:7]. Expected code per target:
;  - SI saves s6/s7 into lanes 0/1 of v2 (v2 itself is spilled and reloaded
;    with buffer_store_dword / buffer_load_dword), overwrites s6/s7 with -1 and
;    0xf000 so s[4:7] can serve as the SGPR operand of buffer_atomic_inc_x2,
;    and restores s6/s7 with v_readlane_b32 afterwards.
;  - VI copies operand and pointer into VGPRs and issues flat_atomic_inc_x2.
;  - GFX9 keeps the pointer in s[4:5] and passes a zeroed v2 as the VGPR
;    address operand of global_atomic_inc_x2.
; The result is unused, so no target is expected to set `glc`.
9459define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
9460; SI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
9461; SI:       ; %bb.0:
9462; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9463; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9464; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
9465; SI-NEXT:    s_mov_b64 exec, s[34:35]
9466; SI-NEXT:    s_waitcnt expcnt(0)
9467; SI-NEXT:    v_writelane_b32 v2, s6, 0
9468; SI-NEXT:    v_writelane_b32 v2, s7, 1
9469; SI-NEXT:    s_mov_b32 s34, s7
9470; SI-NEXT:    s_mov_b32 s35, s6
9471; SI-NEXT:    s_mov_b32 s7, 0xf000
9472; SI-NEXT:    s_mov_b32 s6, -1
9473; SI-NEXT:    v_mov_b32_e32 v0, s35
9474; SI-NEXT:    v_mov_b32_e32 v1, s34
9475; SI-NEXT:    s_waitcnt vmcnt(0)
9476; SI-NEXT:    buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0
9477; SI-NEXT:    s_waitcnt vmcnt(0)
9478; SI-NEXT:    buffer_wbinvl1
9479; SI-NEXT:    v_readlane_b32 s7, v2, 1
9480; SI-NEXT:    v_readlane_b32 s6, v2, 0
9481; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9482; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
9483; SI-NEXT:    s_mov_b64 exec, s[34:35]
9484; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9485; SI-NEXT:    s_setpc_b64 s[30:31]
9486;
9487; VI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
9488; VI:       ; %bb.0:
9489; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9490; VI-NEXT:    v_mov_b32_e32 v0, s6
9491; VI-NEXT:    v_mov_b32_e32 v1, s7
9492; VI-NEXT:    v_mov_b32_e32 v2, s4
9493; VI-NEXT:    v_mov_b32_e32 v3, s5
9494; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
9495; VI-NEXT:    s_waitcnt vmcnt(0)
9496; VI-NEXT:    buffer_wbinvl1_vol
9497; VI-NEXT:    s_setpc_b64 s[30:31]
9498;
9499; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
9500; GFX9:       ; %bb.0:
9501; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9502; GFX9-NEXT:    v_mov_b32_e32 v0, s6
9503; GFX9-NEXT:    v_mov_b32_e32 v1, s7
9504; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9505; GFX9-NEXT:    global_atomic_inc_x2 v2, v[0:1], s[4:5]
9506; GFX9-NEXT:    s_waitcnt vmcnt(0)
9507; GFX9-NEXT:    buffer_wbinvl1_vol
9508; GFX9-NEXT:    s_setpc_b64 s[30:31]
9509  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
9510  ret void
9511}
9512
; Non-returning i64 `atomicrmw uinc_wrap` (seq_cst) in an amdgpu_gfx function
; with `inreg` arguments, applied to `%out` plus four i64 elements (byte offset
; 32). The SI and GFX9 checks expect the offset folded as `offset:32` on
; buffer_atomic_inc_x2 / global_atomic_inc_x2. The VI checks expect the address
; to be formed with s_add_u32 / s_addc_u32 into s[34:35] and then copied into
; v[2:3] for flat_atomic_inc_x2. As in the other SI scalar checks, s6/s7 are
; parked in lanes of v2 while s[4:7] is reused as the buffer atomic's SGPR
; operand. No `glc` is expected because the result is unused.
9513define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
9514; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar:
9515; SI:       ; %bb.0:
9516; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9517; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9518; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
9519; SI-NEXT:    s_mov_b64 exec, s[34:35]
9520; SI-NEXT:    s_waitcnt expcnt(0)
9521; SI-NEXT:    v_writelane_b32 v2, s6, 0
9522; SI-NEXT:    v_writelane_b32 v2, s7, 1
9523; SI-NEXT:    v_mov_b32_e32 v0, s6
9524; SI-NEXT:    v_mov_b32_e32 v1, s7
9525; SI-NEXT:    s_mov_b32 s7, 0xf000
9526; SI-NEXT:    s_mov_b32 s6, -1
9527; SI-NEXT:    s_waitcnt vmcnt(0)
9528; SI-NEXT:    buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32
9529; SI-NEXT:    s_waitcnt vmcnt(0)
9530; SI-NEXT:    buffer_wbinvl1
9531; SI-NEXT:    v_readlane_b32 s7, v2, 1
9532; SI-NEXT:    v_readlane_b32 s6, v2, 0
9533; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9534; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
9535; SI-NEXT:    s_mov_b64 exec, s[34:35]
9536; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9537; SI-NEXT:    s_setpc_b64 s[30:31]
9538;
9539; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar:
9540; VI:       ; %bb.0:
9541; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9542; VI-NEXT:    s_add_u32 s34, s4, 32
9543; VI-NEXT:    s_addc_u32 s35, s5, 0
9544; VI-NEXT:    v_mov_b32_e32 v2, s34
9545; VI-NEXT:    v_mov_b32_e32 v0, s6
9546; VI-NEXT:    v_mov_b32_e32 v1, s7
9547; VI-NEXT:    v_mov_b32_e32 v3, s35
9548; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
9549; VI-NEXT:    s_waitcnt vmcnt(0)
9550; VI-NEXT:    buffer_wbinvl1_vol
9551; VI-NEXT:    s_setpc_b64 s[30:31]
9552;
9553; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar:
9554; GFX9:       ; %bb.0:
9555; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9556; GFX9-NEXT:    v_mov_b32_e32 v0, s6
9557; GFX9-NEXT:    v_mov_b32_e32 v1, s7
9558; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9559; GFX9-NEXT:    global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32
9560; GFX9-NEXT:    s_waitcnt vmcnt(0)
9561; GFX9-NEXT:    buffer_wbinvl1_vol
9562; GFX9-NEXT:    s_setpc_b64 s[30:31]
9563  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9564  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
9565  ret void
9566}
9567
; Returning i64 `atomicrmw uinc_wrap` (seq_cst) in an amdgpu_gfx function with
; `inreg` pointer (s[4:5] in the checks) and operand (s[6:7]). The expected
; code mirrors the non-returning scalar case except that every target sets
; `glc` on the atomic and the old value is produced in v[0:1]:
;  - SI parks s6/s7 in lanes of v2, rewrites them to -1 / 0xf000, and issues
;    buffer_atomic_inc_x2 on s[4:7] with the operand in v[0:1].
;  - VI copies operand and pointer into VGPRs for flat_atomic_inc_x2.
;  - GFX9 uses global_atomic_inc_x2 with s[4:5] plus a zeroed v2.
9568define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
9569; SI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
9570; SI:       ; %bb.0:
9571; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9572; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9573; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
9574; SI-NEXT:    s_mov_b64 exec, s[34:35]
9575; SI-NEXT:    s_waitcnt expcnt(0)
9576; SI-NEXT:    v_writelane_b32 v2, s6, 0
9577; SI-NEXT:    v_writelane_b32 v2, s7, 1
9578; SI-NEXT:    s_mov_b32 s34, s7
9579; SI-NEXT:    s_mov_b32 s35, s6
9580; SI-NEXT:    s_mov_b32 s7, 0xf000
9581; SI-NEXT:    s_mov_b32 s6, -1
9582; SI-NEXT:    v_mov_b32_e32 v0, s35
9583; SI-NEXT:    v_mov_b32_e32 v1, s34
9584; SI-NEXT:    s_waitcnt vmcnt(0)
9585; SI-NEXT:    buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 glc
9586; SI-NEXT:    s_waitcnt vmcnt(0)
9587; SI-NEXT:    buffer_wbinvl1
9588; SI-NEXT:    v_readlane_b32 s7, v2, 1
9589; SI-NEXT:    v_readlane_b32 s6, v2, 0
9590; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9591; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
9592; SI-NEXT:    s_mov_b64 exec, s[34:35]
9593; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9594; SI-NEXT:    s_setpc_b64 s[30:31]
9595;
9596; VI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
9597; VI:       ; %bb.0:
9598; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9599; VI-NEXT:    v_mov_b32_e32 v0, s6
9600; VI-NEXT:    v_mov_b32_e32 v1, s7
9601; VI-NEXT:    v_mov_b32_e32 v2, s4
9602; VI-NEXT:    v_mov_b32_e32 v3, s5
9603; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
9604; VI-NEXT:    s_waitcnt vmcnt(0)
9605; VI-NEXT:    buffer_wbinvl1_vol
9606; VI-NEXT:    s_setpc_b64 s[30:31]
9607;
9608; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
9609; GFX9:       ; %bb.0:
9610; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9611; GFX9-NEXT:    v_mov_b32_e32 v0, s6
9612; GFX9-NEXT:    v_mov_b32_e32 v1, s7
9613; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9614; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] glc
9615; GFX9-NEXT:    s_waitcnt vmcnt(0)
9616; GFX9-NEXT:    buffer_wbinvl1_vol
9617; GFX9-NEXT:    s_setpc_b64 s[30:31]
9618  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
9619  ret i64 %result
9620}
9621
; Returning i64 `atomicrmw uinc_wrap` (seq_cst) in an amdgpu_gfx function with
; `inreg` arguments, applied to `%out` plus four i64 elements (byte offset 32).
; The SI and GFX9 checks expect `offset:32 glc` on buffer_atomic_inc_x2 /
; global_atomic_inc_x2; the VI checks expect s_add_u32 / s_addc_u32 to form
; the address in s[34:35] before it is copied to v[2:3] for
; flat_atomic_inc_x2 with `glc`. The old value is produced in v[0:1] on all
; three targets.
9622define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
9623; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar:
9624; SI:       ; %bb.0:
9625; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9626; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9627; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
9628; SI-NEXT:    s_mov_b64 exec, s[34:35]
9629; SI-NEXT:    s_waitcnt expcnt(0)
9630; SI-NEXT:    v_writelane_b32 v2, s6, 0
9631; SI-NEXT:    v_writelane_b32 v2, s7, 1
9632; SI-NEXT:    v_mov_b32_e32 v0, s6
9633; SI-NEXT:    v_mov_b32_e32 v1, s7
9634; SI-NEXT:    s_mov_b32 s7, 0xf000
9635; SI-NEXT:    s_mov_b32 s6, -1
9636; SI-NEXT:    s_waitcnt vmcnt(0)
9637; SI-NEXT:    buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc
9638; SI-NEXT:    s_waitcnt vmcnt(0)
9639; SI-NEXT:    buffer_wbinvl1
9640; SI-NEXT:    v_readlane_b32 s7, v2, 1
9641; SI-NEXT:    v_readlane_b32 s6, v2, 0
9642; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9643; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
9644; SI-NEXT:    s_mov_b64 exec, s[34:35]
9645; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9646; SI-NEXT:    s_setpc_b64 s[30:31]
9647;
9648; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar:
9649; VI:       ; %bb.0:
9650; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9651; VI-NEXT:    s_add_u32 s34, s4, 32
9652; VI-NEXT:    s_addc_u32 s35, s5, 0
9653; VI-NEXT:    v_mov_b32_e32 v2, s34
9654; VI-NEXT:    v_mov_b32_e32 v0, s6
9655; VI-NEXT:    v_mov_b32_e32 v1, s7
9656; VI-NEXT:    v_mov_b32_e32 v3, s35
9657; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
9658; VI-NEXT:    s_waitcnt vmcnt(0)
9659; VI-NEXT:    buffer_wbinvl1_vol
9660; VI-NEXT:    s_setpc_b64 s[30:31]
9661;
9662; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar:
9663; GFX9:       ; %bb.0:
9664; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9665; GFX9-NEXT:    v_mov_b32_e32 v0, s6
9666; GFX9-NEXT:    v_mov_b32_e32 v1, s7
9667; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9668; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
9669; GFX9-NEXT:    s_waitcnt vmcnt(0)
9670; GFX9-NEXT:    buffer_wbinvl1_vol
9671; GFX9-NEXT:    s_setpc_b64 s[30:31]
9672  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9673  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
9674  ret i64 %result
9675}
9676
; Non-returning i64 `atomicrmw uinc_wrap` (seq_cst) at byte offset 32 from
; `%out`, with `!amdgpu.no.remote.memory` attached to the atomicrmw. The checks
; expect a single-block function containing one native increment atomic without
; `glc` on every target: `offset:32` is folded on SI and GFX9, while VI adds 32
; to the pointer with v_add_u32 / v_addc_u32 before flat_atomic_inc_x2.
9677define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
9678; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
9679; SI:       ; %bb.0:
9680; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9681; SI-NEXT:    s_mov_b32 s6, 0
9682; SI-NEXT:    s_mov_b32 s7, 0xf000
9683; SI-NEXT:    s_mov_b32 s4, s6
9684; SI-NEXT:    s_mov_b32 s5, s6
9685; SI-NEXT:    buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
9686; SI-NEXT:    s_waitcnt vmcnt(0)
9687; SI-NEXT:    buffer_wbinvl1
9688; SI-NEXT:    s_waitcnt expcnt(0)
9689; SI-NEXT:    s_setpc_b64 s[30:31]
9690;
9691; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
9692; VI:       ; %bb.0:
9693; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9694; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
9695; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9696; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
9697; VI-NEXT:    s_waitcnt vmcnt(0)
9698; VI-NEXT:    buffer_wbinvl1_vol
9699; VI-NEXT:    s_setpc_b64 s[30:31]
9700;
9701; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
9702; GFX9:       ; %bb.0:
9703; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9704; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[2:3], off offset:32
9705; GFX9-NEXT:    s_waitcnt vmcnt(0)
9706; GFX9-NEXT:    buffer_wbinvl1_vol
9707; GFX9-NEXT:    s_setpc_b64 s[30:31]
9708  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9709  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
9710  ret void
9711}
9712
; Returning i64 `atomicrmw uinc_wrap` (seq_cst) at byte offset 32 from `%out`,
; with `!amdgpu.no.remote.memory` attached to the atomicrmw. The expected
; instruction sequences are the same as those checked for
; global_atomic_uinc_wrap_i64_ret_offset: one native increment atomic with
; `glc` per target, `offset:32` folded on SI and GFX9, and an explicit
; v_add_u32 / v_addc_u32 on VI.
9713define i64 @global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
9714; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
9715; SI:       ; %bb.0:
9716; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9717; SI-NEXT:    s_mov_b32 s6, 0
9718; SI-NEXT:    s_mov_b32 s7, 0xf000
9719; SI-NEXT:    s_mov_b32 s4, s6
9720; SI-NEXT:    s_mov_b32 s5, s6
9721; SI-NEXT:    buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
9722; SI-NEXT:    s_waitcnt vmcnt(0)
9723; SI-NEXT:    buffer_wbinvl1
9724; SI-NEXT:    v_mov_b32_e32 v0, v2
9725; SI-NEXT:    v_mov_b32_e32 v1, v3
9726; SI-NEXT:    s_waitcnt expcnt(0)
9727; SI-NEXT:    s_setpc_b64 s[30:31]
9728;
9729; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
9730; VI:       ; %bb.0:
9731; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9732; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
9733; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9734; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
9735; VI-NEXT:    s_waitcnt vmcnt(0)
9736; VI-NEXT:    buffer_wbinvl1_vol
9737; VI-NEXT:    s_setpc_b64 s[30:31]
9738;
9739; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
9740; GFX9:       ; %bb.0:
9741; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9742; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
9743; GFX9-NEXT:    s_waitcnt vmcnt(0)
9744; GFX9-NEXT:    buffer_wbinvl1_vol
9745; GFX9-NEXT:    s_setpc_b64 s[30:31]
9746  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9747  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
9748  ret i64 %result
9749}
9750
9751; ---------------------------------------------------------------------
9752; atomicrmw udec_wrap
9753; ---------------------------------------------------------------------
9754
; Non-returning i64 `atomicrmw udec_wrap` (seq_cst, no syncscope given) on a
; global (addrspace(1)) pointer. All three run lines expect a single native
; 64-bit decrement atomic without `glc` (the result is unused):
; buffer_atomic_dec_x2 with addr64 for SI, flat_atomic_dec_x2 for VI and
; global_atomic_dec_x2 for GFX9, each followed by s_waitcnt vmcnt(0) and a
; buffer_wbinvl1 / buffer_wbinvl1_vol.
9755define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
9756; SI-LABEL: global_atomic_udec_wrap_i64_noret:
9757; SI:       ; %bb.0:
9758; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9759; SI-NEXT:    s_mov_b32 s6, 0
9760; SI-NEXT:    s_mov_b32 s7, 0xf000
9761; SI-NEXT:    s_mov_b32 s4, s6
9762; SI-NEXT:    s_mov_b32 s5, s6
9763; SI-NEXT:    buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64
9764; SI-NEXT:    s_waitcnt vmcnt(0)
9765; SI-NEXT:    buffer_wbinvl1
9766; SI-NEXT:    s_waitcnt expcnt(0)
9767; SI-NEXT:    s_setpc_b64 s[30:31]
9768;
9769; VI-LABEL: global_atomic_udec_wrap_i64_noret:
9770; VI:       ; %bb.0:
9771; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9772; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
9773; VI-NEXT:    s_waitcnt vmcnt(0)
9774; VI-NEXT:    buffer_wbinvl1_vol
9775; VI-NEXT:    s_setpc_b64 s[30:31]
9776;
9777; GFX9-LABEL: global_atomic_udec_wrap_i64_noret:
9778; GFX9:       ; %bb.0:
9779; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9780; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[2:3], off
9781; GFX9-NEXT:    s_waitcnt vmcnt(0)
9782; GFX9-NEXT:    buffer_wbinvl1_vol
9783; GFX9-NEXT:    s_setpc_b64 s[30:31]
9784  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
9785  ret void
9786}
9787
; Non-returning i64 `atomicrmw udec_wrap` (seq_cst) on `%out` advanced by four
; i64 elements, i.e. a byte offset of 32. The SI and GFX9 checks expect the
; offset folded into the atomic as `offset:32`; the VI checks expect a 64-bit
; v_add_u32 / v_addc_u32 on the pointer before flat_atomic_dec_x2. No `glc` is
; expected because the result is unused.
9788define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
9789; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset:
9790; SI:       ; %bb.0:
9791; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9792; SI-NEXT:    s_mov_b32 s6, 0
9793; SI-NEXT:    s_mov_b32 s7, 0xf000
9794; SI-NEXT:    s_mov_b32 s4, s6
9795; SI-NEXT:    s_mov_b32 s5, s6
9796; SI-NEXT:    buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
9797; SI-NEXT:    s_waitcnt vmcnt(0)
9798; SI-NEXT:    buffer_wbinvl1
9799; SI-NEXT:    s_waitcnt expcnt(0)
9800; SI-NEXT:    s_setpc_b64 s[30:31]
9801;
9802; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset:
9803; VI:       ; %bb.0:
9804; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9805; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
9806; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9807; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
9808; VI-NEXT:    s_waitcnt vmcnt(0)
9809; VI-NEXT:    buffer_wbinvl1_vol
9810; VI-NEXT:    s_setpc_b64 s[30:31]
9811;
9812; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset:
9813; GFX9:       ; %bb.0:
9814; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9815; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[2:3], off offset:32
9816; GFX9-NEXT:    s_waitcnt vmcnt(0)
9817; GFX9-NEXT:    buffer_wbinvl1_vol
9818; GFX9-NEXT:    s_setpc_b64 s[30:31]
9819  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9820  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
9821  ret void
9822}
9823
; Returning i64 `atomicrmw udec_wrap` (seq_cst, no syncscope given) on a global
; (addrspace(1)) pointer. All three run lines expect a single native 64-bit
; decrement atomic carrying `glc`: buffer_atomic_dec_x2 with addr64 for SI,
; flat_atomic_dec_x2 for VI and global_atomic_dec_x2 for GFX9. The SI checks
; receive the old value in v[2:3] and copy it into v[0:1]; the other two
; targets write it to v[0:1] directly.
9824define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
9825; SI-LABEL: global_atomic_udec_wrap_i64_ret:
9826; SI:       ; %bb.0:
9827; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9828; SI-NEXT:    s_mov_b32 s6, 0
9829; SI-NEXT:    s_mov_b32 s7, 0xf000
9830; SI-NEXT:    s_mov_b32 s4, s6
9831; SI-NEXT:    s_mov_b32 s5, s6
9832; SI-NEXT:    buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
9833; SI-NEXT:    s_waitcnt vmcnt(0)
9834; SI-NEXT:    buffer_wbinvl1
9835; SI-NEXT:    v_mov_b32_e32 v0, v2
9836; SI-NEXT:    v_mov_b32_e32 v1, v3
9837; SI-NEXT:    s_waitcnt expcnt(0)
9838; SI-NEXT:    s_setpc_b64 s[30:31]
9839;
9840; VI-LABEL: global_atomic_udec_wrap_i64_ret:
9841; VI:       ; %bb.0:
9842; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9843; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
9844; VI-NEXT:    s_waitcnt vmcnt(0)
9845; VI-NEXT:    buffer_wbinvl1_vol
9846; VI-NEXT:    s_setpc_b64 s[30:31]
9847;
9848; GFX9-LABEL: global_atomic_udec_wrap_i64_ret:
9849; GFX9:       ; %bb.0:
9850; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9851; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off glc
9852; GFX9-NEXT:    s_waitcnt vmcnt(0)
9853; GFX9-NEXT:    buffer_wbinvl1_vol
9854; GFX9-NEXT:    s_setpc_b64 s[30:31]
9855  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
9856  ret i64 %result
9857}
9858
; Returning i64 `atomicrmw udec_wrap` (seq_cst) on `%out` advanced by four i64
; elements, i.e. a byte offset of 32. The SI and GFX9 checks expect the offset
; folded into the atomic as `offset:32`; the VI checks instead expect a 64-bit
; v_add_u32 / v_addc_u32 on the pointer before flat_atomic_dec_x2. All targets
; keep `glc` on the atomic because the old value is returned.
9859define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
9860; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset:
9861; SI:       ; %bb.0:
9862; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9863; SI-NEXT:    s_mov_b32 s6, 0
9864; SI-NEXT:    s_mov_b32 s7, 0xf000
9865; SI-NEXT:    s_mov_b32 s4, s6
9866; SI-NEXT:    s_mov_b32 s5, s6
9867; SI-NEXT:    buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
9868; SI-NEXT:    s_waitcnt vmcnt(0)
9869; SI-NEXT:    buffer_wbinvl1
9870; SI-NEXT:    v_mov_b32_e32 v0, v2
9871; SI-NEXT:    v_mov_b32_e32 v1, v3
9872; SI-NEXT:    s_waitcnt expcnt(0)
9873; SI-NEXT:    s_setpc_b64 s[30:31]
9874;
9875; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset:
9876; VI:       ; %bb.0:
9877; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9878; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
9879; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9880; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
9881; VI-NEXT:    s_waitcnt vmcnt(0)
9882; VI-NEXT:    buffer_wbinvl1_vol
9883; VI-NEXT:    s_setpc_b64 s[30:31]
9884;
9885; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset:
9886; GFX9:       ; %bb.0:
9887; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9888; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
9889; GFX9-NEXT:    s_waitcnt vmcnt(0)
9890; GFX9-NEXT:    buffer_wbinvl1_vol
9891; GFX9-NEXT:    s_setpc_b64 s[30:31]
9892  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9893  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
9894  ret i64 %result
9895}
9896
; Non-returning i64 `atomicrmw udec_wrap` (seq_cst) in an amdgpu_gfx function
; whose pointer and operand are both `inreg`; the checks show the pointer in
; s[4:5] and the operand in s[6:7]. Expected code per target:
;  - SI saves s6/s7 into lanes 0/1 of v2 (v2 itself is spilled and reloaded
;    with buffer_store_dword / buffer_load_dword), overwrites s6/s7 with -1 and
;    0xf000 so s[4:7] can serve as the SGPR operand of buffer_atomic_dec_x2,
;    and restores s6/s7 with v_readlane_b32 afterwards.
;  - VI copies operand and pointer into VGPRs and issues flat_atomic_dec_x2.
;  - GFX9 keeps the pointer in s[4:5] and passes a zeroed v2 as the VGPR
;    address operand of global_atomic_dec_x2.
; The result is unused, so no target is expected to set `glc`.
9897define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
9898; SI-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
9899; SI:       ; %bb.0:
9900; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9901; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9902; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
9903; SI-NEXT:    s_mov_b64 exec, s[34:35]
9904; SI-NEXT:    s_waitcnt expcnt(0)
9905; SI-NEXT:    v_writelane_b32 v2, s6, 0
9906; SI-NEXT:    v_writelane_b32 v2, s7, 1
9907; SI-NEXT:    s_mov_b32 s34, s7
9908; SI-NEXT:    s_mov_b32 s35, s6
9909; SI-NEXT:    s_mov_b32 s7, 0xf000
9910; SI-NEXT:    s_mov_b32 s6, -1
9911; SI-NEXT:    v_mov_b32_e32 v0, s35
9912; SI-NEXT:    v_mov_b32_e32 v1, s34
9913; SI-NEXT:    s_waitcnt vmcnt(0)
9914; SI-NEXT:    buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0
9915; SI-NEXT:    s_waitcnt vmcnt(0)
9916; SI-NEXT:    buffer_wbinvl1
9917; SI-NEXT:    v_readlane_b32 s7, v2, 1
9918; SI-NEXT:    v_readlane_b32 s6, v2, 0
9919; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9920; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
9921; SI-NEXT:    s_mov_b64 exec, s[34:35]
9922; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9923; SI-NEXT:    s_setpc_b64 s[30:31]
9924;
9925; VI-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
9926; VI:       ; %bb.0:
9927; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9928; VI-NEXT:    v_mov_b32_e32 v0, s6
9929; VI-NEXT:    v_mov_b32_e32 v1, s7
9930; VI-NEXT:    v_mov_b32_e32 v2, s4
9931; VI-NEXT:    v_mov_b32_e32 v3, s5
9932; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
9933; VI-NEXT:    s_waitcnt vmcnt(0)
9934; VI-NEXT:    buffer_wbinvl1_vol
9935; VI-NEXT:    s_setpc_b64 s[30:31]
9936;
9937; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
9938; GFX9:       ; %bb.0:
9939; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9940; GFX9-NEXT:    v_mov_b32_e32 v0, s6
9941; GFX9-NEXT:    v_mov_b32_e32 v1, s7
9942; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9943; GFX9-NEXT:    global_atomic_dec_x2 v2, v[0:1], s[4:5]
9944; GFX9-NEXT:    s_waitcnt vmcnt(0)
9945; GFX9-NEXT:    buffer_wbinvl1_vol
9946; GFX9-NEXT:    s_setpc_b64 s[30:31]
9947  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
9948  ret void
9949}
9950
; Non-returning i64 `atomicrmw udec_wrap` (seq_cst) in an amdgpu_gfx function
; with `inreg` arguments, applied to `%out` plus four i64 elements (byte offset
; 32). The SI and GFX9 checks expect the offset folded as `offset:32` on
; buffer_atomic_dec_x2 / global_atomic_dec_x2. The VI checks expect the address
; to be formed with s_add_u32 / s_addc_u32 into s[34:35] and then copied into
; v[2:3] for flat_atomic_dec_x2. As in the other SI scalar checks, s6/s7 are
; parked in lanes of v2 while s[4:7] is reused as the buffer atomic's SGPR
; operand. No `glc` is expected because the result is unused.
9951define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
9952; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar:
9953; SI:       ; %bb.0:
9954; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9955; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9956; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
9957; SI-NEXT:    s_mov_b64 exec, s[34:35]
9958; SI-NEXT:    s_waitcnt expcnt(0)
9959; SI-NEXT:    v_writelane_b32 v2, s6, 0
9960; SI-NEXT:    v_writelane_b32 v2, s7, 1
9961; SI-NEXT:    v_mov_b32_e32 v0, s6
9962; SI-NEXT:    v_mov_b32_e32 v1, s7
9963; SI-NEXT:    s_mov_b32 s7, 0xf000
9964; SI-NEXT:    s_mov_b32 s6, -1
9965; SI-NEXT:    s_waitcnt vmcnt(0)
9966; SI-NEXT:    buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32
9967; SI-NEXT:    s_waitcnt vmcnt(0)
9968; SI-NEXT:    buffer_wbinvl1
9969; SI-NEXT:    v_readlane_b32 s7, v2, 1
9970; SI-NEXT:    v_readlane_b32 s6, v2, 0
9971; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
9972; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
9973; SI-NEXT:    s_mov_b64 exec, s[34:35]
9974; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
9975; SI-NEXT:    s_setpc_b64 s[30:31]
9976;
9977; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar:
9978; VI:       ; %bb.0:
9979; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9980; VI-NEXT:    s_add_u32 s34, s4, 32
9981; VI-NEXT:    s_addc_u32 s35, s5, 0
9982; VI-NEXT:    v_mov_b32_e32 v2, s34
9983; VI-NEXT:    v_mov_b32_e32 v0, s6
9984; VI-NEXT:    v_mov_b32_e32 v1, s7
9985; VI-NEXT:    v_mov_b32_e32 v3, s35
9986; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
9987; VI-NEXT:    s_waitcnt vmcnt(0)
9988; VI-NEXT:    buffer_wbinvl1_vol
9989; VI-NEXT:    s_setpc_b64 s[30:31]
9990;
9991; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar:
9992; GFX9:       ; %bb.0:
9993; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9994; GFX9-NEXT:    v_mov_b32_e32 v0, s6
9995; GFX9-NEXT:    v_mov_b32_e32 v1, s7
9996; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9997; GFX9-NEXT:    global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32
9998; GFX9-NEXT:    s_waitcnt vmcnt(0)
9999; GFX9-NEXT:    buffer_wbinvl1_vol
10000; GFX9-NEXT:    s_setpc_b64 s[30:31]
10001  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
10002  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
10003  ret void
10004}
10005
; Returning i64 `atomicrmw udec_wrap` (seq_cst) in an amdgpu_gfx function with
; `inreg` pointer (s[4:5] in the checks) and operand (s[6:7]). The expected
; code mirrors the non-returning scalar case except that every target sets
; `glc` on the atomic and the old value is produced in v[0:1]:
;  - SI parks s6/s7 in lanes of v2, rewrites them to -1 / 0xf000, and issues
;    buffer_atomic_dec_x2 on s[4:7] with the operand in v[0:1].
;  - VI copies operand and pointer into VGPRs for flat_atomic_dec_x2.
;  - GFX9 uses global_atomic_dec_x2 with s[4:5] plus a zeroed v2.
10006define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
10007; SI-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
10008; SI:       ; %bb.0:
10009; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10010; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
10011; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
10012; SI-NEXT:    s_mov_b64 exec, s[34:35]
10013; SI-NEXT:    s_waitcnt expcnt(0)
10014; SI-NEXT:    v_writelane_b32 v2, s6, 0
10015; SI-NEXT:    v_writelane_b32 v2, s7, 1
10016; SI-NEXT:    s_mov_b32 s34, s7
10017; SI-NEXT:    s_mov_b32 s35, s6
10018; SI-NEXT:    s_mov_b32 s7, 0xf000
10019; SI-NEXT:    s_mov_b32 s6, -1
10020; SI-NEXT:    v_mov_b32_e32 v0, s35
10021; SI-NEXT:    v_mov_b32_e32 v1, s34
10022; SI-NEXT:    s_waitcnt vmcnt(0)
10023; SI-NEXT:    buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 glc
10024; SI-NEXT:    s_waitcnt vmcnt(0)
10025; SI-NEXT:    buffer_wbinvl1
10026; SI-NEXT:    v_readlane_b32 s7, v2, 1
10027; SI-NEXT:    v_readlane_b32 s6, v2, 0
10028; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
10029; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
10030; SI-NEXT:    s_mov_b64 exec, s[34:35]
10031; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
10032; SI-NEXT:    s_setpc_b64 s[30:31]
10033;
10034; VI-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
10035; VI:       ; %bb.0:
10036; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10037; VI-NEXT:    v_mov_b32_e32 v0, s6
10038; VI-NEXT:    v_mov_b32_e32 v1, s7
10039; VI-NEXT:    v_mov_b32_e32 v2, s4
10040; VI-NEXT:    v_mov_b32_e32 v3, s5
10041; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
10042; VI-NEXT:    s_waitcnt vmcnt(0)
10043; VI-NEXT:    buffer_wbinvl1_vol
10044; VI-NEXT:    s_setpc_b64 s[30:31]
10045;
10046; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
10047; GFX9:       ; %bb.0:
10048; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10049; GFX9-NEXT:    v_mov_b32_e32 v0, s6
10050; GFX9-NEXT:    v_mov_b32_e32 v1, s7
10051; GFX9-NEXT:    v_mov_b32_e32 v2, 0
10052; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] glc
10053; GFX9-NEXT:    s_waitcnt vmcnt(0)
10054; GFX9-NEXT:    buffer_wbinvl1_vol
10055; GFX9-NEXT:    s_setpc_b64 s[30:31]
10056  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
10057  ret i64 %result
10058}
10059
; Returning i64 `atomicrmw udec_wrap` (seq_cst) in an amdgpu_gfx function with
; `inreg` arguments, applied to `%out` plus four i64 elements (byte offset 32).
; The SI and GFX9 checks expect `offset:32 glc` on buffer_atomic_dec_x2 /
; global_atomic_dec_x2; the VI checks expect s_add_u32 / s_addc_u32 to form
; the address in s[34:35] before it is copied to v[2:3] for
; flat_atomic_dec_x2 with `glc`. The old value is produced in v[0:1] on all
; three targets.
10060define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
10061; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar:
10062; SI:       ; %bb.0:
10063; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10064; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
10065; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
10066; SI-NEXT:    s_mov_b64 exec, s[34:35]
10067; SI-NEXT:    s_waitcnt expcnt(0)
10068; SI-NEXT:    v_writelane_b32 v2, s6, 0
10069; SI-NEXT:    v_writelane_b32 v2, s7, 1
10070; SI-NEXT:    v_mov_b32_e32 v0, s6
10071; SI-NEXT:    v_mov_b32_e32 v1, s7
10072; SI-NEXT:    s_mov_b32 s7, 0xf000
10073; SI-NEXT:    s_mov_b32 s6, -1
10074; SI-NEXT:    s_waitcnt vmcnt(0)
10075; SI-NEXT:    buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc
10076; SI-NEXT:    s_waitcnt vmcnt(0)
10077; SI-NEXT:    buffer_wbinvl1
10078; SI-NEXT:    v_readlane_b32 s7, v2, 1
10079; SI-NEXT:    v_readlane_b32 s6, v2, 0
10080; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
10081; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
10082; SI-NEXT:    s_mov_b64 exec, s[34:35]
10083; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
10084; SI-NEXT:    s_setpc_b64 s[30:31]
10085;
10086; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar:
10087; VI:       ; %bb.0:
10088; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10089; VI-NEXT:    s_add_u32 s34, s4, 32
10090; VI-NEXT:    s_addc_u32 s35, s5, 0
10091; VI-NEXT:    v_mov_b32_e32 v2, s34
10092; VI-NEXT:    v_mov_b32_e32 v0, s6
10093; VI-NEXT:    v_mov_b32_e32 v1, s7
10094; VI-NEXT:    v_mov_b32_e32 v3, s35
10095; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
10096; VI-NEXT:    s_waitcnt vmcnt(0)
10097; VI-NEXT:    buffer_wbinvl1_vol
10098; VI-NEXT:    s_setpc_b64 s[30:31]
10099;
10100; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar:
10101; GFX9:       ; %bb.0:
10102; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10103; GFX9-NEXT:    v_mov_b32_e32 v0, s6
10104; GFX9-NEXT:    v_mov_b32_e32 v1, s7
10105; GFX9-NEXT:    v_mov_b32_e32 v2, 0
10106; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
10107; GFX9-NEXT:    s_waitcnt vmcnt(0)
10108; GFX9-NEXT:    buffer_wbinvl1_vol
10109; GFX9-NEXT:    s_setpc_b64 s[30:31]
10110  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
10111  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
10112  ret i64 %result
10113}
10114
; Non-returning i64 `atomicrmw udec_wrap` (seq_cst) at byte offset 32 from
; `%out`, with `!amdgpu.no.remote.memory` attached to the atomicrmw. The
; expected instruction sequences are the same as those checked for
; global_atomic_udec_wrap_i64_noret_offset: one native decrement atomic without
; `glc` per target, `offset:32` folded on SI and GFX9, and an explicit
; v_add_u32 / v_addc_u32 on VI.
10115define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
10116; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
10117; SI:       ; %bb.0:
10118; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10119; SI-NEXT:    s_mov_b32 s6, 0
10120; SI-NEXT:    s_mov_b32 s7, 0xf000
10121; SI-NEXT:    s_mov_b32 s4, s6
10122; SI-NEXT:    s_mov_b32 s5, s6
10123; SI-NEXT:    buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
10124; SI-NEXT:    s_waitcnt vmcnt(0)
10125; SI-NEXT:    buffer_wbinvl1
10126; SI-NEXT:    s_waitcnt expcnt(0)
10127; SI-NEXT:    s_setpc_b64 s[30:31]
10128;
10129; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
10130; VI:       ; %bb.0:
10131; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10132; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
10133; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10134; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
10135; VI-NEXT:    s_waitcnt vmcnt(0)
10136; VI-NEXT:    buffer_wbinvl1_vol
10137; VI-NEXT:    s_setpc_b64 s[30:31]
10138;
10139; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
10140; GFX9:       ; %bb.0:
10141; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10142; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[2:3], off offset:32
10143; GFX9-NEXT:    s_waitcnt vmcnt(0)
10144; GFX9-NEXT:    buffer_wbinvl1_vol
10145; GFX9-NEXT:    s_setpc_b64 s[30:31]
10146  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
10147  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
10148  ret void
10149}
10150
; Returning i64 `atomicrmw udec_wrap` (seq_cst) at byte offset 32 from `%out`,
; with `!amdgpu.no.remote.memory` attached to the atomicrmw. The expected
; instruction sequences are the same as those checked for
; global_atomic_udec_wrap_i64_ret_offset: one native decrement atomic with
; `glc` per target, `offset:32` folded on SI and GFX9, and an explicit
; v_add_u32 / v_addc_u32 on VI.
10151define i64 @global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
10152; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
10153; SI:       ; %bb.0:
10154; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10155; SI-NEXT:    s_mov_b32 s6, 0
10156; SI-NEXT:    s_mov_b32 s7, 0xf000
10157; SI-NEXT:    s_mov_b32 s4, s6
10158; SI-NEXT:    s_mov_b32 s5, s6
10159; SI-NEXT:    buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
10160; SI-NEXT:    s_waitcnt vmcnt(0)
10161; SI-NEXT:    buffer_wbinvl1
10162; SI-NEXT:    v_mov_b32_e32 v0, v2
10163; SI-NEXT:    v_mov_b32_e32 v1, v3
10164; SI-NEXT:    s_waitcnt expcnt(0)
10165; SI-NEXT:    s_setpc_b64 s[30:31]
10166;
10167; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
10168; VI:       ; %bb.0:
10169; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10170; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
10171; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10172; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
10173; VI-NEXT:    s_waitcnt vmcnt(0)
10174; VI-NEXT:    buffer_wbinvl1_vol
10175; VI-NEXT:    s_setpc_b64 s[30:31]
10176;
10177; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
10178; GFX9:       ; %bb.0:
10179; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10180; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
10181; GFX9-NEXT:    s_waitcnt vmcnt(0)
10182; GFX9-NEXT:    buffer_wbinvl1_vol
10183; GFX9-NEXT:    s_setpc_b64 s[30:31]
10184  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
10185  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
10186  ret i64 %result
10187}
10188
; Empty metadata node used as the operand of the `!amdgpu.no.remote.memory`
; annotations on the atomicrmw instructions in this file.
10189!0 = !{}
10190