xref: /llvm-project/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll (revision 2d6d723a85c2d007b0359c206d66cd2e5a9f00e1)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s
4; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
5; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s
6
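7; Test saddr addressing mode selection for global_* atomic min/max at system scope, which the checks below show expanded into a cmpswap retry loop where only the initial load uses the saddr form.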
8
9; --------------------------------------------------------------------------------
10; atomicrmw max
11; --------------------------------------------------------------------------------
12
; Signed 32-bit atomicrmw max (seq_cst, no syncscope) at %sbase + zext(%voffset),
; returning the previous memory value bitcast to float.
; On every target the checks show a retry loop around global_atomic_cmpswap:
; the initial load uses the saddr form (s[2:3] base with a VGPR offset), while
; the cmpswap itself addresses memory through the v[2:3] pair with "off".
13define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
14; GFX9-LABEL: global_max_saddr_i32_rtn:
15; GFX9:       ; %bb.0:
16; GFX9-NEXT:    v_mov_b32_e32 v2, v0
17; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
18; GFX9-NEXT:    v_mov_b32_e32 v3, s3
19; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
20; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
21; GFX9-NEXT:    s_mov_b64 s[0:1], 0
22; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
23; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
24; GFX9-NEXT:    s_waitcnt vmcnt(0)
25; GFX9-NEXT:    v_mov_b32_e32 v5, v0
26; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
27; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
28; GFX9-NEXT:    s_waitcnt vmcnt(0)
29; GFX9-NEXT:    buffer_wbinvl1
30; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
31; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
32; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
33; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
34; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
35; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
36; GFX9-NEXT:    ; return to shader part epilog
37;
38; GFX10-LABEL: global_max_saddr_i32_rtn:
39; GFX10:       ; %bb.0:
40; GFX10-NEXT:    v_mov_b32_e32 v2, v0
41; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
42; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
43; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
44; GFX10-NEXT:    s_mov_b64 s[0:1], 0
45; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
46; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
47; GFX10-NEXT:    s_waitcnt vmcnt(0)
48; GFX10-NEXT:    v_mov_b32_e32 v5, v0
49; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
50; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
51; GFX10-NEXT:    s_waitcnt vmcnt(0)
52; GFX10-NEXT:    buffer_gl1_inv
53; GFX10-NEXT:    buffer_gl0_inv
54; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
55; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
56; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
57; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
58; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
59; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
60; GFX10-NEXT:    ; return to shader part epilog
61;
62; GFX11-LABEL: global_max_saddr_i32_rtn:
63; GFX11:       ; %bb.0:
64; GFX11-NEXT:    v_mov_b32_e32 v2, v0
65; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
66; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
67; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
68; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
69; GFX11-NEXT:    s_mov_b64 s[0:1], 0
70; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
71; GFX11-NEXT:  .LBB0_1: ; %atomicrmw.start
72; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
73; GFX11-NEXT:    s_waitcnt vmcnt(0)
74; GFX11-NEXT:    v_mov_b32_e32 v5, v0
75; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
76; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
77; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
78; GFX11-NEXT:    s_waitcnt vmcnt(0)
79; GFX11-NEXT:    buffer_gl1_inv
80; GFX11-NEXT:    buffer_gl0_inv
81; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
82; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
83; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
84; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
85; GFX11-NEXT:    s_cbranch_execnz .LBB0_1
86; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
87; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
88; GFX11-NEXT:    ; return to shader part epilog
89;
90; GFX12-LABEL: global_max_saddr_i32_rtn:
91; GFX12:       ; %bb.0:
92; GFX12-NEXT:    v_mov_b32_e32 v2, v0
93; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
94; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
95; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
96; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
97; GFX12-NEXT:    s_mov_b64 s[0:1], 0
98; GFX12-NEXT:  .LBB0_1: ; %atomicrmw.start
99; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
100; GFX12-NEXT:    s_wait_loadcnt 0x0
101; GFX12-NEXT:    v_mov_b32_e32 v5, v0
102; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
103; GFX12-NEXT:    v_max_i32_e32 v4, v5, v1
104; GFX12-NEXT:    global_wb scope:SCOPE_SYS
105; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
106; GFX12-NEXT:    s_wait_loadcnt 0x0
107; GFX12-NEXT:    global_inv scope:SCOPE_SYS
108; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
109; GFX12-NEXT:    s_wait_alu 0xfffe
110; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
111; GFX12-NEXT:    s_wait_alu 0xfffe
112; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
113; GFX12-NEXT:    s_cbranch_execnz .LBB0_1
114; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
115; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
116; GFX12-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zero-extended 32-bit %voffset (byte-granular GEP).
117  %zext.offset = zext i32 %voffset to i64
118  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; No syncscope is given here; the gfx12 checks show scope:SCOPE_SYS.
119  %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
  ; The old value is handed back through the float return of the shader.
120  %cast.rtn = bitcast i32 %rtn to float
121  ret float %cast.rtn
122}
123
; Signed 32-bit atomicrmw max (seq_cst) at %sbase + zext(%voffset) - 128,
; returning the previous memory value bitcast to float.
; The checks show the constant -128 folded into the offset:-128 immediate of
; both the initial saddr-form load and the cmpswap inside the retry loop.
124define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
125; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
126; GFX9:       ; %bb.0:
127; GFX9-NEXT:    v_mov_b32_e32 v2, v0
128; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
129; GFX9-NEXT:    v_mov_b32_e32 v3, s3
130; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
131; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
132; GFX9-NEXT:    s_mov_b64 s[0:1], 0
133; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
134; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
135; GFX9-NEXT:    s_waitcnt vmcnt(0)
136; GFX9-NEXT:    v_mov_b32_e32 v5, v0
137; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
138; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
139; GFX9-NEXT:    s_waitcnt vmcnt(0)
140; GFX9-NEXT:    buffer_wbinvl1
141; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
142; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
143; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
144; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
145; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
146; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
147; GFX9-NEXT:    ; return to shader part epilog
148;
149; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
150; GFX10:       ; %bb.0:
151; GFX10-NEXT:    v_mov_b32_e32 v2, v0
152; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
153; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
154; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
155; GFX10-NEXT:    s_mov_b64 s[0:1], 0
156; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
157; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
158; GFX10-NEXT:    s_waitcnt vmcnt(0)
159; GFX10-NEXT:    v_mov_b32_e32 v5, v0
160; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
161; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
162; GFX10-NEXT:    s_waitcnt vmcnt(0)
163; GFX10-NEXT:    buffer_gl1_inv
164; GFX10-NEXT:    buffer_gl0_inv
165; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
166; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
167; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
168; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
169; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
170; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
171; GFX10-NEXT:    ; return to shader part epilog
172;
173; GFX11-LABEL: global_max_saddr_i32_rtn_neg128:
174; GFX11:       ; %bb.0:
175; GFX11-NEXT:    v_mov_b32_e32 v2, v0
176; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
177; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
178; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
179; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
180; GFX11-NEXT:    s_mov_b64 s[0:1], 0
181; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
182; GFX11-NEXT:  .LBB1_1: ; %atomicrmw.start
183; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
184; GFX11-NEXT:    s_waitcnt vmcnt(0)
185; GFX11-NEXT:    v_mov_b32_e32 v5, v0
186; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
187; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
188; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
189; GFX11-NEXT:    s_waitcnt vmcnt(0)
190; GFX11-NEXT:    buffer_gl1_inv
191; GFX11-NEXT:    buffer_gl0_inv
192; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
193; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
194; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
195; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
196; GFX11-NEXT:    s_cbranch_execnz .LBB1_1
197; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
198; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
199; GFX11-NEXT:    ; return to shader part epilog
200;
201; GFX12-LABEL: global_max_saddr_i32_rtn_neg128:
202; GFX12:       ; %bb.0:
203; GFX12-NEXT:    v_mov_b32_e32 v2, v0
204; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
205; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
206; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
207; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
208; GFX12-NEXT:    s_mov_b64 s[0:1], 0
209; GFX12-NEXT:  .LBB1_1: ; %atomicrmw.start
210; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
211; GFX12-NEXT:    s_wait_loadcnt 0x0
212; GFX12-NEXT:    v_mov_b32_e32 v5, v0
213; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
214; GFX12-NEXT:    v_max_i32_e32 v4, v5, v1
215; GFX12-NEXT:    global_wb scope:SCOPE_SYS
216; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
217; GFX12-NEXT:    s_wait_loadcnt 0x0
218; GFX12-NEXT:    global_inv scope:SCOPE_SYS
219; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
220; GFX12-NEXT:    s_wait_alu 0xfffe
221; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
222; GFX12-NEXT:    s_wait_alu 0xfffe
223; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
224; GFX12-NEXT:    s_cbranch_execnz .LBB1_1
225; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
226; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
227; GFX12-NEXT:    ; return to shader part epilog
  ; Variable part of the address: %sbase + zero-extended 32-bit %voffset.
228  %zext.offset = zext i32 %voffset to i64
229  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; Constant part: a further -128 bytes, expected to land in the immediate field.
230  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
231  %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
  ; The old value is handed back through the float return of the shader.
232  %cast.rtn = bitcast i32 %rtn to float
233  ret float %cast.rtn
234}
235
; Signed 32-bit atomicrmw max (seq_cst) at %sbase + zext(%voffset) whose result
; is discarded; the function returns void and the checks end in s_endpgm.
; The checks still show a returning cmpswap (glc / TH_ATOMIC_RETURN) inside the
; retry loop, since its result in v0 is compared against the expected value in
; v5 to decide whether to loop again.
236define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
237; GFX9-LABEL: global_max_saddr_i32_nortn:
238; GFX9:       ; %bb.0:
239; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
240; GFX9-NEXT:    v_mov_b32_e32 v3, s3
241; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
242; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
243; GFX9-NEXT:    s_mov_b64 s[0:1], 0
244; GFX9-NEXT:  .LBB2_1: ; %atomicrmw.start
245; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
246; GFX9-NEXT:    s_waitcnt vmcnt(0)
247; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
248; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
249; GFX9-NEXT:    s_waitcnt vmcnt(0)
250; GFX9-NEXT:    buffer_wbinvl1
251; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
252; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
253; GFX9-NEXT:    v_mov_b32_e32 v5, v0
254; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
255; GFX9-NEXT:    s_cbranch_execnz .LBB2_1
256; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
257; GFX9-NEXT:    s_endpgm
258;
259; GFX10-LABEL: global_max_saddr_i32_nortn:
260; GFX10:       ; %bb.0:
261; GFX10-NEXT:    global_load_dword v5, v0, s[2:3]
262; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
263; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
264; GFX10-NEXT:    s_mov_b64 s[0:1], 0
265; GFX10-NEXT:  .LBB2_1: ; %atomicrmw.start
266; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
267; GFX10-NEXT:    s_waitcnt vmcnt(0)
268; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
269; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
270; GFX10-NEXT:    s_waitcnt vmcnt(0)
271; GFX10-NEXT:    buffer_gl1_inv
272; GFX10-NEXT:    buffer_gl0_inv
273; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
274; GFX10-NEXT:    v_mov_b32_e32 v5, v0
275; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
276; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
277; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
278; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
279; GFX10-NEXT:    s_endpgm
280;
281; GFX11-LABEL: global_max_saddr_i32_nortn:
282; GFX11:       ; %bb.0:
283; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
284; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
285; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
286; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
287; GFX11-NEXT:    s_mov_b64 s[0:1], 0
288; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
289; GFX11-NEXT:  .LBB2_1: ; %atomicrmw.start
290; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
291; GFX11-NEXT:    s_waitcnt vmcnt(0)
292; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
293; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
294; GFX11-NEXT:    s_waitcnt vmcnt(0)
295; GFX11-NEXT:    buffer_gl1_inv
296; GFX11-NEXT:    buffer_gl0_inv
297; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
298; GFX11-NEXT:    v_mov_b32_e32 v5, v0
299; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
300; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
301; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
302; GFX11-NEXT:    s_cbranch_execnz .LBB2_1
303; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
304; GFX11-NEXT:    s_endpgm
305;
306; GFX12-LABEL: global_max_saddr_i32_nortn:
307; GFX12:       ; %bb.0:
308; GFX12-NEXT:    global_load_b32 v5, v0, s[2:3]
309; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
310; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
311; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
312; GFX12-NEXT:    s_mov_b64 s[0:1], 0
313; GFX12-NEXT:  .LBB2_1: ; %atomicrmw.start
314; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
315; GFX12-NEXT:    s_wait_loadcnt 0x0
316; GFX12-NEXT:    v_max_i32_e32 v4, v5, v1
317; GFX12-NEXT:    global_wb scope:SCOPE_SYS
318; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
319; GFX12-NEXT:    s_wait_loadcnt 0x0
320; GFX12-NEXT:    global_inv scope:SCOPE_SYS
321; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
322; GFX12-NEXT:    v_mov_b32_e32 v5, v0
323; GFX12-NEXT:    s_wait_alu 0xfffe
324; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
325; GFX12-NEXT:    s_wait_alu 0xfffe
326; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
327; GFX12-NEXT:    s_cbranch_execnz .LBB2_1
328; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
329; GFX12-NEXT:    s_endpgm
  ; Address = %sbase + zero-extended 32-bit %voffset (byte-granular GEP).
330  %zext.offset = zext i32 %voffset to i64
331  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; The atomicrmw result is intentionally left without any use.
332  %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
333  ret void
334}
335
; Signed 32-bit atomicrmw max (seq_cst) at %sbase + zext(%voffset) - 128 whose
; result is discarded; the function returns void and the checks end in s_endpgm.
; The checks show the constant -128 folded into the offset:-128 immediate of
; both the initial saddr-form load and the cmpswap inside the retry loop.
336define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
337; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
338; GFX9:       ; %bb.0:
339; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
340; GFX9-NEXT:    v_mov_b32_e32 v3, s3
341; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
342; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
343; GFX9-NEXT:    s_mov_b64 s[0:1], 0
344; GFX9-NEXT:  .LBB3_1: ; %atomicrmw.start
345; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
346; GFX9-NEXT:    s_waitcnt vmcnt(0)
347; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
348; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
349; GFX9-NEXT:    s_waitcnt vmcnt(0)
350; GFX9-NEXT:    buffer_wbinvl1
351; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
352; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
353; GFX9-NEXT:    v_mov_b32_e32 v5, v0
354; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
355; GFX9-NEXT:    s_cbranch_execnz .LBB3_1
356; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
357; GFX9-NEXT:    s_endpgm
358;
359; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
360; GFX10:       ; %bb.0:
361; GFX10-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
362; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
363; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
364; GFX10-NEXT:    s_mov_b64 s[0:1], 0
365; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
366; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
367; GFX10-NEXT:    s_waitcnt vmcnt(0)
368; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
369; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
370; GFX10-NEXT:    s_waitcnt vmcnt(0)
371; GFX10-NEXT:    buffer_gl1_inv
372; GFX10-NEXT:    buffer_gl0_inv
373; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
374; GFX10-NEXT:    v_mov_b32_e32 v5, v0
375; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
376; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
377; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
378; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
379; GFX10-NEXT:    s_endpgm
380;
381; GFX11-LABEL: global_max_saddr_i32_nortn_neg128:
382; GFX11:       ; %bb.0:
383; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
384; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
385; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
386; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
387; GFX11-NEXT:    s_mov_b64 s[0:1], 0
388; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
389; GFX11-NEXT:  .LBB3_1: ; %atomicrmw.start
390; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
391; GFX11-NEXT:    s_waitcnt vmcnt(0)
392; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
393; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
394; GFX11-NEXT:    s_waitcnt vmcnt(0)
395; GFX11-NEXT:    buffer_gl1_inv
396; GFX11-NEXT:    buffer_gl0_inv
397; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
398; GFX11-NEXT:    v_mov_b32_e32 v5, v0
399; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
400; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
401; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
402; GFX11-NEXT:    s_cbranch_execnz .LBB3_1
403; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
404; GFX11-NEXT:    s_endpgm
405;
406; GFX12-LABEL: global_max_saddr_i32_nortn_neg128:
407; GFX12:       ; %bb.0:
408; GFX12-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
409; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
410; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
411; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
412; GFX12-NEXT:    s_mov_b64 s[0:1], 0
413; GFX12-NEXT:  .LBB3_1: ; %atomicrmw.start
414; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
415; GFX12-NEXT:    s_wait_loadcnt 0x0
416; GFX12-NEXT:    v_max_i32_e32 v4, v5, v1
417; GFX12-NEXT:    global_wb scope:SCOPE_SYS
418; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
419; GFX12-NEXT:    s_wait_loadcnt 0x0
420; GFX12-NEXT:    global_inv scope:SCOPE_SYS
421; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
422; GFX12-NEXT:    v_mov_b32_e32 v5, v0
423; GFX12-NEXT:    s_wait_alu 0xfffe
424; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
425; GFX12-NEXT:    s_wait_alu 0xfffe
426; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
427; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
428; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
429; GFX12-NEXT:    s_endpgm
  ; Variable part of the address: %sbase + zero-extended 32-bit %voffset.
430  %zext.offset = zext i32 %voffset to i64
431  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; Constant part: a further -128 bytes, expected to land in the immediate field.
432  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; The atomicrmw result is intentionally left without any use.
433  %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
434  ret void
435}
436
; Signed 64-bit atomicrmw max (seq_cst) at %sbase + zext(%voffset), returning
; the previous memory value bitcast to <2 x float>.
; The checks show the new value formed with v_cmp_gt_i64 plus two v_cndmask_b32
; selects, then a 64-bit cmpswap (_x2 / _b64) retry loop. After the loop the
; result pair v[3:4] is copied into v0 and v1.
437define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
438; GFX9-LABEL: global_max_saddr_i64_rtn:
439; GFX9:       ; %bb.0:
440; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
441; GFX9-NEXT:    v_mov_b32_e32 v6, s3
442; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
443; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
444; GFX9-NEXT:    s_mov_b64 s[0:1], 0
445; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
446; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
447; GFX9-NEXT:    s_waitcnt vmcnt(0)
448; GFX9-NEXT:    v_mov_b32_e32 v10, v4
449; GFX9-NEXT:    v_mov_b32_e32 v9, v3
450; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
451; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
452; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
453; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
454; GFX9-NEXT:    s_waitcnt vmcnt(0)
455; GFX9-NEXT:    buffer_wbinvl1
456; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
457; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
458; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
459; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
460; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
461; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
462; GFX9-NEXT:    v_mov_b32_e32 v0, v3
463; GFX9-NEXT:    v_mov_b32_e32 v1, v4
464; GFX9-NEXT:    ; return to shader part epilog
465;
466; GFX10-LABEL: global_max_saddr_i64_rtn:
467; GFX10:       ; %bb.0:
468; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
469; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
470; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
471; GFX10-NEXT:    s_mov_b64 s[0:1], 0
472; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
473; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
474; GFX10-NEXT:    s_waitcnt vmcnt(0)
475; GFX10-NEXT:    v_mov_b32_e32 v10, v4
476; GFX10-NEXT:    v_mov_b32_e32 v9, v3
477; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
478; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
479; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
480; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
481; GFX10-NEXT:    s_waitcnt vmcnt(0)
482; GFX10-NEXT:    buffer_gl1_inv
483; GFX10-NEXT:    buffer_gl0_inv
484; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
485; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
486; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
487; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
488; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
489; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
490; GFX10-NEXT:    v_mov_b32_e32 v0, v3
491; GFX10-NEXT:    v_mov_b32_e32 v1, v4
492; GFX10-NEXT:    ; return to shader part epilog
493;
494; GFX11-LABEL: global_max_saddr_i64_rtn:
495; GFX11:       ; %bb.0:
496; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
497; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
498; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
499; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
500; GFX11-NEXT:    s_mov_b64 s[0:1], 0
501; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
502; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
503; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
504; GFX11-NEXT:    s_waitcnt vmcnt(0)
505; GFX11-NEXT:    v_mov_b32_e32 v10, v4
506; GFX11-NEXT:    v_mov_b32_e32 v9, v3
507; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
508; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
509; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
510; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
511; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
512; GFX11-NEXT:    s_waitcnt vmcnt(0)
513; GFX11-NEXT:    buffer_gl1_inv
514; GFX11-NEXT:    buffer_gl0_inv
515; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
516; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
517; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
518; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
519; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
520; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
521; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
522; GFX11-NEXT:    v_mov_b32_e32 v0, v3
523; GFX11-NEXT:    v_mov_b32_e32 v1, v4
524; GFX11-NEXT:    ; return to shader part epilog
525;
526; GFX12-LABEL: global_max_saddr_i64_rtn:
527; GFX12:       ; %bb.0:
528; GFX12-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
529; GFX12-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
530; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
531; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
532; GFX12-NEXT:    s_mov_b64 s[0:1], 0
533; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
534; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
535; GFX12-NEXT:    s_wait_loadcnt 0x0
536; GFX12-NEXT:    v_mov_b32_e32 v10, v4
537; GFX12-NEXT:    v_mov_b32_e32 v9, v3
538; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
539; GFX12-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
540; GFX12-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
541; GFX12-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
542; GFX12-NEXT:    global_wb scope:SCOPE_SYS
543; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
544; GFX12-NEXT:    s_wait_loadcnt 0x0
545; GFX12-NEXT:    global_inv scope:SCOPE_SYS
546; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
547; GFX12-NEXT:    s_wait_alu 0xfffe
548; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
549; GFX12-NEXT:    s_wait_alu 0xfffe
550; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
551; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
552; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
553; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
554; GFX12-NEXT:    v_mov_b32_e32 v0, v3
555; GFX12-NEXT:    v_mov_b32_e32 v1, v4
556; GFX12-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zero-extended 32-bit %voffset (byte-granular GEP).
557  %zext.offset = zext i32 %voffset to i64
558  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; No syncscope is given here; the gfx12 checks show scope:SCOPE_SYS.
559  %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
  ; The 64-bit old value is handed back as two floats.
560  %cast.rtn = bitcast i64 %rtn to <2 x float>
561  ret <2 x float> %cast.rtn
562}
563
; Signed 64-bit atomicrmw max (seq_cst) at %sbase + zext(%voffset) - 128,
; returning the previous memory value bitcast to <2 x float>.
; The checks show the constant -128 folded into the offset:-128 immediate of
; both the initial saddr-form 64-bit load and the 64-bit cmpswap in the loop.
564define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
565; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
566; GFX9:       ; %bb.0:
567; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
568; GFX9-NEXT:    v_mov_b32_e32 v6, s3
569; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
570; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
571; GFX9-NEXT:    s_mov_b64 s[0:1], 0
572; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
573; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
574; GFX9-NEXT:    s_waitcnt vmcnt(0)
575; GFX9-NEXT:    v_mov_b32_e32 v10, v4
576; GFX9-NEXT:    v_mov_b32_e32 v9, v3
577; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
578; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
579; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
580; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
581; GFX9-NEXT:    s_waitcnt vmcnt(0)
582; GFX9-NEXT:    buffer_wbinvl1
583; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
584; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
585; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
586; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
587; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
588; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
589; GFX9-NEXT:    v_mov_b32_e32 v0, v3
590; GFX9-NEXT:    v_mov_b32_e32 v1, v4
591; GFX9-NEXT:    ; return to shader part epilog
592;
593; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
594; GFX10:       ; %bb.0:
595; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
596; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
597; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
598; GFX10-NEXT:    s_mov_b64 s[0:1], 0
599; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
600; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
601; GFX10-NEXT:    s_waitcnt vmcnt(0)
602; GFX10-NEXT:    v_mov_b32_e32 v10, v4
603; GFX10-NEXT:    v_mov_b32_e32 v9, v3
604; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
605; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
606; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
607; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
608; GFX10-NEXT:    s_waitcnt vmcnt(0)
609; GFX10-NEXT:    buffer_gl1_inv
610; GFX10-NEXT:    buffer_gl0_inv
611; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
612; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
613; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
614; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
615; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
616; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
617; GFX10-NEXT:    v_mov_b32_e32 v0, v3
618; GFX10-NEXT:    v_mov_b32_e32 v1, v4
619; GFX10-NEXT:    ; return to shader part epilog
620;
621; GFX11-LABEL: global_max_saddr_i64_rtn_neg128:
622; GFX11:       ; %bb.0:
623; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
624; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
625; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
626; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
627; GFX11-NEXT:    s_mov_b64 s[0:1], 0
628; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
629; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
630; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
631; GFX11-NEXT:    s_waitcnt vmcnt(0)
632; GFX11-NEXT:    v_mov_b32_e32 v10, v4
633; GFX11-NEXT:    v_mov_b32_e32 v9, v3
634; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
635; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
636; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
637; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
638; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
639; GFX11-NEXT:    s_waitcnt vmcnt(0)
640; GFX11-NEXT:    buffer_gl1_inv
641; GFX11-NEXT:    buffer_gl0_inv
642; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
643; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
644; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
645; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
646; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
647; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
648; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
649; GFX11-NEXT:    v_mov_b32_e32 v0, v3
650; GFX11-NEXT:    v_mov_b32_e32 v1, v4
651; GFX11-NEXT:    ; return to shader part epilog
652;
653; GFX12-LABEL: global_max_saddr_i64_rtn_neg128:
654; GFX12:       ; %bb.0:
655; GFX12-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
656; GFX12-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
657; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
658; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
659; GFX12-NEXT:    s_mov_b64 s[0:1], 0
660; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
661; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
662; GFX12-NEXT:    s_wait_loadcnt 0x0
663; GFX12-NEXT:    v_mov_b32_e32 v10, v4
664; GFX12-NEXT:    v_mov_b32_e32 v9, v3
665; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
666; GFX12-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
667; GFX12-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
668; GFX12-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
669; GFX12-NEXT:    global_wb scope:SCOPE_SYS
670; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
671; GFX12-NEXT:    s_wait_loadcnt 0x0
672; GFX12-NEXT:    global_inv scope:SCOPE_SYS
673; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
674; GFX12-NEXT:    s_wait_alu 0xfffe
675; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
676; GFX12-NEXT:    s_wait_alu 0xfffe
677; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
678; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
679; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
680; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
681; GFX12-NEXT:    v_mov_b32_e32 v0, v3
682; GFX12-NEXT:    v_mov_b32_e32 v1, v4
683; GFX12-NEXT:    ; return to shader part epilog
  ; Variable part of the address: %sbase + zero-extended 32-bit %voffset.
684  %zext.offset = zext i32 %voffset to i64
685  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; Constant part: a further -128 bytes, expected to land in the immediate field.
686  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
687  %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
  ; The 64-bit old value is handed back as two floats.
688  %cast.rtn = bitcast i64 %rtn to <2 x float>
689  ret <2 x float> %cast.rtn
690}
691
; Signed 64-bit atomicrmw max whose result is discarded (the function returns
; void). The address is the inreg base pointer %sbase plus the zero-extended
; 32-bit %voffset. In the expected output of all four run lines the initial load
; addresses memory as VGPR offset v0 plus SGPR pair s[2:3], while the update
; itself is a loop around a 64-bit global_atomic_cmpswap that takes a full VGPR
; address pair with "off"; the candidate value is selected with v_cmp_gt_i64
; and two v_cndmask_b32, and the program ends with s_endpgm.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
692define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
693; GFX9-LABEL: global_max_saddr_i64_nortn:
694; GFX9:       ; %bb.0:
695; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
696; GFX9-NEXT:    v_mov_b32_e32 v3, s3
697; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
698; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
699; GFX9-NEXT:    s_mov_b64 s[0:1], 0
700; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
701; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
702; GFX9-NEXT:    s_waitcnt vmcnt(0)
703; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
704; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
705; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
706; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
707; GFX9-NEXT:    s_waitcnt vmcnt(0)
708; GFX9-NEXT:    buffer_wbinvl1
709; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
710; GFX9-NEXT:    v_mov_b32_e32 v6, v4
711; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
712; GFX9-NEXT:    v_mov_b32_e32 v5, v3
713; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
714; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
715; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
716; GFX9-NEXT:    s_endpgm
717;
718; GFX10-LABEL: global_max_saddr_i64_nortn:
719; GFX10:       ; %bb.0:
720; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
721; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
722; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
723; GFX10-NEXT:    s_mov_b64 s[0:1], 0
724; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
725; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
726; GFX10-NEXT:    s_waitcnt vmcnt(0)
727; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
728; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
729; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
730; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
731; GFX10-NEXT:    s_waitcnt vmcnt(0)
732; GFX10-NEXT:    buffer_gl1_inv
733; GFX10-NEXT:    buffer_gl0_inv
734; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
735; GFX10-NEXT:    v_mov_b32_e32 v6, v4
736; GFX10-NEXT:    v_mov_b32_e32 v5, v3
737; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
738; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
739; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
740; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
741; GFX10-NEXT:    s_endpgm
742;
743; GFX11-LABEL: global_max_saddr_i64_nortn:
744; GFX11:       ; %bb.0:
745; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
746; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
747; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
748; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
749; GFX11-NEXT:    s_mov_b64 s[0:1], 0
750; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
751; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
752; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
753; GFX11-NEXT:    s_waitcnt vmcnt(0)
754; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
755; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
756; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
757; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
758; GFX11-NEXT:    s_waitcnt vmcnt(0)
759; GFX11-NEXT:    buffer_gl1_inv
760; GFX11-NEXT:    buffer_gl0_inv
761; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
762; GFX11-NEXT:    v_mov_b32_e32 v6, v4
763; GFX11-NEXT:    v_mov_b32_e32 v5, v3
764; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
765; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
766; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
767; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
768; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
769; GFX11-NEXT:    s_endpgm
770;
771; GFX12-LABEL: global_max_saddr_i64_nortn:
772; GFX12:       ; %bb.0:
773; GFX12-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
774; GFX12-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
775; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
776; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
777; GFX12-NEXT:    s_mov_b64 s[0:1], 0
778; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
779; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
780; GFX12-NEXT:    s_wait_loadcnt 0x0
781; GFX12-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
782; GFX12-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
783; GFX12-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
784; GFX12-NEXT:    global_wb scope:SCOPE_SYS
785; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
786; GFX12-NEXT:    s_wait_loadcnt 0x0
787; GFX12-NEXT:    global_inv scope:SCOPE_SYS
788; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
789; GFX12-NEXT:    v_mov_b32_e32 v6, v4
790; GFX12-NEXT:    v_mov_b32_e32 v5, v3
791; GFX12-NEXT:    s_wait_alu 0xfffe
792; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
793; GFX12-NEXT:    s_wait_alu 0xfffe
794; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
795; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
796; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
797; GFX12-NEXT:    s_endpgm
; IR under test: address = %sbase + zext(%voffset) in bytes; the seq_cst
; atomicrmw carries no syncscope and its result (%unused) is never read.
798  %zext.offset = zext i32 %voffset to i64
799  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
800  %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
801  ret void
802}
803
; Signed 64-bit atomicrmw max with a discarded result, where the address is
; %sbase + zext(%voffset) followed by an extra constant byte offset of -128.
; In the expected output of all four run lines that constant shows up as
; "offset:-128" on both the initial load (VGPR offset v0 plus SGPR pair s[2:3])
; and on the 64-bit global_atomic_cmpswap inside the retry loop (full VGPR
; address pair with "off"). The candidate value is selected with v_cmp_gt_i64
; and two v_cndmask_b32, and the program ends with s_endpgm.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
804define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
805; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
806; GFX9:       ; %bb.0:
807; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
808; GFX9-NEXT:    v_mov_b32_e32 v3, s3
809; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
810; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
811; GFX9-NEXT:    s_mov_b64 s[0:1], 0
812; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
813; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
814; GFX9-NEXT:    s_waitcnt vmcnt(0)
815; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
816; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
817; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
818; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
819; GFX9-NEXT:    s_waitcnt vmcnt(0)
820; GFX9-NEXT:    buffer_wbinvl1
821; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
822; GFX9-NEXT:    v_mov_b32_e32 v6, v4
823; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
824; GFX9-NEXT:    v_mov_b32_e32 v5, v3
825; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
826; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
827; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
828; GFX9-NEXT:    s_endpgm
829;
830; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
831; GFX10:       ; %bb.0:
832; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
833; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
834; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
835; GFX10-NEXT:    s_mov_b64 s[0:1], 0
836; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
837; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
838; GFX10-NEXT:    s_waitcnt vmcnt(0)
839; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
840; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
841; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
842; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
843; GFX10-NEXT:    s_waitcnt vmcnt(0)
844; GFX10-NEXT:    buffer_gl1_inv
845; GFX10-NEXT:    buffer_gl0_inv
846; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
847; GFX10-NEXT:    v_mov_b32_e32 v6, v4
848; GFX10-NEXT:    v_mov_b32_e32 v5, v3
849; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
850; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
851; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
852; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
853; GFX10-NEXT:    s_endpgm
854;
855; GFX11-LABEL: global_max_saddr_i64_nortn_neg128:
856; GFX11:       ; %bb.0:
857; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
858; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
859; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
860; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
861; GFX11-NEXT:    s_mov_b64 s[0:1], 0
862; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
863; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
864; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
865; GFX11-NEXT:    s_waitcnt vmcnt(0)
866; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
867; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
868; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
869; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
870; GFX11-NEXT:    s_waitcnt vmcnt(0)
871; GFX11-NEXT:    buffer_gl1_inv
872; GFX11-NEXT:    buffer_gl0_inv
873; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
874; GFX11-NEXT:    v_mov_b32_e32 v6, v4
875; GFX11-NEXT:    v_mov_b32_e32 v5, v3
876; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
877; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
878; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
879; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
880; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
881; GFX11-NEXT:    s_endpgm
882;
883; GFX12-LABEL: global_max_saddr_i64_nortn_neg128:
884; GFX12:       ; %bb.0:
885; GFX12-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
886; GFX12-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
887; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
888; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
889; GFX12-NEXT:    s_mov_b64 s[0:1], 0
890; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
891; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
892; GFX12-NEXT:    s_wait_loadcnt 0x0
893; GFX12-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
894; GFX12-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
895; GFX12-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
896; GFX12-NEXT:    global_wb scope:SCOPE_SYS
897; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
898; GFX12-NEXT:    s_wait_loadcnt 0x0
899; GFX12-NEXT:    global_inv scope:SCOPE_SYS
900; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
901; GFX12-NEXT:    v_mov_b32_e32 v6, v4
902; GFX12-NEXT:    v_mov_b32_e32 v5, v3
903; GFX12-NEXT:    s_wait_alu 0xfffe
904; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
905; GFX12-NEXT:    s_wait_alu 0xfffe
906; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
907; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
908; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
909; GFX12-NEXT:    s_endpgm
; IR under test: %gep0 = %sbase + zext(%voffset), %gep1 = %gep0 - 128 bytes;
; the seq_cst atomicrmw carries no syncscope and its result is never read.
910  %zext.offset = zext i32 %voffset to i64
911  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
912  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
913  %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
914  ret void
915}
916
917; --------------------------------------------------------------------------------
918; atomicrmw min
919; --------------------------------------------------------------------------------
920
; Signed 32-bit atomicrmw min whose previous memory value is returned to the
; caller, bitcast to float. The address is the inreg base pointer %sbase plus
; the zero-extended 32-bit %voffset. In the expected output of all four run
; lines the initial load addresses memory as VGPR offset v0 plus SGPR pair
; s[2:3], while the update is a loop around a 32-bit global_atomic_cmpswap that
; takes a full VGPR address pair with "off". The candidate value comes from
; v_min_i32, the cmpswap writes its result to v0, and the function ends with
; "return to shader part epilog" instead of s_endpgm.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
921define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
922; GFX9-LABEL: global_min_saddr_i32_rtn:
923; GFX9:       ; %bb.0:
924; GFX9-NEXT:    v_mov_b32_e32 v2, v0
925; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
926; GFX9-NEXT:    v_mov_b32_e32 v3, s3
927; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
928; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
929; GFX9-NEXT:    s_mov_b64 s[0:1], 0
930; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
931; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
932; GFX9-NEXT:    s_waitcnt vmcnt(0)
933; GFX9-NEXT:    v_mov_b32_e32 v5, v0
934; GFX9-NEXT:    v_min_i32_e32 v4, v5, v1
935; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
936; GFX9-NEXT:    s_waitcnt vmcnt(0)
937; GFX9-NEXT:    buffer_wbinvl1
938; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
939; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
940; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
941; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
942; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
943; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
944; GFX9-NEXT:    ; return to shader part epilog
945;
946; GFX10-LABEL: global_min_saddr_i32_rtn:
947; GFX10:       ; %bb.0:
948; GFX10-NEXT:    v_mov_b32_e32 v2, v0
949; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
950; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
951; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
952; GFX10-NEXT:    s_mov_b64 s[0:1], 0
953; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
954; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
955; GFX10-NEXT:    s_waitcnt vmcnt(0)
956; GFX10-NEXT:    v_mov_b32_e32 v5, v0
957; GFX10-NEXT:    v_min_i32_e32 v4, v5, v1
958; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
959; GFX10-NEXT:    s_waitcnt vmcnt(0)
960; GFX10-NEXT:    buffer_gl1_inv
961; GFX10-NEXT:    buffer_gl0_inv
962; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
963; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
964; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
965; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
966; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
967; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
968; GFX10-NEXT:    ; return to shader part epilog
969;
970; GFX11-LABEL: global_min_saddr_i32_rtn:
971; GFX11:       ; %bb.0:
972; GFX11-NEXT:    v_mov_b32_e32 v2, v0
973; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
974; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
975; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
976; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
977; GFX11-NEXT:    s_mov_b64 s[0:1], 0
978; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
979; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
980; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
981; GFX11-NEXT:    s_waitcnt vmcnt(0)
982; GFX11-NEXT:    v_mov_b32_e32 v5, v0
983; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
984; GFX11-NEXT:    v_min_i32_e32 v4, v5, v1
985; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
986; GFX11-NEXT:    s_waitcnt vmcnt(0)
987; GFX11-NEXT:    buffer_gl1_inv
988; GFX11-NEXT:    buffer_gl0_inv
989; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
990; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
991; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
992; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
993; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
994; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
995; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
996; GFX11-NEXT:    ; return to shader part epilog
997;
998; GFX12-LABEL: global_min_saddr_i32_rtn:
999; GFX12:       ; %bb.0:
1000; GFX12-NEXT:    v_mov_b32_e32 v2, v0
1001; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
1002; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
1003; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1004; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1005; GFX12-NEXT:    s_mov_b64 s[0:1], 0
1006; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
1007; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1008; GFX12-NEXT:    s_wait_loadcnt 0x0
1009; GFX12-NEXT:    v_mov_b32_e32 v5, v0
1010; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1011; GFX12-NEXT:    v_min_i32_e32 v4, v5, v1
1012; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1013; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1014; GFX12-NEXT:    s_wait_loadcnt 0x0
1015; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1016; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1017; GFX12-NEXT:    s_wait_alu 0xfffe
1018; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1019; GFX12-NEXT:    s_wait_alu 0xfffe
1020; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1021; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
1022; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1023; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
1024; GFX12-NEXT:    ; return to shader part epilog
; IR under test: address = %sbase + zext(%voffset) in bytes; the value the
; seq_cst atomicrmw returns is bitcast from i32 to float and returned.
1025  %zext.offset = zext i32 %voffset to i64
1026  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1027  %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
1028  %cast.rtn = bitcast i32 %rtn to float
1029  ret float %cast.rtn
1030}
1031
; Signed 32-bit atomicrmw min returning the previous memory value (bitcast to
; float), where the address is %sbase + zext(%voffset) followed by an extra
; constant byte offset of -128. In the expected output of all four run lines
; that constant shows up as "offset:-128" on both the initial load (VGPR offset
; v0 plus SGPR pair s[2:3]) and on the 32-bit global_atomic_cmpswap inside the
; retry loop (full VGPR address pair with "off"). The candidate value comes
; from v_min_i32 and the cmpswap writes its result to v0.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
1032define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1033; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
1034; GFX9:       ; %bb.0:
1035; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1036; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
1037; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1038; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
1039; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1040; GFX9-NEXT:    s_mov_b64 s[0:1], 0
1041; GFX9-NEXT:  .LBB9_1: ; %atomicrmw.start
1042; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1043; GFX9-NEXT:    s_waitcnt vmcnt(0)
1044; GFX9-NEXT:    v_mov_b32_e32 v5, v0
1045; GFX9-NEXT:    v_min_i32_e32 v4, v5, v1
1046; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1047; GFX9-NEXT:    s_waitcnt vmcnt(0)
1048; GFX9-NEXT:    buffer_wbinvl1
1049; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1050; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1051; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1052; GFX9-NEXT:    s_cbranch_execnz .LBB9_1
1053; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
1054; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1055; GFX9-NEXT:    ; return to shader part epilog
1056;
1057; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
1058; GFX10:       ; %bb.0:
1059; GFX10-NEXT:    v_mov_b32_e32 v2, v0
1060; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
1061; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
1062; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
1063; GFX10-NEXT:    s_mov_b64 s[0:1], 0
1064; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
1065; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1066; GFX10-NEXT:    s_waitcnt vmcnt(0)
1067; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1068; GFX10-NEXT:    v_min_i32_e32 v4, v5, v1
1069; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1070; GFX10-NEXT:    s_waitcnt vmcnt(0)
1071; GFX10-NEXT:    buffer_gl1_inv
1072; GFX10-NEXT:    buffer_gl0_inv
1073; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1074; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1075; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1076; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
1077; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1078; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
1079; GFX10-NEXT:    ; return to shader part epilog
1080;
1081; GFX11-LABEL: global_min_saddr_i32_rtn_neg128:
1082; GFX11:       ; %bb.0:
1083; GFX11-NEXT:    v_mov_b32_e32 v2, v0
1084; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
1085; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
1086; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1087; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1088; GFX11-NEXT:    s_mov_b64 s[0:1], 0
1089; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1090; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
1091; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1092; GFX11-NEXT:    s_waitcnt vmcnt(0)
1093; GFX11-NEXT:    v_mov_b32_e32 v5, v0
1094; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1095; GFX11-NEXT:    v_min_i32_e32 v4, v5, v1
1096; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
1097; GFX11-NEXT:    s_waitcnt vmcnt(0)
1098; GFX11-NEXT:    buffer_gl1_inv
1099; GFX11-NEXT:    buffer_gl0_inv
1100; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1101; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1102; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1103; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1104; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
1105; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1106; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
1107; GFX11-NEXT:    ; return to shader part epilog
1108;
1109; GFX12-LABEL: global_min_saddr_i32_rtn_neg128:
1110; GFX12:       ; %bb.0:
1111; GFX12-NEXT:    v_mov_b32_e32 v2, v0
1112; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
1113; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
1114; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1115; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1116; GFX12-NEXT:    s_mov_b64 s[0:1], 0
1117; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
1118; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1119; GFX12-NEXT:    s_wait_loadcnt 0x0
1120; GFX12-NEXT:    v_mov_b32_e32 v5, v0
1121; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1122; GFX12-NEXT:    v_min_i32_e32 v4, v5, v1
1123; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1124; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1125; GFX12-NEXT:    s_wait_loadcnt 0x0
1126; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1127; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1128; GFX12-NEXT:    s_wait_alu 0xfffe
1129; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1130; GFX12-NEXT:    s_wait_alu 0xfffe
1131; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1132; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
1133; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1134; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
1135; GFX12-NEXT:    ; return to shader part epilog
; IR under test: %gep0 = %sbase + zext(%voffset), %gep1 = %gep0 - 128 bytes;
; the value the seq_cst atomicrmw returns is bitcast to float and returned.
1136  %zext.offset = zext i32 %voffset to i64
1137  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1138  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1139  %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
1140  %cast.rtn = bitcast i32 %rtn to float
1141  ret float %cast.rtn
1142}
1143
; Signed 32-bit atomicrmw min whose result is discarded (the function returns
; void). The address is the inreg base pointer %sbase plus the zero-extended
; 32-bit %voffset. In the expected output of all four run lines the initial load
; addresses memory as VGPR offset v0 plus SGPR pair s[2:3], while the update is
; a loop around a 32-bit global_atomic_cmpswap that takes a full VGPR address
; pair with "off"; the candidate value comes from v_min_i32 and the program
; ends with s_endpgm.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
1144define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1145; GFX9-LABEL: global_min_saddr_i32_nortn:
1146; GFX9:       ; %bb.0:
1147; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
1148; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1149; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
1150; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1151; GFX9-NEXT:    s_mov_b64 s[0:1], 0
1152; GFX9-NEXT:  .LBB10_1: ; %atomicrmw.start
1153; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1154; GFX9-NEXT:    s_waitcnt vmcnt(0)
1155; GFX9-NEXT:    v_min_i32_e32 v4, v5, v1
1156; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
1157; GFX9-NEXT:    s_waitcnt vmcnt(0)
1158; GFX9-NEXT:    buffer_wbinvl1
1159; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1160; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1161; GFX9-NEXT:    v_mov_b32_e32 v5, v0
1162; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1163; GFX9-NEXT:    s_cbranch_execnz .LBB10_1
1164; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
1165; GFX9-NEXT:    s_endpgm
1166;
1167; GFX10-LABEL: global_min_saddr_i32_nortn:
1168; GFX10:       ; %bb.0:
1169; GFX10-NEXT:    global_load_dword v5, v0, s[2:3]
1170; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
1171; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
1172; GFX10-NEXT:    s_mov_b64 s[0:1], 0
1173; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
1174; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1175; GFX10-NEXT:    s_waitcnt vmcnt(0)
1176; GFX10-NEXT:    v_min_i32_e32 v4, v5, v1
1177; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
1178; GFX10-NEXT:    s_waitcnt vmcnt(0)
1179; GFX10-NEXT:    buffer_gl1_inv
1180; GFX10-NEXT:    buffer_gl0_inv
1181; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1182; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1183; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1184; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1185; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
1186; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1187; GFX10-NEXT:    s_endpgm
1188;
1189; GFX11-LABEL: global_min_saddr_i32_nortn:
1190; GFX11:       ; %bb.0:
1191; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
1192; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
1193; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1194; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1195; GFX11-NEXT:    s_mov_b64 s[0:1], 0
1196; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1197; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
1198; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1199; GFX11-NEXT:    s_waitcnt vmcnt(0)
1200; GFX11-NEXT:    v_min_i32_e32 v4, v5, v1
1201; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
1202; GFX11-NEXT:    s_waitcnt vmcnt(0)
1203; GFX11-NEXT:    buffer_gl1_inv
1204; GFX11-NEXT:    buffer_gl0_inv
1205; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1206; GFX11-NEXT:    v_mov_b32_e32 v5, v0
1207; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1208; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1209; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1210; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
1211; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1212; GFX11-NEXT:    s_endpgm
1213;
1214; GFX12-LABEL: global_min_saddr_i32_nortn:
1215; GFX12:       ; %bb.0:
1216; GFX12-NEXT:    global_load_b32 v5, v0, s[2:3]
1217; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
1218; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1219; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1220; GFX12-NEXT:    s_mov_b64 s[0:1], 0
1221; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
1222; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1223; GFX12-NEXT:    s_wait_loadcnt 0x0
1224; GFX12-NEXT:    v_min_i32_e32 v4, v5, v1
1225; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1226; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1227; GFX12-NEXT:    s_wait_loadcnt 0x0
1228; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1229; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1230; GFX12-NEXT:    v_mov_b32_e32 v5, v0
1231; GFX12-NEXT:    s_wait_alu 0xfffe
1232; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1233; GFX12-NEXT:    s_wait_alu 0xfffe
1234; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1235; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
1236; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1237; GFX12-NEXT:    s_endpgm
; IR under test: address = %sbase + zext(%voffset) in bytes; the seq_cst
; atomicrmw carries no syncscope and its result (%unused) is never read.
1238  %zext.offset = zext i32 %voffset to i64
1239  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1240  %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
1241  ret void
1242}
1243
; Signed 32-bit atomicrmw min with a discarded result, where the address is
; %sbase + zext(%voffset) followed by an extra constant byte offset of -128.
; In the expected output of all four run lines that constant shows up as
; "offset:-128" on both the initial load (VGPR offset v0 plus SGPR pair s[2:3])
; and on the 32-bit global_atomic_cmpswap inside the retry loop (full VGPR
; address pair with "off"). The candidate value comes from v_min_i32 and the
; program ends with s_endpgm.
; The assertions below are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
1244define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1245; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
1246; GFX9:       ; %bb.0:
1247; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
1248; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1249; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
1250; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1251; GFX9-NEXT:    s_mov_b64 s[0:1], 0
1252; GFX9-NEXT:  .LBB11_1: ; %atomicrmw.start
1253; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1254; GFX9-NEXT:    s_waitcnt vmcnt(0)
1255; GFX9-NEXT:    v_min_i32_e32 v4, v5, v1
1256; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1257; GFX9-NEXT:    s_waitcnt vmcnt(0)
1258; GFX9-NEXT:    buffer_wbinvl1
1259; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1260; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1261; GFX9-NEXT:    v_mov_b32_e32 v5, v0
1262; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1263; GFX9-NEXT:    s_cbranch_execnz .LBB11_1
1264; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
1265; GFX9-NEXT:    s_endpgm
1266;
1267; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
1268; GFX10:       ; %bb.0:
1269; GFX10-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
1270; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
1271; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
1272; GFX10-NEXT:    s_mov_b64 s[0:1], 0
1273; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
1274; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1275; GFX10-NEXT:    s_waitcnt vmcnt(0)
1276; GFX10-NEXT:    v_min_i32_e32 v4, v5, v1
1277; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1278; GFX10-NEXT:    s_waitcnt vmcnt(0)
1279; GFX10-NEXT:    buffer_gl1_inv
1280; GFX10-NEXT:    buffer_gl0_inv
1281; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1282; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1283; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1284; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1285; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
1286; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1287; GFX10-NEXT:    s_endpgm
1288;
1289; GFX11-LABEL: global_min_saddr_i32_nortn_neg128:
1290; GFX11:       ; %bb.0:
1291; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
1292; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
1293; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1294; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1295; GFX11-NEXT:    s_mov_b64 s[0:1], 0
1296; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1297; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
1298; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1299; GFX11-NEXT:    s_waitcnt vmcnt(0)
1300; GFX11-NEXT:    v_min_i32_e32 v4, v5, v1
1301; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
1302; GFX11-NEXT:    s_waitcnt vmcnt(0)
1303; GFX11-NEXT:    buffer_gl1_inv
1304; GFX11-NEXT:    buffer_gl0_inv
1305; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1306; GFX11-NEXT:    v_mov_b32_e32 v5, v0
1307; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1308; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1309; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1310; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
1311; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1312; GFX11-NEXT:    s_endpgm
1313;
1314; GFX12-LABEL: global_min_saddr_i32_nortn_neg128:
1315; GFX12:       ; %bb.0:
1316; GFX12-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
1317; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
1318; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1319; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1320; GFX12-NEXT:    s_mov_b64 s[0:1], 0
1321; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
1322; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1323; GFX12-NEXT:    s_wait_loadcnt 0x0
1324; GFX12-NEXT:    v_min_i32_e32 v4, v5, v1
1325; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1326; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1327; GFX12-NEXT:    s_wait_loadcnt 0x0
1328; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1329; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1330; GFX12-NEXT:    v_mov_b32_e32 v5, v0
1331; GFX12-NEXT:    s_wait_alu 0xfffe
1332; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1333; GFX12-NEXT:    s_wait_alu 0xfffe
1334; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1335; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
1336; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1337; GFX12-NEXT:    s_endpgm
; IR under test: %gep0 = %sbase + zext(%voffset), %gep1 = %gep0 - 128 bytes;
; the seq_cst atomicrmw carries no syncscope and its result is never read.
1338  %zext.offset = zext i32 %voffset to i64
1339  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1340  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1341  %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
1342  ret void
1343}
1344
; Signed 64-bit atomicrmw min (seq_cst, no syncscope) on the global address
; %sbase + zext(%voffset); the previous memory value is returned as <2 x float>.
; On every checked target the expected code is a compare-and-swap retry loop
; (%atomicrmw.start) instead of a single atomic min instruction. Only the
; initial load uses the SGPR-base (saddr) form; the cmpswap itself takes a
; 64-bit VGPR address with "off".
1345define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1346; GFX9-LABEL: global_min_saddr_i64_rtn:
1347; GFX9:       ; %bb.0:
1348; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
1349; GFX9-NEXT:    v_mov_b32_e32 v6, s3
1350; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
1351; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
1352; GFX9-NEXT:    s_mov_b64 s[0:1], 0
1353; GFX9-NEXT:  .LBB12_1: ; %atomicrmw.start
1354; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1355; GFX9-NEXT:    s_waitcnt vmcnt(0)
1356; GFX9-NEXT:    v_mov_b32_e32 v10, v4
1357; GFX9-NEXT:    v_mov_b32_e32 v9, v3
1358; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1359; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
1360; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
1361; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
1362; GFX9-NEXT:    s_waitcnt vmcnt(0)
1363; GFX9-NEXT:    buffer_wbinvl1
1364; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1365; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1366; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1367; GFX9-NEXT:    s_cbranch_execnz .LBB12_1
1368; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
1369; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1370; GFX9-NEXT:    v_mov_b32_e32 v0, v3
1371; GFX9-NEXT:    v_mov_b32_e32 v1, v4
1372; GFX9-NEXT:    ; return to shader part epilog
1373;
1374; GFX10-LABEL: global_min_saddr_i64_rtn:
1375; GFX10:       ; %bb.0:
1376; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
1377; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
1378; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
1379; GFX10-NEXT:    s_mov_b64 s[0:1], 0
1380; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
1381; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1382; GFX10-NEXT:    s_waitcnt vmcnt(0)
1383; GFX10-NEXT:    v_mov_b32_e32 v10, v4
1384; GFX10-NEXT:    v_mov_b32_e32 v9, v3
1385; GFX10-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1386; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
1387; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
1388; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
1389; GFX10-NEXT:    s_waitcnt vmcnt(0)
1390; GFX10-NEXT:    buffer_gl1_inv
1391; GFX10-NEXT:    buffer_gl0_inv
1392; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1393; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1394; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1395; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
1396; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1397; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
1398; GFX10-NEXT:    v_mov_b32_e32 v0, v3
1399; GFX10-NEXT:    v_mov_b32_e32 v1, v4
1400; GFX10-NEXT:    ; return to shader part epilog
1401;
1402; GFX11-LABEL: global_min_saddr_i64_rtn:
1403; GFX11:       ; %bb.0:
1404; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
1405; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
1406; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1407; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
1408; GFX11-NEXT:    s_mov_b64 s[0:1], 0
1409; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1410; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
1411; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1412; GFX11-NEXT:    s_waitcnt vmcnt(0)
1413; GFX11-NEXT:    v_mov_b32_e32 v10, v4
1414; GFX11-NEXT:    v_mov_b32_e32 v9, v3
1415; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1416; GFX11-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1417; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
1418; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
1419; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
1420; GFX11-NEXT:    s_waitcnt vmcnt(0)
1421; GFX11-NEXT:    buffer_gl1_inv
1422; GFX11-NEXT:    buffer_gl0_inv
1423; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1424; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1425; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1426; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1427; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
1428; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1429; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
1430; GFX11-NEXT:    v_mov_b32_e32 v0, v3
1431; GFX11-NEXT:    v_mov_b32_e32 v1, v4
1432; GFX11-NEXT:    ; return to shader part epilog
1433;
1434; GFX12-LABEL: global_min_saddr_i64_rtn:
1435; GFX12:       ; %bb.0:
1436; GFX12-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
1437; GFX12-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
1438; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1439; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
1440; GFX12-NEXT:    s_mov_b64 s[0:1], 0
1441; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
1442; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1443; GFX12-NEXT:    s_wait_loadcnt 0x0
1444; GFX12-NEXT:    v_mov_b32_e32 v10, v4
1445; GFX12-NEXT:    v_mov_b32_e32 v9, v3
1446; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1447; GFX12-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1448; GFX12-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
1449; GFX12-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
1450; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1451; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1452; GFX12-NEXT:    s_wait_loadcnt 0x0
1453; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1454; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1455; GFX12-NEXT:    s_wait_alu 0xfffe
1456; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1457; GFX12-NEXT:    s_wait_alu 0xfffe
1458; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1459; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
1460; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1461; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
1462; GFX12-NEXT:    v_mov_b32_e32 v0, v3
1463; GFX12-NEXT:    v_mov_b32_e32 v1, v4
1464; GFX12-NEXT:    ; return to shader part epilog
1465  %zext.offset = zext i32 %voffset to i64
1466  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1467  %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst
1468  %cast.rtn = bitcast i64 %rtn to <2 x float>
1469  ret <2 x float> %cast.rtn
1470}
1471
; Signed 64-bit atomicrmw min (seq_cst, no syncscope) on the global address
; %sbase + zext(%voffset) - 128; the previous memory value is returned as
; <2 x float>. The expected code is the same compare-and-swap retry loop as the
; zero-offset variant, with the constant -128 byte offset folded into the
; immediate offset field of both the initial load and the cmpswap.
1472define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1473; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
1474; GFX9:       ; %bb.0:
1475; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
1476; GFX9-NEXT:    v_mov_b32_e32 v6, s3
1477; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
1478; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
1479; GFX9-NEXT:    s_mov_b64 s[0:1], 0
1480; GFX9-NEXT:  .LBB13_1: ; %atomicrmw.start
1481; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1482; GFX9-NEXT:    s_waitcnt vmcnt(0)
1483; GFX9-NEXT:    v_mov_b32_e32 v10, v4
1484; GFX9-NEXT:    v_mov_b32_e32 v9, v3
1485; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1486; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
1487; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
1488; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
1489; GFX9-NEXT:    s_waitcnt vmcnt(0)
1490; GFX9-NEXT:    buffer_wbinvl1
1491; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1492; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1493; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1494; GFX9-NEXT:    s_cbranch_execnz .LBB13_1
1495; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
1496; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1497; GFX9-NEXT:    v_mov_b32_e32 v0, v3
1498; GFX9-NEXT:    v_mov_b32_e32 v1, v4
1499; GFX9-NEXT:    ; return to shader part epilog
1500;
1501; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
1502; GFX10:       ; %bb.0:
1503; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
1504; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
1505; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
1506; GFX10-NEXT:    s_mov_b64 s[0:1], 0
1507; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
1508; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1509; GFX10-NEXT:    s_waitcnt vmcnt(0)
1510; GFX10-NEXT:    v_mov_b32_e32 v10, v4
1511; GFX10-NEXT:    v_mov_b32_e32 v9, v3
1512; GFX10-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1513; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
1514; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
1515; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
1516; GFX10-NEXT:    s_waitcnt vmcnt(0)
1517; GFX10-NEXT:    buffer_gl1_inv
1518; GFX10-NEXT:    buffer_gl0_inv
1519; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1520; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1521; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1522; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
1523; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1524; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
1525; GFX10-NEXT:    v_mov_b32_e32 v0, v3
1526; GFX10-NEXT:    v_mov_b32_e32 v1, v4
1527; GFX10-NEXT:    ; return to shader part epilog
1528;
1529; GFX11-LABEL: global_min_saddr_i64_rtn_neg128:
1530; GFX11:       ; %bb.0:
1531; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
1532; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
1533; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1534; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
1535; GFX11-NEXT:    s_mov_b64 s[0:1], 0
1536; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1537; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
1538; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1539; GFX11-NEXT:    s_waitcnt vmcnt(0)
1540; GFX11-NEXT:    v_mov_b32_e32 v10, v4
1541; GFX11-NEXT:    v_mov_b32_e32 v9, v3
1542; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1543; GFX11-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1544; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
1545; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
1546; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
1547; GFX11-NEXT:    s_waitcnt vmcnt(0)
1548; GFX11-NEXT:    buffer_gl1_inv
1549; GFX11-NEXT:    buffer_gl0_inv
1550; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1551; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1552; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1553; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1554; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
1555; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1556; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
1557; GFX11-NEXT:    v_mov_b32_e32 v0, v3
1558; GFX11-NEXT:    v_mov_b32_e32 v1, v4
1559; GFX11-NEXT:    ; return to shader part epilog
1560;
1561; GFX12-LABEL: global_min_saddr_i64_rtn_neg128:
1562; GFX12:       ; %bb.0:
1563; GFX12-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
1564; GFX12-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
1565; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1566; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
1567; GFX12-NEXT:    s_mov_b64 s[0:1], 0
1568; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
1569; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1570; GFX12-NEXT:    s_wait_loadcnt 0x0
1571; GFX12-NEXT:    v_mov_b32_e32 v10, v4
1572; GFX12-NEXT:    v_mov_b32_e32 v9, v3
1573; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1574; GFX12-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1575; GFX12-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
1576; GFX12-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
1577; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1578; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1579; GFX12-NEXT:    s_wait_loadcnt 0x0
1580; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1581; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1582; GFX12-NEXT:    s_wait_alu 0xfffe
1583; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1584; GFX12-NEXT:    s_wait_alu 0xfffe
1585; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1586; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
1587; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1588; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
1589; GFX12-NEXT:    v_mov_b32_e32 v0, v3
1590; GFX12-NEXT:    v_mov_b32_e32 v1, v4
1591; GFX12-NEXT:    ; return to shader part epilog
1592  %zext.offset = zext i32 %voffset to i64
1593  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1594  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1595  %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst
1596  %cast.rtn = bitcast i64 %rtn to <2 x float>
1597  ret <2 x float> %cast.rtn
1598}
1599
; Signed 64-bit atomicrmw min (seq_cst, no syncscope) on the global address
; %sbase + zext(%voffset) whose result is unused; the function returns void.
; The expected code is still a compare-and-swap retry loop (%atomicrmw.start):
; the cmpswap writes the value it observed to v[3:4], which is compared with
; the expected value in v[5:6] to decide whether to retry and is then copied
; into v[5:6] for the next iteration. The shader ends with s_endpgm.
1600define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1601; GFX9-LABEL: global_min_saddr_i64_nortn:
1602; GFX9:       ; %bb.0:
1603; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
1604; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1605; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
1606; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
1607; GFX9-NEXT:    s_mov_b64 s[0:1], 0
1608; GFX9-NEXT:  .LBB14_1: ; %atomicrmw.start
1609; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1610; GFX9-NEXT:    s_waitcnt vmcnt(0)
1611; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1612; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
1613; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
1614; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
1615; GFX9-NEXT:    s_waitcnt vmcnt(0)
1616; GFX9-NEXT:    buffer_wbinvl1
1617; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1618; GFX9-NEXT:    v_mov_b32_e32 v6, v4
1619; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1620; GFX9-NEXT:    v_mov_b32_e32 v5, v3
1621; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1622; GFX9-NEXT:    s_cbranch_execnz .LBB14_1
1623; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
1624; GFX9-NEXT:    s_endpgm
1625;
1626; GFX10-LABEL: global_min_saddr_i64_nortn:
1627; GFX10:       ; %bb.0:
1628; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
1629; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
1630; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
1631; GFX10-NEXT:    s_mov_b64 s[0:1], 0
1632; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
1633; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1634; GFX10-NEXT:    s_waitcnt vmcnt(0)
1635; GFX10-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1636; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
1637; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
1638; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
1639; GFX10-NEXT:    s_waitcnt vmcnt(0)
1640; GFX10-NEXT:    buffer_gl1_inv
1641; GFX10-NEXT:    buffer_gl0_inv
1642; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1643; GFX10-NEXT:    v_mov_b32_e32 v6, v4
1644; GFX10-NEXT:    v_mov_b32_e32 v5, v3
1645; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1646; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1647; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
1648; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1649; GFX10-NEXT:    s_endpgm
1650;
1651; GFX11-LABEL: global_min_saddr_i64_nortn:
1652; GFX11:       ; %bb.0:
1653; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
1654; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
1655; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1656; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
1657; GFX11-NEXT:    s_mov_b64 s[0:1], 0
1658; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1659; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
1660; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1661; GFX11-NEXT:    s_waitcnt vmcnt(0)
1662; GFX11-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1663; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
1664; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
1665; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
1666; GFX11-NEXT:    s_waitcnt vmcnt(0)
1667; GFX11-NEXT:    buffer_gl1_inv
1668; GFX11-NEXT:    buffer_gl0_inv
1669; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1670; GFX11-NEXT:    v_mov_b32_e32 v6, v4
1671; GFX11-NEXT:    v_mov_b32_e32 v5, v3
1672; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1673; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1674; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1675; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
1676; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1677; GFX11-NEXT:    s_endpgm
1678;
1679; GFX12-LABEL: global_min_saddr_i64_nortn:
1680; GFX12:       ; %bb.0:
1681; GFX12-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
1682; GFX12-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
1683; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1684; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
1685; GFX12-NEXT:    s_mov_b64 s[0:1], 0
1686; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
1687; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1688; GFX12-NEXT:    s_wait_loadcnt 0x0
1689; GFX12-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1690; GFX12-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
1691; GFX12-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
1692; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1693; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1694; GFX12-NEXT:    s_wait_loadcnt 0x0
1695; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1696; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1697; GFX12-NEXT:    v_mov_b32_e32 v6, v4
1698; GFX12-NEXT:    v_mov_b32_e32 v5, v3
1699; GFX12-NEXT:    s_wait_alu 0xfffe
1700; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1701; GFX12-NEXT:    s_wait_alu 0xfffe
1702; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1703; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
1704; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1705; GFX12-NEXT:    s_endpgm
1706  %zext.offset = zext i32 %voffset to i64
1707  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1708  %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst
1709  ret void
1710}
1711
; Signed 64-bit atomicrmw min (seq_cst, no syncscope) on the global address
; %sbase + zext(%voffset) - 128 whose result is unused; the function returns
; void. The expected code is the same compare-and-swap retry loop as the
; zero-offset no-return variant, with the constant -128 byte offset folded
; into the immediate offset field of both the initial load and the cmpswap.
1712define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1713; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
1714; GFX9:       ; %bb.0:
1715; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
1716; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1717; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
1718; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
1719; GFX9-NEXT:    s_mov_b64 s[0:1], 0
1720; GFX9-NEXT:  .LBB15_1: ; %atomicrmw.start
1721; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1722; GFX9-NEXT:    s_waitcnt vmcnt(0)
1723; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1724; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
1725; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
1726; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
1727; GFX9-NEXT:    s_waitcnt vmcnt(0)
1728; GFX9-NEXT:    buffer_wbinvl1
1729; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1730; GFX9-NEXT:    v_mov_b32_e32 v6, v4
1731; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1732; GFX9-NEXT:    v_mov_b32_e32 v5, v3
1733; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1734; GFX9-NEXT:    s_cbranch_execnz .LBB15_1
1735; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
1736; GFX9-NEXT:    s_endpgm
1737;
1738; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
1739; GFX10:       ; %bb.0:
1740; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
1741; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
1742; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
1743; GFX10-NEXT:    s_mov_b64 s[0:1], 0
1744; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
1745; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1746; GFX10-NEXT:    s_waitcnt vmcnt(0)
1747; GFX10-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1748; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
1749; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
1750; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
1751; GFX10-NEXT:    s_waitcnt vmcnt(0)
1752; GFX10-NEXT:    buffer_gl1_inv
1753; GFX10-NEXT:    buffer_gl0_inv
1754; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1755; GFX10-NEXT:    v_mov_b32_e32 v6, v4
1756; GFX10-NEXT:    v_mov_b32_e32 v5, v3
1757; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1758; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1759; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
1760; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1761; GFX10-NEXT:    s_endpgm
1762;
1763; GFX11-LABEL: global_min_saddr_i64_nortn_neg128:
1764; GFX11:       ; %bb.0:
1765; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
1766; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
1767; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1768; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
1769; GFX11-NEXT:    s_mov_b64 s[0:1], 0
1770; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1771; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
1772; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1773; GFX11-NEXT:    s_waitcnt vmcnt(0)
1774; GFX11-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1775; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
1776; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
1777; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
1778; GFX11-NEXT:    s_waitcnt vmcnt(0)
1779; GFX11-NEXT:    buffer_gl1_inv
1780; GFX11-NEXT:    buffer_gl0_inv
1781; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1782; GFX11-NEXT:    v_mov_b32_e32 v6, v4
1783; GFX11-NEXT:    v_mov_b32_e32 v5, v3
1784; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1785; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1786; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1787; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
1788; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1789; GFX11-NEXT:    s_endpgm
1790;
1791; GFX12-LABEL: global_min_saddr_i64_nortn_neg128:
1792; GFX12:       ; %bb.0:
1793; GFX12-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
1794; GFX12-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
1795; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1796; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
1797; GFX12-NEXT:    s_mov_b64 s[0:1], 0
1798; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
1799; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1800; GFX12-NEXT:    s_wait_loadcnt 0x0
1801; GFX12-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1802; GFX12-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
1803; GFX12-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
1804; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1805; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1806; GFX12-NEXT:    s_wait_loadcnt 0x0
1807; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1808; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1809; GFX12-NEXT:    v_mov_b32_e32 v6, v4
1810; GFX12-NEXT:    v_mov_b32_e32 v5, v3
1811; GFX12-NEXT:    s_wait_alu 0xfffe
1812; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1813; GFX12-NEXT:    s_wait_alu 0xfffe
1814; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1815; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
1816; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1817; GFX12-NEXT:    s_endpgm
1818  %zext.offset = zext i32 %voffset to i64
1819  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1820  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1821  %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst
1822  ret void
1823}
1824
1825; --------------------------------------------------------------------------------
1826; atomicrmw umax
1827; --------------------------------------------------------------------------------
1828
; Unsigned 32-bit atomicrmw umax (seq_cst, no syncscope) on the global address
; %sbase + zext(%voffset); the previous memory value is returned as float.
; On every checked target the expected code is a compare-and-swap retry loop
; (%atomicrmw.start): v_max_u32 computes the candidate value and a 32-bit
; cmpswap publishes it. Only the initial load uses the SGPR-base (saddr)
; form; the cmpswap takes a 64-bit VGPR address with "off".
1829define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1830; GFX9-LABEL: global_umax_saddr_i32_rtn:
1831; GFX9:       ; %bb.0:
1832; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1833; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
1834; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1835; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
1836; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1837; GFX9-NEXT:    s_mov_b64 s[0:1], 0
1838; GFX9-NEXT:  .LBB16_1: ; %atomicrmw.start
1839; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1840; GFX9-NEXT:    s_waitcnt vmcnt(0)
1841; GFX9-NEXT:    v_mov_b32_e32 v5, v0
1842; GFX9-NEXT:    v_max_u32_e32 v4, v5, v1
1843; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
1844; GFX9-NEXT:    s_waitcnt vmcnt(0)
1845; GFX9-NEXT:    buffer_wbinvl1
1846; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1847; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1848; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1849; GFX9-NEXT:    s_cbranch_execnz .LBB16_1
1850; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
1851; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1852; GFX9-NEXT:    ; return to shader part epilog
1853;
1854; GFX10-LABEL: global_umax_saddr_i32_rtn:
1855; GFX10:       ; %bb.0:
1856; GFX10-NEXT:    v_mov_b32_e32 v2, v0
1857; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1858; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
1859; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
1860; GFX10-NEXT:    s_mov_b64 s[0:1], 0
1861; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
1862; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1863; GFX10-NEXT:    s_waitcnt vmcnt(0)
1864; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1865; GFX10-NEXT:    v_max_u32_e32 v4, v5, v1
1866; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
1867; GFX10-NEXT:    s_waitcnt vmcnt(0)
1868; GFX10-NEXT:    buffer_gl1_inv
1869; GFX10-NEXT:    buffer_gl0_inv
1870; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1871; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1872; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1873; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
1874; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1875; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
1876; GFX10-NEXT:    ; return to shader part epilog
1877;
1878; GFX11-LABEL: global_umax_saddr_i32_rtn:
1879; GFX11:       ; %bb.0:
1880; GFX11-NEXT:    v_mov_b32_e32 v2, v0
1881; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1882; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
1883; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1884; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1885; GFX11-NEXT:    s_mov_b64 s[0:1], 0
1886; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1887; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
1888; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1889; GFX11-NEXT:    s_waitcnt vmcnt(0)
1890; GFX11-NEXT:    v_mov_b32_e32 v5, v0
1891; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1892; GFX11-NEXT:    v_max_u32_e32 v4, v5, v1
1893; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
1894; GFX11-NEXT:    s_waitcnt vmcnt(0)
1895; GFX11-NEXT:    buffer_gl1_inv
1896; GFX11-NEXT:    buffer_gl0_inv
1897; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1898; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1899; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1900; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1901; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
1902; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1903; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
1904; GFX11-NEXT:    ; return to shader part epilog
1905;
1906; GFX12-LABEL: global_umax_saddr_i32_rtn:
1907; GFX12:       ; %bb.0:
1908; GFX12-NEXT:    v_mov_b32_e32 v2, v0
1909; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
1910; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
1911; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1912; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1913; GFX12-NEXT:    s_mov_b64 s[0:1], 0
1914; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
1915; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1916; GFX12-NEXT:    s_wait_loadcnt 0x0
1917; GFX12-NEXT:    v_mov_b32_e32 v5, v0
1918; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1919; GFX12-NEXT:    v_max_u32_e32 v4, v5, v1
1920; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1921; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1922; GFX12-NEXT:    s_wait_loadcnt 0x0
1923; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1924; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1925; GFX12-NEXT:    s_wait_alu 0xfffe
1926; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1927; GFX12-NEXT:    s_wait_alu 0xfffe
1928; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
1929; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
1930; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1931; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
1932; GFX12-NEXT:    ; return to shader part epilog
1933  %zext.offset = zext i32 %voffset to i64
1934  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1935  %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
1936  %cast.rtn = bitcast i32 %rtn to float
1937  ret float %cast.rtn
1938}
1939
; Unsigned 32-bit atomicrmw umax (seq_cst, no syncscope) on the global address
; %sbase + zext(%voffset) - 128; the previous memory value is returned as
; float. The expected code is the same compare-and-swap retry loop as the
; zero-offset variant, with the constant -128 byte offset folded into the
; immediate offset field of both the initial load and the cmpswap.
1940define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1941; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
1942; GFX9:       ; %bb.0:
1943; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1944; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
1945; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1946; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
1947; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1948; GFX9-NEXT:    s_mov_b64 s[0:1], 0
1949; GFX9-NEXT:  .LBB17_1: ; %atomicrmw.start
1950; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
1951; GFX9-NEXT:    s_waitcnt vmcnt(0)
1952; GFX9-NEXT:    v_mov_b32_e32 v5, v0
1953; GFX9-NEXT:    v_max_u32_e32 v4, v5, v1
1954; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1955; GFX9-NEXT:    s_waitcnt vmcnt(0)
1956; GFX9-NEXT:    buffer_wbinvl1
1957; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1958; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1959; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1960; GFX9-NEXT:    s_cbranch_execnz .LBB17_1
1961; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
1962; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1963; GFX9-NEXT:    ; return to shader part epilog
1964;
1965; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
1966; GFX10:       ; %bb.0:
1967; GFX10-NEXT:    v_mov_b32_e32 v2, v0
1968; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
1969; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
1970; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
1971; GFX10-NEXT:    s_mov_b64 s[0:1], 0
1972; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
1973; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1974; GFX10-NEXT:    s_waitcnt vmcnt(0)
1975; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1976; GFX10-NEXT:    v_max_u32_e32 v4, v5, v1
1977; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1978; GFX10-NEXT:    s_waitcnt vmcnt(0)
1979; GFX10-NEXT:    buffer_gl1_inv
1980; GFX10-NEXT:    buffer_gl0_inv
1981; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1982; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1983; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1984; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
1985; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1986; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
1987; GFX10-NEXT:    ; return to shader part epilog
1988;
1989; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128:
1990; GFX11:       ; %bb.0:
1991; GFX11-NEXT:    v_mov_b32_e32 v2, v0
1992; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
1993; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
1994; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1995; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1996; GFX11-NEXT:    s_mov_b64 s[0:1], 0
1997; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1998; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
1999; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2000; GFX11-NEXT:    s_waitcnt vmcnt(0)
2001; GFX11-NEXT:    v_mov_b32_e32 v5, v0
2002; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2003; GFX11-NEXT:    v_max_u32_e32 v4, v5, v1
2004; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
2005; GFX11-NEXT:    s_waitcnt vmcnt(0)
2006; GFX11-NEXT:    buffer_gl1_inv
2007; GFX11-NEXT:    buffer_gl0_inv
2008; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2009; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2010; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2011; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2012; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
2013; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2014; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
2015; GFX11-NEXT:    ; return to shader part epilog
2016;
2017; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128:
2018; GFX12:       ; %bb.0:
2019; GFX12-NEXT:    v_mov_b32_e32 v2, v0
2020; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
2021; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
2022; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2023; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2024; GFX12-NEXT:    s_mov_b64 s[0:1], 0
2025; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
2026; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2027; GFX12-NEXT:    s_wait_loadcnt 0x0
2028; GFX12-NEXT:    v_mov_b32_e32 v5, v0
2029; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2030; GFX12-NEXT:    v_max_u32_e32 v4, v5, v1
2031; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2032; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2033; GFX12-NEXT:    s_wait_loadcnt 0x0
2034; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2035; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2036; GFX12-NEXT:    s_wait_alu 0xfffe
2037; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2038; GFX12-NEXT:    s_wait_alu 0xfffe
2039; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2040; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
2041; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2042; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
2043; GFX12-NEXT:    ; return to shader part epilog
2044  %zext.offset = zext i32 %voffset to i64
2045  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2046  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2047  %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
2048  %cast.rtn = bitcast i32 %rtn to float
2049  ret float %cast.rtn
2050}
2051
; Unsigned-max atomicrmw on an i32 located at %sbase + zext(%voffset); the
; result is not used. The checks below expect each target to emit a loop
; (%atomicrmw.start): one initial global_load that addresses through the SGPR
; pair s[2:3], followed by global_atomic_cmpswap on a VGPR address pair (saddr
; operand "off"). The cmpswap result is compared with the expected value and
; fed back as the next expected value until exec is empty.
2052define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2053; GFX9-LABEL: global_umax_saddr_i32_nortn:
2054; GFX9:       ; %bb.0:
2055; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
2056; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2057; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
2058; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2059; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2060; GFX9-NEXT:  .LBB18_1: ; %atomicrmw.start
2061; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2062; GFX9-NEXT:    s_waitcnt vmcnt(0)
2063; GFX9-NEXT:    v_max_u32_e32 v4, v5, v1
2064; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2065; GFX9-NEXT:    s_waitcnt vmcnt(0)
2066; GFX9-NEXT:    buffer_wbinvl1
2067; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2068; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2069; GFX9-NEXT:    v_mov_b32_e32 v5, v0
2070; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2071; GFX9-NEXT:    s_cbranch_execnz .LBB18_1
2072; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2073; GFX9-NEXT:    s_endpgm
2074;
2075; GFX10-LABEL: global_umax_saddr_i32_nortn:
2076; GFX10:       ; %bb.0:
2077; GFX10-NEXT:    global_load_dword v5, v0, s[2:3]
2078; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
2079; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
2080; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2081; GFX10-NEXT:  .LBB18_1: ; %atomicrmw.start
2082; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2083; GFX10-NEXT:    s_waitcnt vmcnt(0)
2084; GFX10-NEXT:    v_max_u32_e32 v4, v5, v1
2085; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2086; GFX10-NEXT:    s_waitcnt vmcnt(0)
2087; GFX10-NEXT:    buffer_gl1_inv
2088; GFX10-NEXT:    buffer_gl0_inv
2089; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2090; GFX10-NEXT:    v_mov_b32_e32 v5, v0
2091; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2092; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2093; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
2094; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2095; GFX10-NEXT:    s_endpgm
2096;
2097; GFX11-LABEL: global_umax_saddr_i32_nortn:
2098; GFX11:       ; %bb.0:
2099; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
2100; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
2101; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2102; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2103; GFX11-NEXT:    s_mov_b64 s[0:1], 0
2104; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
2105; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
2106; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2107; GFX11-NEXT:    s_waitcnt vmcnt(0)
2108; GFX11-NEXT:    v_max_u32_e32 v4, v5, v1
2109; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
2110; GFX11-NEXT:    s_waitcnt vmcnt(0)
2111; GFX11-NEXT:    buffer_gl1_inv
2112; GFX11-NEXT:    buffer_gl0_inv
2113; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2114; GFX11-NEXT:    v_mov_b32_e32 v5, v0
2115; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2116; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2117; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2118; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
2119; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2120; GFX11-NEXT:    s_endpgm
2121;
2122; GFX12-LABEL: global_umax_saddr_i32_nortn:
2123; GFX12:       ; %bb.0:
2124; GFX12-NEXT:    global_load_b32 v5, v0, s[2:3]
2125; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
2126; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2127; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2128; GFX12-NEXT:    s_mov_b64 s[0:1], 0
2129; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
2130; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2131; GFX12-NEXT:    s_wait_loadcnt 0x0
2132; GFX12-NEXT:    v_max_u32_e32 v4, v5, v1
2133; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2134; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2135; GFX12-NEXT:    s_wait_loadcnt 0x0
2136; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2137; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2138; GFX12-NEXT:    v_mov_b32_e32 v5, v0
2139; GFX12-NEXT:    s_wait_alu 0xfffe
2140; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2141; GFX12-NEXT:    s_wait_alu 0xfffe
2142; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2143; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
2144; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2145; GFX12-NEXT:    s_endpgm
  ; The 32-bit offset is zero-extended before being added to the inreg base.
2146  %zext.offset = zext i32 %voffset to i64
2147  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; seq_cst with no syncscope; the value produced by the atomicrmw is never read.
2148  %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
2149  ret void
2150}
2151
; Same operation as the plain i32 no-return umax case, but the address has an
; extra constant byte offset of -128 applied after the zext'd %voffset. The
; checks below expect that constant to appear as "offset:-128" on both the
; initial global_load and the global_atomic_cmpswap inside the
; %atomicrmw.start loop, rather than being added into the address registers.
2152define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2153; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
2154; GFX9:       ; %bb.0:
2155; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
2156; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2157; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
2158; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2159; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2160; GFX9-NEXT:  .LBB19_1: ; %atomicrmw.start
2161; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2162; GFX9-NEXT:    s_waitcnt vmcnt(0)
2163; GFX9-NEXT:    v_max_u32_e32 v4, v5, v1
2164; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
2165; GFX9-NEXT:    s_waitcnt vmcnt(0)
2166; GFX9-NEXT:    buffer_wbinvl1
2167; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2168; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2169; GFX9-NEXT:    v_mov_b32_e32 v5, v0
2170; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2171; GFX9-NEXT:    s_cbranch_execnz .LBB19_1
2172; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2173; GFX9-NEXT:    s_endpgm
2174;
2175; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
2176; GFX10:       ; %bb.0:
2177; GFX10-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
2178; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
2179; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
2180; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2181; GFX10-NEXT:  .LBB19_1: ; %atomicrmw.start
2182; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2183; GFX10-NEXT:    s_waitcnt vmcnt(0)
2184; GFX10-NEXT:    v_max_u32_e32 v4, v5, v1
2185; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
2186; GFX10-NEXT:    s_waitcnt vmcnt(0)
2187; GFX10-NEXT:    buffer_gl1_inv
2188; GFX10-NEXT:    buffer_gl0_inv
2189; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2190; GFX10-NEXT:    v_mov_b32_e32 v5, v0
2191; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2192; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2193; GFX10-NEXT:    s_cbranch_execnz .LBB19_1
2194; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2195; GFX10-NEXT:    s_endpgm
2196;
2197; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128:
2198; GFX11:       ; %bb.0:
2199; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
2200; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
2201; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2202; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2203; GFX11-NEXT:    s_mov_b64 s[0:1], 0
2204; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
2205; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
2206; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2207; GFX11-NEXT:    s_waitcnt vmcnt(0)
2208; GFX11-NEXT:    v_max_u32_e32 v4, v5, v1
2209; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
2210; GFX11-NEXT:    s_waitcnt vmcnt(0)
2211; GFX11-NEXT:    buffer_gl1_inv
2212; GFX11-NEXT:    buffer_gl0_inv
2213; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2214; GFX11-NEXT:    v_mov_b32_e32 v5, v0
2215; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2216; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2217; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2218; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
2219; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2220; GFX11-NEXT:    s_endpgm
2221;
2222; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128:
2223; GFX12:       ; %bb.0:
2224; GFX12-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
2225; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
2226; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2227; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2228; GFX12-NEXT:    s_mov_b64 s[0:1], 0
2229; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
2230; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2231; GFX12-NEXT:    s_wait_loadcnt 0x0
2232; GFX12-NEXT:    v_max_u32_e32 v4, v5, v1
2233; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2234; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2235; GFX12-NEXT:    s_wait_loadcnt 0x0
2236; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2237; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2238; GFX12-NEXT:    v_mov_b32_e32 v5, v0
2239; GFX12-NEXT:    s_wait_alu 0xfffe
2240; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2241; GFX12-NEXT:    s_wait_alu 0xfffe
2242; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2243; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
2244; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2245; GFX12-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset), then a constant -128 byte adjustment.
2246  %zext.offset = zext i32 %voffset to i64
2247  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2248  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; seq_cst with no syncscope; the value produced by the atomicrmw is never read.
2249  %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
2250  ret void
2251}
2252
; Unsigned-max atomicrmw on an i64 located at %sbase + zext(%voffset); the old
; value is returned, bitcast to <2 x float>. The checks below expect a
; %atomicrmw.start loop on each target: the 64-bit max is built from
; v_cmp_gt_u64 plus two v_cndmask_b32, then passed to a 64-bit
; global_atomic_cmpswap on a VGPR address pair. After the loop the result in
; v[3:4] is copied to v0/v1 for the return.
2253define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2254; GFX9-LABEL: global_umax_saddr_i64_rtn:
2255; GFX9:       ; %bb.0:
2256; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
2257; GFX9-NEXT:    v_mov_b32_e32 v6, s3
2258; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
2259; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
2260; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2261; GFX9-NEXT:  .LBB20_1: ; %atomicrmw.start
2262; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2263; GFX9-NEXT:    s_waitcnt vmcnt(0)
2264; GFX9-NEXT:    v_mov_b32_e32 v10, v4
2265; GFX9-NEXT:    v_mov_b32_e32 v9, v3
2266; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2267; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
2268; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
2269; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
2270; GFX9-NEXT:    s_waitcnt vmcnt(0)
2271; GFX9-NEXT:    buffer_wbinvl1
2272; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2273; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2274; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2275; GFX9-NEXT:    s_cbranch_execnz .LBB20_1
2276; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2277; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
2278; GFX9-NEXT:    v_mov_b32_e32 v0, v3
2279; GFX9-NEXT:    v_mov_b32_e32 v1, v4
2280; GFX9-NEXT:    ; return to shader part epilog
2281;
2282; GFX10-LABEL: global_umax_saddr_i64_rtn:
2283; GFX10:       ; %bb.0:
2284; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
2285; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
2286; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
2287; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2288; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
2289; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2290; GFX10-NEXT:    s_waitcnt vmcnt(0)
2291; GFX10-NEXT:    v_mov_b32_e32 v10, v4
2292; GFX10-NEXT:    v_mov_b32_e32 v9, v3
2293; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2294; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
2295; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
2296; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
2297; GFX10-NEXT:    s_waitcnt vmcnt(0)
2298; GFX10-NEXT:    buffer_gl1_inv
2299; GFX10-NEXT:    buffer_gl0_inv
2300; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2301; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2302; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2303; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
2304; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2305; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
2306; GFX10-NEXT:    v_mov_b32_e32 v0, v3
2307; GFX10-NEXT:    v_mov_b32_e32 v1, v4
2308; GFX10-NEXT:    ; return to shader part epilog
2309;
2310; GFX11-LABEL: global_umax_saddr_i64_rtn:
2311; GFX11:       ; %bb.0:
2312; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
2313; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
2314; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2315; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
2316; GFX11-NEXT:    s_mov_b64 s[0:1], 0
2317; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
2318; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
2319; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2320; GFX11-NEXT:    s_waitcnt vmcnt(0)
2321; GFX11-NEXT:    v_mov_b32_e32 v10, v4
2322; GFX11-NEXT:    v_mov_b32_e32 v9, v3
2323; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2324; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2325; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
2326; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
2327; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
2328; GFX11-NEXT:    s_waitcnt vmcnt(0)
2329; GFX11-NEXT:    buffer_gl1_inv
2330; GFX11-NEXT:    buffer_gl0_inv
2331; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2332; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2333; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2334; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2335; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
2336; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2337; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
2338; GFX11-NEXT:    v_mov_b32_e32 v0, v3
2339; GFX11-NEXT:    v_mov_b32_e32 v1, v4
2340; GFX11-NEXT:    ; return to shader part epilog
2341;
2342; GFX12-LABEL: global_umax_saddr_i64_rtn:
2343; GFX12:       ; %bb.0:
2344; GFX12-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
2345; GFX12-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
2346; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2347; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
2348; GFX12-NEXT:    s_mov_b64 s[0:1], 0
2349; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
2350; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2351; GFX12-NEXT:    s_wait_loadcnt 0x0
2352; GFX12-NEXT:    v_mov_b32_e32 v10, v4
2353; GFX12-NEXT:    v_mov_b32_e32 v9, v3
2354; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2355; GFX12-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2356; GFX12-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
2357; GFX12-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
2358; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2359; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2360; GFX12-NEXT:    s_wait_loadcnt 0x0
2361; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2362; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2363; GFX12-NEXT:    s_wait_alu 0xfffe
2364; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2365; GFX12-NEXT:    s_wait_alu 0xfffe
2366; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2367; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
2368; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2369; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
2370; GFX12-NEXT:    v_mov_b32_e32 v0, v3
2371; GFX12-NEXT:    v_mov_b32_e32 v1, v4
2372; GFX12-NEXT:    ; return to shader part epilog
  ; The 32-bit offset is zero-extended before being added to the inreg base.
2373  %zext.offset = zext i32 %voffset to i64
2374  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2375  %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
  ; Reinterpret the i64 old value as two floats to match the return type.
2376  %cast.rtn = bitcast i64 %rtn to <2 x float>
2377  ret <2 x float> %cast.rtn
2378}
2379
; Returning i64 umax case with an extra constant byte offset of -128 applied
; after the zext'd %voffset. The checks below expect "offset:-128" on both the
; initial 64-bit global_load and the 64-bit global_atomic_cmpswap inside the
; %atomicrmw.start loop. The old value ends up in v0/v1 for the return.
2380define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2381; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
2382; GFX9:       ; %bb.0:
2383; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
2384; GFX9-NEXT:    v_mov_b32_e32 v6, s3
2385; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
2386; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
2387; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2388; GFX9-NEXT:  .LBB21_1: ; %atomicrmw.start
2389; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2390; GFX9-NEXT:    s_waitcnt vmcnt(0)
2391; GFX9-NEXT:    v_mov_b32_e32 v10, v4
2392; GFX9-NEXT:    v_mov_b32_e32 v9, v3
2393; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2394; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
2395; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
2396; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
2397; GFX9-NEXT:    s_waitcnt vmcnt(0)
2398; GFX9-NEXT:    buffer_wbinvl1
2399; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2400; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2401; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2402; GFX9-NEXT:    s_cbranch_execnz .LBB21_1
2403; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2404; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
2405; GFX9-NEXT:    v_mov_b32_e32 v0, v3
2406; GFX9-NEXT:    v_mov_b32_e32 v1, v4
2407; GFX9-NEXT:    ; return to shader part epilog
2408;
2409; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
2410; GFX10:       ; %bb.0:
2411; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
2412; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
2413; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
2414; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2415; GFX10-NEXT:  .LBB21_1: ; %atomicrmw.start
2416; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2417; GFX10-NEXT:    s_waitcnt vmcnt(0)
2418; GFX10-NEXT:    v_mov_b32_e32 v10, v4
2419; GFX10-NEXT:    v_mov_b32_e32 v9, v3
2420; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2421; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
2422; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
2423; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
2424; GFX10-NEXT:    s_waitcnt vmcnt(0)
2425; GFX10-NEXT:    buffer_gl1_inv
2426; GFX10-NEXT:    buffer_gl0_inv
2427; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2428; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2429; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2430; GFX10-NEXT:    s_cbranch_execnz .LBB21_1
2431; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2432; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
2433; GFX10-NEXT:    v_mov_b32_e32 v0, v3
2434; GFX10-NEXT:    v_mov_b32_e32 v1, v4
2435; GFX10-NEXT:    ; return to shader part epilog
2436;
2437; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128:
2438; GFX11:       ; %bb.0:
2439; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
2440; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
2441; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2442; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
2443; GFX11-NEXT:    s_mov_b64 s[0:1], 0
2444; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
2445; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
2446; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2447; GFX11-NEXT:    s_waitcnt vmcnt(0)
2448; GFX11-NEXT:    v_mov_b32_e32 v10, v4
2449; GFX11-NEXT:    v_mov_b32_e32 v9, v3
2450; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2451; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2452; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
2453; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
2454; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
2455; GFX11-NEXT:    s_waitcnt vmcnt(0)
2456; GFX11-NEXT:    buffer_gl1_inv
2457; GFX11-NEXT:    buffer_gl0_inv
2458; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2459; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2460; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2461; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2462; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
2463; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2464; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
2465; GFX11-NEXT:    v_mov_b32_e32 v0, v3
2466; GFX11-NEXT:    v_mov_b32_e32 v1, v4
2467; GFX11-NEXT:    ; return to shader part epilog
2468;
2469; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128:
2470; GFX12:       ; %bb.0:
2471; GFX12-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
2472; GFX12-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
2473; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2474; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
2475; GFX12-NEXT:    s_mov_b64 s[0:1], 0
2476; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
2477; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2478; GFX12-NEXT:    s_wait_loadcnt 0x0
2479; GFX12-NEXT:    v_mov_b32_e32 v10, v4
2480; GFX12-NEXT:    v_mov_b32_e32 v9, v3
2481; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2482; GFX12-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2483; GFX12-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
2484; GFX12-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
2485; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2486; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2487; GFX12-NEXT:    s_wait_loadcnt 0x0
2488; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2489; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2490; GFX12-NEXT:    s_wait_alu 0xfffe
2491; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2492; GFX12-NEXT:    s_wait_alu 0xfffe
2493; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2494; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
2495; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2496; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
2497; GFX12-NEXT:    v_mov_b32_e32 v0, v3
2498; GFX12-NEXT:    v_mov_b32_e32 v1, v4
2499; GFX12-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zext(%voffset), then a constant -128 byte adjustment.
2500  %zext.offset = zext i32 %voffset to i64
2501  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2502  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2503  %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
  ; Reinterpret the i64 old value as two floats to match the return type.
2504  %cast.rtn = bitcast i64 %rtn to <2 x float>
2505  ret <2 x float> %cast.rtn
2506}
2507
; Unsigned-max atomicrmw on an i64 located at %sbase + zext(%voffset); the
; result is not used. The checks below expect a %atomicrmw.start loop on each
; target: the 64-bit max is selected with v_cmp_gt_u64 plus two v_cndmask_b32
; into v[3:4], which together with the expected value in v[5:6] forms the
; v[3:6] data operand of the 64-bit global_atomic_cmpswap. The returned value
; is copied back into v[5:6] as the next expected value.
2508define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2509; GFX9-LABEL: global_umax_saddr_i64_nortn:
2510; GFX9:       ; %bb.0:
2511; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
2512; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2513; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
2514; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
2515; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2516; GFX9-NEXT:  .LBB22_1: ; %atomicrmw.start
2517; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2518; GFX9-NEXT:    s_waitcnt vmcnt(0)
2519; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2520; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
2521; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
2522; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
2523; GFX9-NEXT:    s_waitcnt vmcnt(0)
2524; GFX9-NEXT:    buffer_wbinvl1
2525; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2526; GFX9-NEXT:    v_mov_b32_e32 v6, v4
2527; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2528; GFX9-NEXT:    v_mov_b32_e32 v5, v3
2529; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2530; GFX9-NEXT:    s_cbranch_execnz .LBB22_1
2531; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2532; GFX9-NEXT:    s_endpgm
2533;
2534; GFX10-LABEL: global_umax_saddr_i64_nortn:
2535; GFX10:       ; %bb.0:
2536; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
2537; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
2538; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
2539; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2540; GFX10-NEXT:  .LBB22_1: ; %atomicrmw.start
2541; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2542; GFX10-NEXT:    s_waitcnt vmcnt(0)
2543; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2544; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
2545; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
2546; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
2547; GFX10-NEXT:    s_waitcnt vmcnt(0)
2548; GFX10-NEXT:    buffer_gl1_inv
2549; GFX10-NEXT:    buffer_gl0_inv
2550; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2551; GFX10-NEXT:    v_mov_b32_e32 v6, v4
2552; GFX10-NEXT:    v_mov_b32_e32 v5, v3
2553; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2554; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2555; GFX10-NEXT:    s_cbranch_execnz .LBB22_1
2556; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2557; GFX10-NEXT:    s_endpgm
2558;
2559; GFX11-LABEL: global_umax_saddr_i64_nortn:
2560; GFX11:       ; %bb.0:
2561; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
2562; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
2563; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2564; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
2565; GFX11-NEXT:    s_mov_b64 s[0:1], 0
2566; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
2567; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
2568; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2569; GFX11-NEXT:    s_waitcnt vmcnt(0)
2570; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2571; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
2572; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
2573; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
2574; GFX11-NEXT:    s_waitcnt vmcnt(0)
2575; GFX11-NEXT:    buffer_gl1_inv
2576; GFX11-NEXT:    buffer_gl0_inv
2577; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2578; GFX11-NEXT:    v_mov_b32_e32 v6, v4
2579; GFX11-NEXT:    v_mov_b32_e32 v5, v3
2580; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2581; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2582; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2583; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
2584; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2585; GFX11-NEXT:    s_endpgm
2586;
2587; GFX12-LABEL: global_umax_saddr_i64_nortn:
2588; GFX12:       ; %bb.0:
2589; GFX12-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
2590; GFX12-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
2591; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2592; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
2593; GFX12-NEXT:    s_mov_b64 s[0:1], 0
2594; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
2595; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2596; GFX12-NEXT:    s_wait_loadcnt 0x0
2597; GFX12-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2598; GFX12-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
2599; GFX12-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
2600; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2601; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2602; GFX12-NEXT:    s_wait_loadcnt 0x0
2603; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2604; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2605; GFX12-NEXT:    v_mov_b32_e32 v6, v4
2606; GFX12-NEXT:    v_mov_b32_e32 v5, v3
2607; GFX12-NEXT:    s_wait_alu 0xfffe
2608; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2609; GFX12-NEXT:    s_wait_alu 0xfffe
2610; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2611; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
2612; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2613; GFX12-NEXT:    s_endpgm
  ; The 32-bit offset is zero-extended before being added to the inreg base.
2614  %zext.offset = zext i32 %voffset to i64
2615  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; seq_cst with no syncscope; the value produced by the atomicrmw is never read.
2616  %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
2617  ret void
2618}
2619
2620define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2621; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
2622; GFX9:       ; %bb.0:
2623; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
2624; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2625; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
2626; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
2627; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2628; GFX9-NEXT:  .LBB23_1: ; %atomicrmw.start
2629; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2630; GFX9-NEXT:    s_waitcnt vmcnt(0)
2631; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2632; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
2633; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
2634; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
2635; GFX9-NEXT:    s_waitcnt vmcnt(0)
2636; GFX9-NEXT:    buffer_wbinvl1
2637; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2638; GFX9-NEXT:    v_mov_b32_e32 v6, v4
2639; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2640; GFX9-NEXT:    v_mov_b32_e32 v5, v3
2641; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2642; GFX9-NEXT:    s_cbranch_execnz .LBB23_1
2643; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2644; GFX9-NEXT:    s_endpgm
2645;
2646; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
2647; GFX10:       ; %bb.0:
2648; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
2649; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
2650; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
2651; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2652; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
2653; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2654; GFX10-NEXT:    s_waitcnt vmcnt(0)
2655; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2656; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
2657; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
2658; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
2659; GFX10-NEXT:    s_waitcnt vmcnt(0)
2660; GFX10-NEXT:    buffer_gl1_inv
2661; GFX10-NEXT:    buffer_gl0_inv
2662; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2663; GFX10-NEXT:    v_mov_b32_e32 v6, v4
2664; GFX10-NEXT:    v_mov_b32_e32 v5, v3
2665; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2666; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2667; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
2668; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2669; GFX10-NEXT:    s_endpgm
2670;
2671; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128:
2672; GFX11:       ; %bb.0:
2673; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
2674; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
2675; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2676; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
2677; GFX11-NEXT:    s_mov_b64 s[0:1], 0
2678; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
2679; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
2680; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2681; GFX11-NEXT:    s_waitcnt vmcnt(0)
2682; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2683; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
2684; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
2685; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
2686; GFX11-NEXT:    s_waitcnt vmcnt(0)
2687; GFX11-NEXT:    buffer_gl1_inv
2688; GFX11-NEXT:    buffer_gl0_inv
2689; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2690; GFX11-NEXT:    v_mov_b32_e32 v6, v4
2691; GFX11-NEXT:    v_mov_b32_e32 v5, v3
2692; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2693; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2694; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2695; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
2696; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2697; GFX11-NEXT:    s_endpgm
2698;
2699; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128:
2700; GFX12:       ; %bb.0:
2701; GFX12-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
2702; GFX12-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
2703; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2704; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
2705; GFX12-NEXT:    s_mov_b64 s[0:1], 0
2706; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
2707; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2708; GFX12-NEXT:    s_wait_loadcnt 0x0
2709; GFX12-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2710; GFX12-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
2711; GFX12-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
2712; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2713; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2714; GFX12-NEXT:    s_wait_loadcnt 0x0
2715; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2716; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2717; GFX12-NEXT:    v_mov_b32_e32 v6, v4
2718; GFX12-NEXT:    v_mov_b32_e32 v5, v3
2719; GFX12-NEXT:    s_wait_alu 0xfffe
2720; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2721; GFX12-NEXT:    s_wait_alu 0xfffe
2722; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2723; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
2724; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2725; GFX12-NEXT:    s_endpgm
2726  %zext.offset = zext i32 %voffset to i64
2727  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2728  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2729  %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
2730  ret void
2731}
2732
2733; --------------------------------------------------------------------------------
2734; atomicrmw umin
2735; --------------------------------------------------------------------------------
2736
; Returning 32-bit unsigned-min atomic on %sbase + zext(%voffset), with no
; additional constant offset. The previous memory value is returned (bitcast
; to float).
;
; The CHECK lines below are autogenerated (see the NOTE at the top of the file)
; and should be regenerated rather than edited by hand. For every RUN prefix
; they expect the atomicrmw to be expanded into a compare-and-swap retry loop
; (%atomicrmw.start):
;   * the initial value is fetched with a global load that uses the saddr form
;     (VGPR offset v0 plus SGPR base s[2:3]);
;   * the new value is formed with v_min_u32;
;   * the global_atomic_cmpswap itself addresses memory through a 64-bit VGPR
;     pair built by the v_add_co sequence (saddr operand is `off`);
;   * the loop exits once the value returned by cmpswap equals the expected one.
2737define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2738; GFX9-LABEL: global_umin_saddr_i32_rtn:
2739; GFX9:       ; %bb.0:
2740; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2741; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
2742; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2743; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
2744; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2745; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2746; GFX9-NEXT:  .LBB24_1: ; %atomicrmw.start
2747; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2748; GFX9-NEXT:    s_waitcnt vmcnt(0)
2749; GFX9-NEXT:    v_mov_b32_e32 v5, v0
2750; GFX9-NEXT:    v_min_u32_e32 v4, v5, v1
2751; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2752; GFX9-NEXT:    s_waitcnt vmcnt(0)
2753; GFX9-NEXT:    buffer_wbinvl1
2754; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2755; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2756; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2757; GFX9-NEXT:    s_cbranch_execnz .LBB24_1
2758; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2759; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
2760; GFX9-NEXT:    ; return to shader part epilog
2761;
2762; GFX10-LABEL: global_umin_saddr_i32_rtn:
2763; GFX10:       ; %bb.0:
2764; GFX10-NEXT:    v_mov_b32_e32 v2, v0
2765; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
2766; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
2767; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
2768; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2769; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
2770; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2771; GFX10-NEXT:    s_waitcnt vmcnt(0)
2772; GFX10-NEXT:    v_mov_b32_e32 v5, v0
2773; GFX10-NEXT:    v_min_u32_e32 v4, v5, v1
2774; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2775; GFX10-NEXT:    s_waitcnt vmcnt(0)
2776; GFX10-NEXT:    buffer_gl1_inv
2777; GFX10-NEXT:    buffer_gl0_inv
2778; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2779; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2780; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2781; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
2782; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2783; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
2784; GFX10-NEXT:    ; return to shader part epilog
2785;
2786; GFX11-LABEL: global_umin_saddr_i32_rtn:
2787; GFX11:       ; %bb.0:
2788; GFX11-NEXT:    v_mov_b32_e32 v2, v0
2789; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
2790; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
2791; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2792; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2793; GFX11-NEXT:    s_mov_b64 s[0:1], 0
2794; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
2795; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
2796; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2797; GFX11-NEXT:    s_waitcnt vmcnt(0)
2798; GFX11-NEXT:    v_mov_b32_e32 v5, v0
2799; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2800; GFX11-NEXT:    v_min_u32_e32 v4, v5, v1
2801; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
2802; GFX11-NEXT:    s_waitcnt vmcnt(0)
2803; GFX11-NEXT:    buffer_gl1_inv
2804; GFX11-NEXT:    buffer_gl0_inv
2805; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2806; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2807; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2808; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2809; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
2810; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2811; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
2812; GFX11-NEXT:    ; return to shader part epilog
2813;
2814; GFX12-LABEL: global_umin_saddr_i32_rtn:
2815; GFX12:       ; %bb.0:
2816; GFX12-NEXT:    v_mov_b32_e32 v2, v0
2817; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
2818; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
2819; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2820; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2821; GFX12-NEXT:    s_mov_b64 s[0:1], 0
2822; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
2823; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2824; GFX12-NEXT:    s_wait_loadcnt 0x0
2825; GFX12-NEXT:    v_mov_b32_e32 v5, v0
2826; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2827; GFX12-NEXT:    v_min_u32_e32 v4, v5, v1
2828; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2829; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2830; GFX12-NEXT:    s_wait_loadcnt 0x0
2831; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2832; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2833; GFX12-NEXT:    s_wait_alu 0xfffe
2834; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2835; GFX12-NEXT:    s_wait_alu 0xfffe
2836; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2837; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
2838; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2839; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
2840; GFX12-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zero-extended 32-bit %voffset, in bytes (i8 GEP).
2841  %zext.offset = zext i32 %voffset to i64
2842  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; seq_cst unsigned min; %rtn is the value that was in memory before the update.
2843  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
  ; The function returns float, so the i32 result is bitcast before returning.
2844  %cast.rtn = bitcast i32 %rtn to float
2845  ret float %cast.rtn
2846}
2847
; Returning 32-bit unsigned-min atomic on %sbase + zext(%voffset) - 128.
; Same shape as the zero-offset variant, but with an extra constant -128 byte
; GEP on top of the variable offset.
;
; The autogenerated CHECK lines expect, for every RUN prefix, a compare-and-swap
; retry loop (%atomicrmw.start) in which the constant is carried in the
; instruction's immediate field: both the initial saddr-form global load and
; the global_atomic_cmpswap are printed with `offset:-128`, while the VGPR
; address pair built by v_add_co only contains base + voffset.
2848define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2849; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
2850; GFX9:       ; %bb.0:
2851; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2852; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
2853; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2854; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
2855; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2856; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2857; GFX9-NEXT:  .LBB25_1: ; %atomicrmw.start
2858; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2859; GFX9-NEXT:    s_waitcnt vmcnt(0)
2860; GFX9-NEXT:    v_mov_b32_e32 v5, v0
2861; GFX9-NEXT:    v_min_u32_e32 v4, v5, v1
2862; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
2863; GFX9-NEXT:    s_waitcnt vmcnt(0)
2864; GFX9-NEXT:    buffer_wbinvl1
2865; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2866; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2867; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2868; GFX9-NEXT:    s_cbranch_execnz .LBB25_1
2869; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2870; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
2871; GFX9-NEXT:    ; return to shader part epilog
2872;
2873; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
2874; GFX10:       ; %bb.0:
2875; GFX10-NEXT:    v_mov_b32_e32 v2, v0
2876; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
2877; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
2878; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
2879; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2880; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
2881; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2882; GFX10-NEXT:    s_waitcnt vmcnt(0)
2883; GFX10-NEXT:    v_mov_b32_e32 v5, v0
2884; GFX10-NEXT:    v_min_u32_e32 v4, v5, v1
2885; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
2886; GFX10-NEXT:    s_waitcnt vmcnt(0)
2887; GFX10-NEXT:    buffer_gl1_inv
2888; GFX10-NEXT:    buffer_gl0_inv
2889; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2890; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2891; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2892; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
2893; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
2894; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
2895; GFX10-NEXT:    ; return to shader part epilog
2896;
2897; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128:
2898; GFX11:       ; %bb.0:
2899; GFX11-NEXT:    v_mov_b32_e32 v2, v0
2900; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
2901; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
2902; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2903; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2904; GFX11-NEXT:    s_mov_b64 s[0:1], 0
2905; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
2906; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
2907; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2908; GFX11-NEXT:    s_waitcnt vmcnt(0)
2909; GFX11-NEXT:    v_mov_b32_e32 v5, v0
2910; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2911; GFX11-NEXT:    v_min_u32_e32 v4, v5, v1
2912; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
2913; GFX11-NEXT:    s_waitcnt vmcnt(0)
2914; GFX11-NEXT:    buffer_gl1_inv
2915; GFX11-NEXT:    buffer_gl0_inv
2916; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2917; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2918; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2919; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2920; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
2921; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2922; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
2923; GFX11-NEXT:    ; return to shader part epilog
2924;
2925; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128:
2926; GFX12:       ; %bb.0:
2927; GFX12-NEXT:    v_mov_b32_e32 v2, v0
2928; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
2929; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
2930; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2931; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2932; GFX12-NEXT:    s_mov_b64 s[0:1], 0
2933; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
2934; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2935; GFX12-NEXT:    s_wait_loadcnt 0x0
2936; GFX12-NEXT:    v_mov_b32_e32 v5, v0
2937; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2938; GFX12-NEXT:    v_min_u32_e32 v4, v5, v1
2939; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2940; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2941; GFX12-NEXT:    s_wait_loadcnt 0x0
2942; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2943; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2944; GFX12-NEXT:    s_wait_alu 0xfffe
2945; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2946; GFX12-NEXT:    s_wait_alu 0xfffe
2947; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
2948; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
2949; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2950; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
2951; GFX12-NEXT:    ; return to shader part epilog
  ; Variable part of the address: %sbase + zero-extended %voffset (bytes).
2952  %zext.offset = zext i32 %voffset to i64
2953  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; Constant part: a further -128 bytes, which the checks expect as offset:-128.
2954  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; seq_cst unsigned min; %rtn is the value that was in memory before the update.
2955  %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
2956  %cast.rtn = bitcast i32 %rtn to float
2957  ret float %cast.rtn
2958}
2959
; Non-returning 32-bit unsigned-min atomic on %sbase + zext(%voffset): the
; atomicrmw result is unused and the function returns void.
;
; The autogenerated CHECK lines still expect a compare-and-swap retry loop
; (%atomicrmw.start) for every RUN prefix. Differences from the returning
; variant that are visible in the expected output:
;   * the initial saddr-form load writes the expected value straight into v5,
;     and the cmpswap result in v0 is copied back to v5 for the next iteration;
;   * the cmpswap still requests the old value (glc / TH_ATOMIC_RETURN), since
;     the loop needs it for the equality test;
;   * the function ends with s_endpgm and does not restore exec after the loop.
2960define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2961; GFX9-LABEL: global_umin_saddr_i32_nortn:
2962; GFX9:       ; %bb.0:
2963; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
2964; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2965; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
2966; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2967; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2968; GFX9-NEXT:  .LBB26_1: ; %atomicrmw.start
2969; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2970; GFX9-NEXT:    s_waitcnt vmcnt(0)
2971; GFX9-NEXT:    v_min_u32_e32 v4, v5, v1
2972; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2973; GFX9-NEXT:    s_waitcnt vmcnt(0)
2974; GFX9-NEXT:    buffer_wbinvl1
2975; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2976; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2977; GFX9-NEXT:    v_mov_b32_e32 v5, v0
2978; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2979; GFX9-NEXT:    s_cbranch_execnz .LBB26_1
2980; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
2981; GFX9-NEXT:    s_endpgm
2982;
2983; GFX10-LABEL: global_umin_saddr_i32_nortn:
2984; GFX10:       ; %bb.0:
2985; GFX10-NEXT:    global_load_dword v5, v0, s[2:3]
2986; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
2987; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
2988; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2989; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
2990; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2991; GFX10-NEXT:    s_waitcnt vmcnt(0)
2992; GFX10-NEXT:    v_min_u32_e32 v4, v5, v1
2993; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2994; GFX10-NEXT:    s_waitcnt vmcnt(0)
2995; GFX10-NEXT:    buffer_gl1_inv
2996; GFX10-NEXT:    buffer_gl0_inv
2997; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
2998; GFX10-NEXT:    v_mov_b32_e32 v5, v0
2999; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3000; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3001; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
3002; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3003; GFX10-NEXT:    s_endpgm
3004;
3005; GFX11-LABEL: global_umin_saddr_i32_nortn:
3006; GFX11:       ; %bb.0:
3007; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
3008; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
3009; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3010; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
3011; GFX11-NEXT:    s_mov_b64 s[0:1], 0
3012; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
3013; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
3014; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3015; GFX11-NEXT:    s_waitcnt vmcnt(0)
3016; GFX11-NEXT:    v_min_u32_e32 v4, v5, v1
3017; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
3018; GFX11-NEXT:    s_waitcnt vmcnt(0)
3019; GFX11-NEXT:    buffer_gl1_inv
3020; GFX11-NEXT:    buffer_gl0_inv
3021; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
3022; GFX11-NEXT:    v_mov_b32_e32 v5, v0
3023; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3024; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3025; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3026; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
3027; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
3028; GFX11-NEXT:    s_endpgm
3029;
3030; GFX12-LABEL: global_umin_saddr_i32_nortn:
3031; GFX12:       ; %bb.0:
3032; GFX12-NEXT:    global_load_b32 v5, v0, s[2:3]
3033; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
3034; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3035; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
3036; GFX12-NEXT:    s_mov_b64 s[0:1], 0
3037; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
3038; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3039; GFX12-NEXT:    s_wait_loadcnt 0x0
3040; GFX12-NEXT:    v_min_u32_e32 v4, v5, v1
3041; GFX12-NEXT:    global_wb scope:SCOPE_SYS
3042; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3043; GFX12-NEXT:    s_wait_loadcnt 0x0
3044; GFX12-NEXT:    global_inv scope:SCOPE_SYS
3045; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
3046; GFX12-NEXT:    v_mov_b32_e32 v5, v0
3047; GFX12-NEXT:    s_wait_alu 0xfffe
3048; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3049; GFX12-NEXT:    s_wait_alu 0xfffe
3050; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3051; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
3052; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3053; GFX12-NEXT:    s_endpgm
  ; Address = %sbase + zero-extended 32-bit %voffset, in bytes (i8 GEP).
3054  %zext.offset = zext i32 %voffset to i64
3055  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; The old value is deliberately left unused: this is the no-return variant.
3056  %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
3057  ret void
3058}
3059
; Non-returning 32-bit unsigned-min atomic on %sbase + zext(%voffset) - 128:
; the atomicrmw result is unused and the function returns void.
;
; The autogenerated CHECK lines expect the same compare-and-swap retry loop as
; the zero-offset no-return variant, with the constant carried as an immediate:
; both the initial saddr-form global load and the global_atomic_cmpswap are
; printed with `offset:-128`, and the VGPR address pair built by v_add_co only
; contains base + voffset. The function ends with s_endpgm.
3060define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3061; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
3062; GFX9:       ; %bb.0:
3063; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
3064; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3065; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
3066; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
3067; GFX9-NEXT:    s_mov_b64 s[0:1], 0
3068; GFX9-NEXT:  .LBB27_1: ; %atomicrmw.start
3069; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
3070; GFX9-NEXT:    s_waitcnt vmcnt(0)
3071; GFX9-NEXT:    v_min_u32_e32 v4, v5, v1
3072; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
3073; GFX9-NEXT:    s_waitcnt vmcnt(0)
3074; GFX9-NEXT:    buffer_wbinvl1
3075; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
3076; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3077; GFX9-NEXT:    v_mov_b32_e32 v5, v0
3078; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3079; GFX9-NEXT:    s_cbranch_execnz .LBB27_1
3080; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
3081; GFX9-NEXT:    s_endpgm
3082;
3083; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
3084; GFX10:       ; %bb.0:
3085; GFX10-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
3086; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
3087; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
3088; GFX10-NEXT:    s_mov_b64 s[0:1], 0
3089; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
3090; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3091; GFX10-NEXT:    s_waitcnt vmcnt(0)
3092; GFX10-NEXT:    v_min_u32_e32 v4, v5, v1
3093; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
3094; GFX10-NEXT:    s_waitcnt vmcnt(0)
3095; GFX10-NEXT:    buffer_gl1_inv
3096; GFX10-NEXT:    buffer_gl0_inv
3097; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
3098; GFX10-NEXT:    v_mov_b32_e32 v5, v0
3099; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3100; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3101; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
3102; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3103; GFX10-NEXT:    s_endpgm
3104;
3105; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
3106; GFX11:       ; %bb.0:
3107; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
3108; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
3109; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3110; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
3111; GFX11-NEXT:    s_mov_b64 s[0:1], 0
3112; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
3113; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
3114; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3115; GFX11-NEXT:    s_waitcnt vmcnt(0)
3116; GFX11-NEXT:    v_min_u32_e32 v4, v5, v1
3117; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
3118; GFX11-NEXT:    s_waitcnt vmcnt(0)
3119; GFX11-NEXT:    buffer_gl1_inv
3120; GFX11-NEXT:    buffer_gl0_inv
3121; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
3122; GFX11-NEXT:    v_mov_b32_e32 v5, v0
3123; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3124; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3125; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3126; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
3127; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
3128; GFX11-NEXT:    s_endpgm
3129;
3130; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128:
3131; GFX12:       ; %bb.0:
3132; GFX12-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
3133; GFX12-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
3134; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3135; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
3136; GFX12-NEXT:    s_mov_b64 s[0:1], 0
3137; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
3138; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3139; GFX12-NEXT:    s_wait_loadcnt 0x0
3140; GFX12-NEXT:    v_min_u32_e32 v4, v5, v1
3141; GFX12-NEXT:    global_wb scope:SCOPE_SYS
3142; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3143; GFX12-NEXT:    s_wait_loadcnt 0x0
3144; GFX12-NEXT:    global_inv scope:SCOPE_SYS
3145; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
3146; GFX12-NEXT:    v_mov_b32_e32 v5, v0
3147; GFX12-NEXT:    s_wait_alu 0xfffe
3148; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3149; GFX12-NEXT:    s_wait_alu 0xfffe
3150; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3151; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
3152; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3153; GFX12-NEXT:    s_endpgm
  ; Variable part of the address: %sbase + zero-extended %voffset (bytes).
3154  %zext.offset = zext i32 %voffset to i64
3155  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; Constant part: a further -128 bytes, which the checks expect as offset:-128.
3156  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; The old value is deliberately left unused: this is the no-return variant.
3157  %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
3158  ret void
3159}
3160
; Returning 64-bit unsigned-min atomic on %sbase + zext(%voffset), with no
; additional constant offset. The previous memory value is returned as
; <2 x float>.
;
; The autogenerated CHECK lines expect a compare-and-swap retry loop
; (%atomicrmw.start) for every RUN prefix:
;   * the initial value is fetched with a 64-bit saddr-form global load
;     (VGPR offset v0 plus SGPR base s[2:3]) into v[3:4];
;   * the 64-bit minimum is selected with v_cmp_le_u64 followed by one
;     v_cndmask_b32 per 32-bit half;
;   * the 64-bit global_atomic_cmpswap takes the {new, expected} pair in
;     v[7:10] and addresses memory through the VGPR pair v[5:6] (`off`);
;   * after the loop the result is copied from v[3:4] into v0/v1.
3161define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3162; GFX9-LABEL: global_umin_saddr_i64_rtn:
3163; GFX9:       ; %bb.0:
3164; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
3165; GFX9-NEXT:    v_mov_b32_e32 v6, s3
3166; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
3167; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
3168; GFX9-NEXT:    s_mov_b64 s[0:1], 0
3169; GFX9-NEXT:  .LBB28_1: ; %atomicrmw.start
3170; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
3171; GFX9-NEXT:    s_waitcnt vmcnt(0)
3172; GFX9-NEXT:    v_mov_b32_e32 v10, v4
3173; GFX9-NEXT:    v_mov_b32_e32 v9, v3
3174; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3175; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
3176; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
3177; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
3178; GFX9-NEXT:    s_waitcnt vmcnt(0)
3179; GFX9-NEXT:    buffer_wbinvl1
3180; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3181; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3182; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3183; GFX9-NEXT:    s_cbranch_execnz .LBB28_1
3184; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
3185; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
3186; GFX9-NEXT:    v_mov_b32_e32 v0, v3
3187; GFX9-NEXT:    v_mov_b32_e32 v1, v4
3188; GFX9-NEXT:    ; return to shader part epilog
3189;
3190; GFX10-LABEL: global_umin_saddr_i64_rtn:
3191; GFX10:       ; %bb.0:
3192; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
3193; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
3194; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
3195; GFX10-NEXT:    s_mov_b64 s[0:1], 0
3196; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
3197; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3198; GFX10-NEXT:    s_waitcnt vmcnt(0)
3199; GFX10-NEXT:    v_mov_b32_e32 v10, v4
3200; GFX10-NEXT:    v_mov_b32_e32 v9, v3
3201; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3202; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
3203; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
3204; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
3205; GFX10-NEXT:    s_waitcnt vmcnt(0)
3206; GFX10-NEXT:    buffer_gl1_inv
3207; GFX10-NEXT:    buffer_gl0_inv
3208; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3209; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3210; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3211; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
3212; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3213; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
3214; GFX10-NEXT:    v_mov_b32_e32 v0, v3
3215; GFX10-NEXT:    v_mov_b32_e32 v1, v4
3216; GFX10-NEXT:    ; return to shader part epilog
3217;
3218; GFX11-LABEL: global_umin_saddr_i64_rtn:
3219; GFX11:       ; %bb.0:
3220; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
3221; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
3222; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3223; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
3224; GFX11-NEXT:    s_mov_b64 s[0:1], 0
3225; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
3226; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
3227; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3228; GFX11-NEXT:    s_waitcnt vmcnt(0)
3229; GFX11-NEXT:    v_mov_b32_e32 v10, v4
3230; GFX11-NEXT:    v_mov_b32_e32 v9, v3
3231; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3232; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3233; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
3234; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
3235; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
3236; GFX11-NEXT:    s_waitcnt vmcnt(0)
3237; GFX11-NEXT:    buffer_gl1_inv
3238; GFX11-NEXT:    buffer_gl0_inv
3239; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3240; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3241; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3242; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3243; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
3244; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
3245; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
3246; GFX11-NEXT:    v_mov_b32_e32 v0, v3
3247; GFX11-NEXT:    v_mov_b32_e32 v1, v4
3248; GFX11-NEXT:    ; return to shader part epilog
3249;
3250; GFX12-LABEL: global_umin_saddr_i64_rtn:
3251; GFX12:       ; %bb.0:
3252; GFX12-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
3253; GFX12-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
3254; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3255; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
3256; GFX12-NEXT:    s_mov_b64 s[0:1], 0
3257; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
3258; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3259; GFX12-NEXT:    s_wait_loadcnt 0x0
3260; GFX12-NEXT:    v_mov_b32_e32 v10, v4
3261; GFX12-NEXT:    v_mov_b32_e32 v9, v3
3262; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3263; GFX12-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3264; GFX12-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
3265; GFX12-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
3266; GFX12-NEXT:    global_wb scope:SCOPE_SYS
3267; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3268; GFX12-NEXT:    s_wait_loadcnt 0x0
3269; GFX12-NEXT:    global_inv scope:SCOPE_SYS
3270; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3271; GFX12-NEXT:    s_wait_alu 0xfffe
3272; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3273; GFX12-NEXT:    s_wait_alu 0xfffe
3274; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3275; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
3276; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3277; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
3278; GFX12-NEXT:    v_mov_b32_e32 v0, v3
3279; GFX12-NEXT:    v_mov_b32_e32 v1, v4
3280; GFX12-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zero-extended 32-bit %voffset, in bytes (i8 GEP).
3281  %zext.offset = zext i32 %voffset to i64
3282  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; seq_cst unsigned min on an i64; %rtn is the value in memory before the update.
3283  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
  ; The function returns <2 x float>, so the i64 result is bitcast before returning.
3284  %cast.rtn = bitcast i64 %rtn to <2 x float>
3285  ret <2 x float> %cast.rtn
3286}
3287
3288define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3289; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
3290; GFX9:       ; %bb.0:
3291; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
3292; GFX9-NEXT:    v_mov_b32_e32 v6, s3
3293; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
3294; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
3295; GFX9-NEXT:    s_mov_b64 s[0:1], 0
3296; GFX9-NEXT:  .LBB29_1: ; %atomicrmw.start
3297; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
3298; GFX9-NEXT:    s_waitcnt vmcnt(0)
3299; GFX9-NEXT:    v_mov_b32_e32 v10, v4
3300; GFX9-NEXT:    v_mov_b32_e32 v9, v3
3301; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3302; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
3303; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
3304; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
3305; GFX9-NEXT:    s_waitcnt vmcnt(0)
3306; GFX9-NEXT:    buffer_wbinvl1
3307; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3308; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3309; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3310; GFX9-NEXT:    s_cbranch_execnz .LBB29_1
3311; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
3312; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
3313; GFX9-NEXT:    v_mov_b32_e32 v0, v3
3314; GFX9-NEXT:    v_mov_b32_e32 v1, v4
3315; GFX9-NEXT:    ; return to shader part epilog
3316;
3317; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
3318; GFX10:       ; %bb.0:
3319; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
3320; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
3321; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
3322; GFX10-NEXT:    s_mov_b64 s[0:1], 0
3323; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
3324; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3325; GFX10-NEXT:    s_waitcnt vmcnt(0)
3326; GFX10-NEXT:    v_mov_b32_e32 v10, v4
3327; GFX10-NEXT:    v_mov_b32_e32 v9, v3
3328; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3329; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
3330; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
3331; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
3332; GFX10-NEXT:    s_waitcnt vmcnt(0)
3333; GFX10-NEXT:    buffer_gl1_inv
3334; GFX10-NEXT:    buffer_gl0_inv
3335; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3336; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3337; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3338; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
3339; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3340; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
3341; GFX10-NEXT:    v_mov_b32_e32 v0, v3
3342; GFX10-NEXT:    v_mov_b32_e32 v1, v4
3343; GFX10-NEXT:    ; return to shader part epilog
3344;
3345; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128:
3346; GFX11:       ; %bb.0:
3347; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
3348; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
3349; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3350; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
3351; GFX11-NEXT:    s_mov_b64 s[0:1], 0
3352; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
3353; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
3354; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3355; GFX11-NEXT:    s_waitcnt vmcnt(0)
3356; GFX11-NEXT:    v_mov_b32_e32 v10, v4
3357; GFX11-NEXT:    v_mov_b32_e32 v9, v3
3358; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3359; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3360; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
3361; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
3362; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
3363; GFX11-NEXT:    s_waitcnt vmcnt(0)
3364; GFX11-NEXT:    buffer_gl1_inv
3365; GFX11-NEXT:    buffer_gl0_inv
3366; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3367; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3368; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3369; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3370; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
3371; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
3372; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
3373; GFX11-NEXT:    v_mov_b32_e32 v0, v3
3374; GFX11-NEXT:    v_mov_b32_e32 v1, v4
3375; GFX11-NEXT:    ; return to shader part epilog
3376;
3377; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128:
3378; GFX12:       ; %bb.0:
3379; GFX12-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
3380; GFX12-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
3381; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3382; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
3383; GFX12-NEXT:    s_mov_b64 s[0:1], 0
3384; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
3385; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3386; GFX12-NEXT:    s_wait_loadcnt 0x0
3387; GFX12-NEXT:    v_mov_b32_e32 v10, v4
3388; GFX12-NEXT:    v_mov_b32_e32 v9, v3
3389; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3390; GFX12-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3391; GFX12-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
3392; GFX12-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
3393; GFX12-NEXT:    global_wb scope:SCOPE_SYS
3394; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3395; GFX12-NEXT:    s_wait_loadcnt 0x0
3396; GFX12-NEXT:    global_inv scope:SCOPE_SYS
3397; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3398; GFX12-NEXT:    s_wait_alu 0xfffe
3399; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3400; GFX12-NEXT:    s_wait_alu 0xfffe
3401; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3402; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
3403; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3404; GFX12-NEXT:    s_or_b64 exec, exec, s[0:1]
3405; GFX12-NEXT:    v_mov_b32_e32 v0, v3
3406; GFX12-NEXT:    v_mov_b32_e32 v1, v4
3407; GFX12-NEXT:    ; return to shader part epilog
3408  %zext.offset = zext i32 %voffset to i64
3409  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3410  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3411  %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
3412  %cast.rtn = bitcast i64 %rtn to <2 x float>
3413  ret <2 x float> %cast.rtn
3414}
3415
; Non-returning i64 `atomicrmw umin` (seq_cst, no syncscope) on a global
; (addrspace(1)) pointer formed from the inreg base %sbase plus the
; zero-extended 32-bit %voffset. The old value is unused and the function
; returns void.
;
; For every llc run line at the top of the file, the check lines below expect
; the operation to be emitted as a compare-and-swap loop
; (%atomicrmw.start / %atomicrmw.end):
;   * one initial 64-bit global load in saddr form (VGPR offset + SGPR pair),
;   * a v_cmp_le_u64 / v_cndmask pair that selects the smaller of the loaded
;     value and the data operand,
;   * a 64-bit global_atomic_cmpswap that addresses memory through a full
;     64-bit VGPR pointer (`off`) built with an add / add-with-carry pair,
;   * a v_cmp_eq_u64 of the returned value against the expected value whose
;     result is accumulated into s[0:1] and removed from exec; the loop
;     branches back while exec is non-zero.
; The gfx1200 checks additionally expect global_wb / global_inv with
; scope:SCOPE_SYS around the cmpswap.
;
; The check lines are autogenerated (see the first line of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3416define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3417; GFX9-LABEL: global_umin_saddr_i64_nortn:
3418; GFX9:       ; %bb.0:
3419; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
3420; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3421; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
3422; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
3423; GFX9-NEXT:    s_mov_b64 s[0:1], 0
3424; GFX9-NEXT:  .LBB30_1: ; %atomicrmw.start
3425; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
3426; GFX9-NEXT:    s_waitcnt vmcnt(0)
3427; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3428; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
3429; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
3430; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
3431; GFX9-NEXT:    s_waitcnt vmcnt(0)
3432; GFX9-NEXT:    buffer_wbinvl1
3433; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3434; GFX9-NEXT:    v_mov_b32_e32 v6, v4
3435; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3436; GFX9-NEXT:    v_mov_b32_e32 v5, v3
3437; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3438; GFX9-NEXT:    s_cbranch_execnz .LBB30_1
3439; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
3440; GFX9-NEXT:    s_endpgm
3441;
3442; GFX10-LABEL: global_umin_saddr_i64_nortn:
3443; GFX10:       ; %bb.0:
3444; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
3445; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
3446; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
3447; GFX10-NEXT:    s_mov_b64 s[0:1], 0
3448; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
3449; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3450; GFX10-NEXT:    s_waitcnt vmcnt(0)
3451; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3452; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
3453; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
3454; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
3455; GFX10-NEXT:    s_waitcnt vmcnt(0)
3456; GFX10-NEXT:    buffer_gl1_inv
3457; GFX10-NEXT:    buffer_gl0_inv
3458; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3459; GFX10-NEXT:    v_mov_b32_e32 v6, v4
3460; GFX10-NEXT:    v_mov_b32_e32 v5, v3
3461; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3462; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3463; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
3464; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3465; GFX10-NEXT:    s_endpgm
3466;
3467; GFX11-LABEL: global_umin_saddr_i64_nortn:
3468; GFX11:       ; %bb.0:
3469; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
3470; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
3471; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3472; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
3473; GFX11-NEXT:    s_mov_b64 s[0:1], 0
3474; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
3475; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
3476; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3477; GFX11-NEXT:    s_waitcnt vmcnt(0)
3478; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3479; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
3480; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
3481; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
3482; GFX11-NEXT:    s_waitcnt vmcnt(0)
3483; GFX11-NEXT:    buffer_gl1_inv
3484; GFX11-NEXT:    buffer_gl0_inv
3485; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3486; GFX11-NEXT:    v_mov_b32_e32 v6, v4
3487; GFX11-NEXT:    v_mov_b32_e32 v5, v3
3488; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3489; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3490; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3491; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
3492; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
3493; GFX11-NEXT:    s_endpgm
3494;
3495; GFX12-LABEL: global_umin_saddr_i64_nortn:
3496; GFX12:       ; %bb.0:
3497; GFX12-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
3498; GFX12-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
3499; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3500; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
3501; GFX12-NEXT:    s_mov_b64 s[0:1], 0
3502; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
3503; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3504; GFX12-NEXT:    s_wait_loadcnt 0x0
3505; GFX12-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3506; GFX12-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
3507; GFX12-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
3508; GFX12-NEXT:    global_wb scope:SCOPE_SYS
3509; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3510; GFX12-NEXT:    s_wait_loadcnt 0x0
3511; GFX12-NEXT:    global_inv scope:SCOPE_SYS
3512; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3513; GFX12-NEXT:    v_mov_b32_e32 v6, v4
3514; GFX12-NEXT:    v_mov_b32_e32 v5, v3
3515; GFX12-NEXT:    s_wait_alu 0xfffe
3516; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3517; GFX12-NEXT:    s_wait_alu 0xfffe
3518; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3519; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
3520; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3521; GFX12-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset), computed as a byte (i8) GEP.
3522  %zext.offset = zext i32 %voffset to i64
3523  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; The old value produced by the atomicrmw is never read.
3524  %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
3525  ret void
3526}
3527
; Non-returning i64 `atomicrmw umin` (seq_cst, no syncscope) on a global
; (addrspace(1)) pointer formed from the inreg base %sbase, plus the
; zero-extended 32-bit %voffset, plus a constant byte offset of -128. The old
; value is unused and the function returns void.
;
; For every llc run line at the top of the file, the check lines below expect
; a compare-and-swap loop (%atomicrmw.start / %atomicrmw.end) in which the
; constant -128 appears as an immediate `offset:-128` on both memory
; instructions:
;   * the initial 64-bit global load, which stays in saddr form
;     (VGPR offset + SGPR pair), and
;   * the 64-bit global_atomic_cmpswap, whose VGPR pointer v[7:8] is built
;     from the SGPR base and v0 only (`off` addressing).
; The rest of the loop is a v_cmp_le_u64 / v_cndmask select of the smaller
; value, then a v_cmp_eq_u64 of the returned value against the expected value
; whose result is accumulated into s[0:1] and removed from exec; the loop
; branches back while exec is non-zero.
; The gfx1200 checks additionally expect global_wb / global_inv with
; scope:SCOPE_SYS around the cmpswap.
;
; The check lines are autogenerated (see the first line of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
3528define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3529; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
3530; GFX9:       ; %bb.0:
3531; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
3532; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3533; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
3534; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
3535; GFX9-NEXT:    s_mov_b64 s[0:1], 0
3536; GFX9-NEXT:  .LBB31_1: ; %atomicrmw.start
3537; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
3538; GFX9-NEXT:    s_waitcnt vmcnt(0)
3539; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3540; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
3541; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
3542; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
3543; GFX9-NEXT:    s_waitcnt vmcnt(0)
3544; GFX9-NEXT:    buffer_wbinvl1
3545; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3546; GFX9-NEXT:    v_mov_b32_e32 v6, v4
3547; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3548; GFX9-NEXT:    v_mov_b32_e32 v5, v3
3549; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3550; GFX9-NEXT:    s_cbranch_execnz .LBB31_1
3551; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
3552; GFX9-NEXT:    s_endpgm
3553;
3554; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
3555; GFX10:       ; %bb.0:
3556; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
3557; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
3558; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
3559; GFX10-NEXT:    s_mov_b64 s[0:1], 0
3560; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
3561; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3562; GFX10-NEXT:    s_waitcnt vmcnt(0)
3563; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3564; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
3565; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
3566; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
3567; GFX10-NEXT:    s_waitcnt vmcnt(0)
3568; GFX10-NEXT:    buffer_gl1_inv
3569; GFX10-NEXT:    buffer_gl0_inv
3570; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3571; GFX10-NEXT:    v_mov_b32_e32 v6, v4
3572; GFX10-NEXT:    v_mov_b32_e32 v5, v3
3573; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3574; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
3575; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
3576; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
3577; GFX10-NEXT:    s_endpgm
3578;
3579; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128:
3580; GFX11:       ; %bb.0:
3581; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
3582; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
3583; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3584; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
3585; GFX11-NEXT:    s_mov_b64 s[0:1], 0
3586; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
3587; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
3588; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3589; GFX11-NEXT:    s_waitcnt vmcnt(0)
3590; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3591; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
3592; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
3593; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
3594; GFX11-NEXT:    s_waitcnt vmcnt(0)
3595; GFX11-NEXT:    buffer_gl1_inv
3596; GFX11-NEXT:    buffer_gl0_inv
3597; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3598; GFX11-NEXT:    v_mov_b32_e32 v6, v4
3599; GFX11-NEXT:    v_mov_b32_e32 v5, v3
3600; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3601; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3602; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3603; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
3604; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
3605; GFX11-NEXT:    s_endpgm
3606;
3607; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128:
3608; GFX12:       ; %bb.0:
3609; GFX12-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
3610; GFX12-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
3611; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3612; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
3613; GFX12-NEXT:    s_mov_b64 s[0:1], 0
3614; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
3615; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3616; GFX12-NEXT:    s_wait_loadcnt 0x0
3617; GFX12-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3618; GFX12-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
3619; GFX12-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
3620; GFX12-NEXT:    global_wb scope:SCOPE_SYS
3621; GFX12-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3622; GFX12-NEXT:    s_wait_loadcnt 0x0
3623; GFX12-NEXT:    global_inv scope:SCOPE_SYS
3624; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3625; GFX12-NEXT:    v_mov_b32_e32 v6, v4
3626; GFX12-NEXT:    v_mov_b32_e32 v5, v3
3627; GFX12-NEXT:    s_wait_alu 0xfffe
3628; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
3629; GFX12-NEXT:    s_wait_alu 0xfffe
3630; GFX12-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
3631; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
3632; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
3633; GFX12-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset) - 128, built from two byte (i8) GEPs so
  ; that the variable part and the constant part stay separate in the IR.
3634  %zext.offset = zext i32 %voffset to i64
3635  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3636  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; The old value produced by the atomicrmw is never read.
3637  %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
3638  ret void
3639}
3640
; Attribute group #0: argmemonly, nounwind, willreturn.
; NOTE(review): the two functions defined immediately above do not reference
; #0 -- confirm whether anything else in this file still uses this group.
3641attributes #0 = { argmemonly nounwind willreturn }
3642