; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll (revision 5feb32ba929f9e517c530217cabb09d1d734a763)
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN:  opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR %s

; Unused-result add at a small constant offset: the optimizer scales the
; operand by popcount(exec) and only the first active lane does the atomic.
define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; Same single-lane add pattern, but with the maximum negative element offset.
define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_max_neg_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 -1024
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; Single-lane add with an offset too large for an immediate (9000 elements).
define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_soffset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 9000
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; Single-lane add with a huge 64-bit element offset.
define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_huge_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 47224239175595
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595

  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; Result IS used: each lane rebuilds its value as
; readfirstlane(old) + in * mbcnt (prefix of active lanes below it).
define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_add_i32_ret_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}

; Variable base index plus constant offset; result unused, single-lane add.
define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; Variable base index plus constant offset, used result: lane-local value is
; rebuilt from readfirstlane(old) + in * mbcnt and stored to %out2.
define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_ret_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}

; Plain add straight to %out, result unused: single scaled atomic by lane 0.
define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
  ret void
}

; Plain add with used result: per-lane value reconstructed after the scan.
define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_add_i32_ret(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}

; Add through a variable-indexed pointer; result unused, single-lane atomic.
define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
}

; Variable-indexed add with used result; reconstruction mirrors the other
; ret variants (readfirstlane + in * mbcnt).
define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_ret_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}

; 'and' needs no lane-count scaling (no ctpop/mul); only the first active
; lane performs the operation with the unmodified operand.
define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_and_i32_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; Used-result 'and': the select feeds -1 (the and identity) to the first
; lane so its rebuilt value is the old memory contents unchanged.
define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_and_i32_ret_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}

; 'and' through variable index plus constant offset; single-lane, unscaled.
define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; Used-result 'and' with variable index plus offset; per-lane rebuild uses
; select(-1 identity for the first lane) and'ed with the broadcast old value.
define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_ret_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}

; Plain 'and' on %out, result unused: first active lane only, operand unscaled.
define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_and_i32(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
  ret void
}

; Plain 'and' with used result; rebuilt via select(-1 identity) + and.
define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_and_i32_ret(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}

; 'and' through a variable-indexed pointer; result unused, single-lane atomic.
define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
}

; Variable-indexed 'and' with used result; same select/and reconstruction.
define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_ret_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}

; 'sub' scales like 'add' (in * popcount(exec)); result unused here.
define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_sub_i32_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; Used-result 'sub': per-lane value is readfirstlane(old) - in * mbcnt.
define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_sub_i32_ret_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}

; 'sub' through variable index plus constant offset; result unused,
; single scaled atomic by the first active lane.
define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_sub_i32_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}

; Returned-value sub with addr64 + offset: in addition to the lane-0 scaled
; atomic, each lane's result is rebuilt as readfirstlane(old) - %in * lane_index
; (lane_index = mbcnt result), then stored to %out2.
631define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
632; IR-LABEL: @atomic_sub_i32_ret_addr64_offset(
633; IR-NEXT:  entry:
634; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
635; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
636; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
637; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
638; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
639; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
640; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
641; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
642; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
643; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
644; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
645; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
646; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
647; IR:       10:
648; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
649; IR-NEXT:    br label [[TMP12]]
650; IR:       12:
651; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
652; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
653; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
654; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
655; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
656; IR-NEXT:    ret void
657;
658entry:
659  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
660  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
661  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
662  store i32 %val, ptr addrspace(1) %out2
663  ret void
664}
665
; Plain sub, direct pointer, no return value: ctpop-scaled operand, lane-0-only
; atomicrmw sub.
666define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
667; IR-LABEL: @atomic_sub_i32(
668; IR-NEXT:  entry:
669; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
670; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
671; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
672; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
673; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
674; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
675; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
676; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
677; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
678; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
679; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
680; IR:       10:
681; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
682; IR-NEXT:    br label [[TMP12]]
683; IR:       12:
684; IR-NEXT:    ret void
685;
686entry:
687  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
688  ret void
689}
690
; Returned-value sub, direct pointer: lane-0 scaled atomic, then per-lane
; result = readfirstlane(old) - %in * lane_index, stored to %out2.
691define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
692; IR-LABEL: @atomic_sub_i32_ret(
693; IR-NEXT:  entry:
694; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
695; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
696; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
697; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
698; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
699; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
700; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
701; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
702; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
703; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
704; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
705; IR:       10:
706; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
707; IR-NEXT:    br label [[TMP12]]
708; IR:       12:
709; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
710; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
711; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
712; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
713; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
714; IR-NEXT:    ret void
715;
716entry:
717  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
718  store i32 %val, ptr addrspace(1) %out2
719  ret void
720}
721
; No-return-value sub through a variable-index GEP (addr64): same ballot /
; mbcnt / ctpop scan pattern, atomic issued from lane 0 only.
722define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
723; IR-LABEL: @atomic_sub_i32_addr64(
724; IR-NEXT:  entry:
725; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
726; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
727; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
728; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
729; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
730; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
731; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
732; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
733; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
734; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
735; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
736; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
737; IR:       10:
738; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
739; IR-NEXT:    br label [[TMP12]]
740; IR:       12:
741; IR-NEXT:    ret void
742;
743entry:
744  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
745  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
746  ret void
747}
748
; Returned-value sub through a variable-index GEP: lane-0 scaled atomic plus
; readfirstlane-based per-lane result reconstruction stored to %out2.
749define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
750; IR-LABEL: @atomic_sub_i32_ret_addr64(
751; IR-NEXT:  entry:
752; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
753; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
754; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
755; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
756; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
757; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
758; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
759; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
760; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
761; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
762; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
763; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
764; IR:       10:
765; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
766; IR-NEXT:    br label [[TMP12]]
767; IR:       12:
768; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
769; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
770; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
771; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
772; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
773; IR-NEXT:    ret void
774;
775entry:
776  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
777  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
778  store i32 %val, ptr addrspace(1) %out2
779  ret void
780}
781
; Signed max, no return value: unlike add/sub there is no ctpop scaling — the
; optimizer only restricts the atomicrmw to the first active lane (mbcnt == 0).
; NOTE(review): this max test uses the default scope, while the other max tests
; below use syncscope("workgroup").
782define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) {
783; IR-LABEL: @atomic_max_i32_offset(
784; IR-NEXT:  entry:
785; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
786; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
787; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
788; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
789; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
790; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
791; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
792; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
793; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
794; IR:       7:
795; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
796; IR-NEXT:    br label [[TMP9]]
797; IR:       9:
798; IR-NEXT:    ret void
799;
800entry:
801  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
802  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
803  ret void
804}
805
; Returned-value signed max: each lane recomputes its result from the
; readfirstlane'd old value — lane 0 compares against the smax identity
; (INT_MIN = -2147483648), other lanes against %in, via icmp sgt + select.
806define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
807; IR-LABEL: @atomic_max_i32_ret_offset(
808; IR-NEXT:  entry:
809; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
810; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
811; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
812; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
813; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
814; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
815; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
816; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
817; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
818; IR:       7:
819; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
820; IR-NEXT:    br label [[TMP9]]
821; IR:       9:
822; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
823; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
824; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
825; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
826; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
827; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
828; IR-NEXT:    ret void
829;
830entry:
831  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
832  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
833  store i32 %val, ptr addrspace(1) %out2
834  ret void
835}
836
; Signed max with addr64 + offset, no return value: single-lane atomicrmw
; guarded by mbcnt == 0; operand %in is passed through unscaled.
837define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
838; IR-LABEL: @atomic_max_i32_addr64_offset(
839; IR-NEXT:  entry:
840; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
841; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
842; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
843; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
844; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
845; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
846; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
847; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
848; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
849; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
850; IR:       7:
851; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
852; IR-NEXT:    br label [[TMP9]]
853; IR:       9:
854; IR-NEXT:    ret void
855;
856entry:
857  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
858  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
859  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
860  ret void
861}
862
; Returned-value signed max with addr64 + offset: lane-0 atomic, then per-lane
; smax recomputation (identity INT_MIN on lane 0) against the broadcast old value.
863define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
864; IR-LABEL: @atomic_max_i32_ret_addr64_offset(
865; IR-NEXT:  entry:
866; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
867; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
868; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
869; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
870; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
871; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
872; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
873; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
874; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
875; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
876; IR:       7:
877; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
878; IR-NEXT:    br label [[TMP9]]
879; IR:       9:
880; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
881; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
882; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
883; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
884; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
885; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
886; IR-NEXT:    ret void
887;
888entry:
889  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
890  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
891  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
892  store i32 %val, ptr addrspace(1) %out2
893  ret void
894}
895
; Signed max, direct pointer, no return value: mbcnt == 0 guard, single
; atomicrmw max from the first active lane.
896define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
897; IR-LABEL: @atomic_max_i32(
898; IR-NEXT:  entry:
899; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
900; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
901; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
902; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
903; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
904; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
905; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
906; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
907; IR:       7:
908; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
909; IR-NEXT:    br label [[TMP9]]
910; IR:       9:
911; IR-NEXT:    ret void
912;
913entry:
914  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
915  ret void
916}
917
; Returned-value signed max, direct pointer: lane-0 atomic plus per-lane smax
; reconstruction (INT_MIN identity on lane 0) stored to %out2.
918define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
919; IR-LABEL: @atomic_max_i32_ret(
920; IR-NEXT:  entry:
921; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
922; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
923; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
924; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
925; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
926; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
927; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
928; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
929; IR:       7:
930; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
931; IR-NEXT:    br label [[TMP9]]
932; IR:       9:
933; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
934; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
935; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
936; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
937; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
938; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
939; IR-NEXT:    ret void
940;
941entry:
942  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
943  store i32 %val, ptr addrspace(1) %out2
944  ret void
945}
946
; Signed max through a variable-index GEP, no return value: mbcnt == 0 guard,
; single lane-0 atomicrmw max.
947define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
948; IR-LABEL: @atomic_max_i32_addr64(
949; IR-NEXT:  entry:
950; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
951; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
952; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
953; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
954; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
955; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
956; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
957; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
958; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
959; IR:       7:
960; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
961; IR-NEXT:    br label [[TMP9]]
962; IR:       9:
963; IR-NEXT:    ret void
964;
965entry:
966  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
967  %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
968  ret void
969}
970
; Returned-value signed max through a variable-index GEP: lane-0 atomic plus
; per-lane smax reconstruction against the readfirstlane'd old value.
971define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
972; IR-LABEL: @atomic_max_i32_ret_addr64(
973; IR-NEXT:  entry:
974; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
975; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
976; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
977; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
978; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
979; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
980; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
981; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
982; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
983; IR:       7:
984; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
985; IR-NEXT:    br label [[TMP9]]
986; IR:       9:
987; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
988; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
989; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
990; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
991; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
992; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
993; IR-NEXT:    ret void
994;
995entry:
996  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
997  %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
998  store i32 %val, ptr addrspace(1) %out2
999  ret void
1000}
1001
; Unsigned max with immediate offset, no return value: mbcnt == 0 guard,
; single lane-0 atomicrmw umax; operand is not scaled (no ctpop for min/max).
1002define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) {
1003; IR-LABEL: @atomic_umax_i32_offset(
1004; IR-NEXT:  entry:
1005; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
1006; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1007; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1008; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1009; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1010; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1011; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1012; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1013; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1014; IR:       7:
1015; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1016; IR-NEXT:    br label [[TMP9]]
1017; IR:       9:
1018; IR-NEXT:    ret void
1019;
1020entry:
1021  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1022  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1023  ret void
1024}
1025
; Returned-value unsigned max: per-lane result is umax(readfirstlane(old),
; select(lane0, 0, %in)) — 0 is the umax identity — via icmp ugt + select.
1026define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1027; IR-LABEL: @atomic_umax_i32_ret_offset(
1028; IR-NEXT:  entry:
1029; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
1030; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1031; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1032; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1033; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1034; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1035; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1036; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1037; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1038; IR:       7:
1039; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1040; IR-NEXT:    br label [[TMP9]]
1041; IR:       9:
1042; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1043; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
1044; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
1045; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
1046; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1047; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
1048; IR-NEXT:    ret void
1049;
1050entry:
1051  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1052  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1053  store i32 %val, ptr addrspace(1) %out2
1054  ret void
1055}
1056
; Unsigned max with addr64 + offset, no return value: single-lane atomicrmw
; umax guarded by mbcnt == 0.
1057define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
1058; IR-LABEL: @atomic_umax_i32_addr64_offset(
1059; IR-NEXT:  entry:
1060; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1061; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
1062; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1063; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1064; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1065; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1066; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1067; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1068; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1069; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1070; IR:       7:
1071; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1072; IR-NEXT:    br label [[TMP9]]
1073; IR:       9:
1074; IR-NEXT:    ret void
1075;
1076entry:
1077  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1078  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
1079  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1080  ret void
1081}
1082
; Returned-value unsigned max with addr64 + offset: lane-0 atomic, then per-lane
; umax reconstruction (identity 0 on lane 0) stored to %out2.
1083define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
1084; IR-LABEL: @atomic_umax_i32_ret_addr64_offset(
1085; IR-NEXT:  entry:
1086; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1087; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
1088; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1089; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1090; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1091; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1092; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1093; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1094; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1095; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1096; IR:       7:
1097; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1098; IR-NEXT:    br label [[TMP9]]
1099; IR:       9:
1100; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1101; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
1102; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
1103; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
1104; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1105; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
1106; IR-NEXT:    ret void
1107;
1108entry:
1109  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1110  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
1111  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1112  store i32 %val, ptr addrspace(1) %out2
1113  ret void
1114}
1115
; Unsigned max, direct pointer, no return value: mbcnt == 0 guard, single
; lane-0 atomicrmw umax.
1116define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
1117; IR-LABEL: @atomic_umax_i32(
1118; IR-NEXT:  entry:
1119; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1120; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1121; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1122; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1123; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1124; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1125; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1126; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1127; IR:       7:
1128; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1129; IR-NEXT:    br label [[TMP9]]
1130; IR:       9:
1131; IR-NEXT:    ret void
1132;
1133entry:
1134  %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
1135  ret void
1136}
1137
; Returned-value unsigned max, direct pointer: lane-0 atomic plus per-lane umax
; reconstruction (identity 0 on lane 0) stored to %out2.
1138define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1139; IR-LABEL: @atomic_umax_i32_ret(
1140; IR-NEXT:  entry:
1141; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1142; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1143; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1144; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1145; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1146; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1147; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1148; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1149; IR:       7:
1150; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1151; IR-NEXT:    br label [[TMP9]]
1152; IR:       9:
1153; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1154; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
1155; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
1156; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
1157; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1158; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
1159; IR-NEXT:    ret void
1160;
1161entry:
1162  %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
1163  store i32 %val, ptr addrspace(1) %out2
1164  ret void
1165}
1166
; Unsigned max through a variable-index GEP, no return value: mbcnt == 0 guard,
; single lane-0 atomicrmw umax.
1167define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
1168; IR-LABEL: @atomic_umax_i32_addr64(
1169; IR-NEXT:  entry:
1170; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1171; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1172; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1173; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1174; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1175; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1176; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1177; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1178; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1179; IR:       7:
1180; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1181; IR-NEXT:    br label [[TMP9]]
1182; IR:       9:
1183; IR-NEXT:    ret void
1184;
1185entry:
1186  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1187  %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
1188  ret void
1189}
1190
; Returned-value unsigned max through a variable-index GEP: lane-0 atomic plus
; per-lane umax reconstruction against the readfirstlane'd old value.
1191define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
1192; IR-LABEL: @atomic_umax_i32_ret_addr64(
1193; IR-NEXT:  entry:
1194; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1195; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1196; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1197; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1198; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1199; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1200; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1201; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1202; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1203; IR:       7:
1204; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1205; IR-NEXT:    br label [[TMP9]]
1206; IR:       9:
1207; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1208; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
1209; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
1210; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
1211; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1212; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
1213; IR-NEXT:    ret void
1214;
1215entry:
1216  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1217  %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
1218  store i32 %val, ptr addrspace(1) %out2
1219  ret void
1220}
1221
; signed min at a constant element offset, result unused: the atomic is
; gated behind the lane-0 check; no reconstruction since %val is dead.
define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_min_i32_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}
1245
; signed min with the returned value used: lane 0 performs the atomic; each
; lane rebuilds its old value as the signed min (icmp slt + select) of
; readfirstlane(atomic result) and min's identity INT32_MAX (2147483647) for
; lane 0 vs %in for the other lanes.
define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_min_i32_ret_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
1276
; signed min at index + constant offset, result unused: both GEPs stay ahead
; of the lane-0-gated atomic; no reconstruction since %val is dead.
define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_min_i32_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}
1302
; signed min at index + constant offset with the result used: lane 0 performs
; the atomic; each lane rebuilds its old value via readfirstlane plus a signed
; min (icmp slt + select) against INT32_MAX-for-lane-0 / %in-for-others.
define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_min_i32_ret_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
1335
; signed min directly on %out, result unused: lane-0-gated atomic only.
define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_min_i32(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
  ret void
}
1357
; signed min on %out with the result used: lane 0 performs the atomic; each
; lane rebuilds its old value via readfirstlane plus a signed min against
; INT32_MAX-for-lane-0 / %in-for-others, then stores it to %out2.
define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_min_i32_ret(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
1386
; signed min at a computed address, result unused: lane-0-gated atomic only.
define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_min_i32_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
  ret void
}
1410
; signed min at a computed address with the result used: lane 0 performs the
; atomic; each lane rebuilds its old value via readfirstlane plus a signed
; min against INT32_MAX-for-lane-0 / %in-for-others, then stores it to %out2.
define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_min_i32_ret_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
1441