1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 2; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR %s 3 4define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { 5; IR-LABEL: @atomic_add_i32_offset( 6; IR-NEXT: entry: 7; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 8; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 9; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 10; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 11; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 12; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 13; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 14; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 15; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 16; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 17; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 18; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 19; IR: 10: 20; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 21; IR-NEXT: br label [[TMP12]] 22; IR: 12: 23; IR-NEXT: ret void 24; 25entry: 26 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 27 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst 28 ret void 29} 30 31define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { 32; IR-LABEL: @atomic_add_i32_max_neg_offset( 33; IR-NEXT: entry: 34; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 -1024 35; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 36; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 37; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 38; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 39; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 40; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 41; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 42; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 43; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 44; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 45; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 46; IR: 10: 47; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 48; IR-NEXT: br label [[TMP12]] 49; IR: 12: 50; IR-NEXT: ret void 51; 52entry: 53 %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024 54 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst 55 ret void 56} 57 58define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { 59; IR-LABEL: @atomic_add_i32_soffset( 60; IR-NEXT: entry: 61; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 9000 62; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 63; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 64; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 65; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 66; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 67; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 68; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 69; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 70; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 71; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 72; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 73; IR: 10: 74; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 75; IR-NEXT: br label [[TMP12]] 76; IR: 12: 77; IR-NEXT: ret void 78; 79entry: 80 %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000 81 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst 82 ret void 83} 84 85define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { 86; IR-LABEL: @atomic_add_i32_huge_offset( 87; IR-NEXT: entry: 88; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 47224239175595 89; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 90; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 91; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 92; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 93; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 94; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 95; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 96; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 97; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 98; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 99; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 100; IR: 10: 101; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 102; IR-NEXT: br label [[TMP12]] 103; IR: 12: 104; IR-NEXT: ret void 105; 106entry: 107 %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595 108 109 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst 110 ret void 111} 112 113define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 114; IR-LABEL: @atomic_add_i32_ret_offset( 115; IR-NEXT: entry: 116; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 117; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 118; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 119; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 120; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 121; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 122; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 123; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 124; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 125; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 126; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 127; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 128; IR: 10: 129; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 130; IR-NEXT: br label [[TMP12]] 131; IR: 12: 132; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] 133; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) 134; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] 135; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] 136; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 137; IR-NEXT: ret void 138; 139entry: 140 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 141 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst 142 store i32 %val, ptr addrspace(1) %out2 143 ret void 144} 145 146define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { 147; IR-LABEL: @atomic_add_i32_addr64_offset( 148; IR-NEXT: entry: 149; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 150; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 151; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 152; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 153; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 154; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 155; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 156; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 157; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 158; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 159; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 160; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 161; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 162; IR: 10: 163; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 164; IR-NEXT: br label [[TMP12]] 165; IR: 12: 166; IR-NEXT: ret void 167; 168entry: 169 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 170 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 171 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst 172 ret void 173} 174 175define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 176; IR-LABEL: @atomic_add_i32_ret_addr64_offset( 177; IR-NEXT: entry: 178; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 179; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 180; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 181; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 182; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 183; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 184; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 185; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 186; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 187; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 188; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 189; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 190; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 191; IR: 10: 192; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 193; IR-NEXT: br label [[TMP12]] 194; IR: 12: 195; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] 196; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) 197; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] 198; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] 199; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 200; IR-NEXT: ret void 201; 202entry: 203 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 204 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 205 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst 206 store i32 %val, ptr addrspace(1) %out2 207 ret void 208} 209 210define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { 211; IR-LABEL: @atomic_add_i32( 212; IR-NEXT: entry: 213; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 214; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 215; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 216; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 217; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 218; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 219; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 220; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 221; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 222; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 223; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 224; IR: 10: 225; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 226; IR-NEXT: br label [[TMP12]] 227; IR: 12: 228; IR-NEXT: ret void 229; 230entry: 231 %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst 232 ret void 233} 234 235define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 236; IR-LABEL: @atomic_add_i32_ret( 237; IR-NEXT: entry: 238; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 239; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 240; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 241; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 242; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 243; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 244; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 245; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 246; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 247; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 248; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 249; IR: 10: 250; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 251; IR-NEXT: br label [[TMP12]] 252; IR: 12: 253; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] 254; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) 255; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] 256; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] 257; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 258; IR-NEXT: ret void 259; 260entry: 261 %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst 262 store i32 %val, ptr addrspace(1) %out2 263 ret void 264} 265 266define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { 267; IR-LABEL: @atomic_add_i32_addr64( 268; IR-NEXT: entry: 269; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 270; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 271; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 272; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 273; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 274; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 275; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 276; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 277; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 278; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 279; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 280; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 281; IR: 10: 282; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 283; IR-NEXT: br label [[TMP12]] 284; IR: 12: 285; IR-NEXT: ret void 286; 287entry: 288 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 289 %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst 290 ret void 291} 292 293define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 294; IR-LABEL: @atomic_add_i32_ret_addr64( 295; IR-NEXT: entry: 296; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 297; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 298; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 299; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 300; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 301; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 302; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 303; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 304; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 305; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 306; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 307; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 308; IR: 10: 309; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 310; IR-NEXT: br label [[TMP12]] 311; IR: 12: 312; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] 313; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) 314; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] 315; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] 316; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 317; IR-NEXT: ret void 318; 319entry: 320 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 321 %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst 322 store i32 %val, ptr addrspace(1) %out2 323 ret void 324} 325 326define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) { 327; IR-LABEL: @atomic_and_i32_offset( 328; IR-NEXT: entry: 329; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 330; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 331; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 332; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 333; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 334; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 335; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 336; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 337; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 338; IR: 7: 339; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 340; IR-NEXT: br label [[TMP9]] 341; IR: 9: 342; IR-NEXT: ret void 343; 344entry: 345 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 346 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst 347 ret void 348} 349 350define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 351; IR-LABEL: @atomic_and_i32_ret_offset( 352; IR-NEXT: entry: 353; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 354; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 355; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 356; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 357; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 358; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 359; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 360; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 361; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 362; IR: 7: 363; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 364; IR-NEXT: br label [[TMP9]] 365; IR: 9: 366; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 367; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 368; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] 369; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] 370; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 371; IR-NEXT: ret void 372; 373entry: 374 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 375 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst 376 store i32 %val, ptr addrspace(1) %out2 377 ret void 378} 379 380define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { 381; IR-LABEL: @atomic_and_i32_addr64_offset( 382; IR-NEXT: entry: 383; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 384; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 385; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 386; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 387; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 388; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 389; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 390; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 391; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 392; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 393; IR: 7: 394; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 395; IR-NEXT: br label [[TMP9]] 396; IR: 9: 397; IR-NEXT: ret void 398; 399entry: 400 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 401 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 402 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst 403 ret void 404} 405 406define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 407; IR-LABEL: @atomic_and_i32_ret_addr64_offset( 408; IR-NEXT: entry: 409; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 410; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 411; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 412; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 413; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 414; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 415; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 416; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 417; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 418; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 419; IR: 7: 420; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 421; IR-NEXT: br label [[TMP9]] 422; IR: 9: 423; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 424; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 425; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] 426; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] 427; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 428; IR-NEXT: ret void 429; 430entry: 431 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 432 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 433 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst 434 store i32 %val, ptr addrspace(1) %out2 435 ret void 436} 437 438define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { 439; IR-LABEL: @atomic_and_i32( 440; IR-NEXT: entry: 441; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 442; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 443; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 444; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 445; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 446; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 447; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 448; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 449; IR: 7: 450; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4 451; IR-NEXT: br label [[TMP9]] 452; IR: 9: 453; IR-NEXT: ret void 454; 455entry: 456 %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst 457 ret void 458} 459 460define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 461; IR-LABEL: @atomic_and_i32_ret( 462; IR-NEXT: entry: 463; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 464; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 465; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 466; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 467; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 468; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 469; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 470; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 471; IR: 7: 472; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4 473; IR-NEXT: br label [[TMP9]] 474; IR: 9: 475; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 476; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 477; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] 478; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] 479; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 480; IR-NEXT: ret void 481; 482entry: 483 %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst 484 store i32 %val, ptr addrspace(1) %out2 485 ret void 486} 487 488define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { 489; IR-LABEL: @atomic_and_i32_addr64( 490; IR-NEXT: entry: 491; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 492; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 493; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 494; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 495; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 496; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 497; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 498; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 499; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 500; IR: 7: 501; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4 502; IR-NEXT: br label [[TMP9]] 503; IR: 9: 504; IR-NEXT: ret void 505; 506entry: 507 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 508 %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst 509 ret void 510} 511 512define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 513; IR-LABEL: @atomic_and_i32_ret_addr64( 514; IR-NEXT: entry: 515; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 516; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 517; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 518; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 519; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 520; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 521; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 522; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 523; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 524; IR: 7: 525; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4 526; IR-NEXT: br label [[TMP9]] 527; IR: 9: 528; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 529; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 530; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] 531; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] 532; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 533; IR-NEXT: ret void 534; 535entry: 536 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 537 %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst 538 store i32 %val, ptr addrspace(1) %out2 539 ret void 540} 541 542define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { 543; IR-LABEL: @atomic_sub_i32_offset( 544; IR-NEXT: entry: 545; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 546; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 547; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 548; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 549; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 550; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 551; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 552; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 553; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 554; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 555; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 556; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 557; IR: 10: 558; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 559; IR-NEXT: br label [[TMP12]] 560; IR: 12: 561; IR-NEXT: ret void 562; 563entry: 564 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 565 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst 566 ret void 567} 568 569define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 570; IR-LABEL: @atomic_sub_i32_ret_offset( 571; IR-NEXT: entry: 572; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 573; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 574; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 575; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 576; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 577; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 578; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 579; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 580; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 581; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 582; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 583; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 584; IR: 10: 585; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 586; IR-NEXT: br label [[TMP12]] 587; IR: 12: 588; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] 589; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) 590; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] 591; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] 592; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 593; IR-NEXT: ret void 594; 595entry: 596 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 597 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst 598 store i32 %val, ptr addrspace(1) %out2 599 ret void 600} 601 602define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { 603; IR-LABEL: @atomic_sub_i32_addr64_offset( 604; IR-NEXT: entry: 605; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 606; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 607; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 608; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 609; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 610; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 611; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 612; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 613; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 614; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 615; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 616; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 617; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 618; IR: 10: 619; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 620; IR-NEXT: br label [[TMP12]] 621; IR: 12: 622; IR-NEXT: ret void 623; 624entry: 625 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 626 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 627 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst 628 ret void 629} 630 631define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 632; IR-LABEL: @atomic_sub_i32_ret_addr64_offset( 633; IR-NEXT: entry: 634; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 635; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 636; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 637; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 638; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 639; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 640; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 641; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 642; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 643; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 644; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 645; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 646; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 647; IR: 10: 648; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4 649; IR-NEXT: br label [[TMP12]] 650; IR: 12: 651; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] 652; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) 653; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] 654; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] 655; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 656; IR-NEXT: ret void 657; 658entry: 659 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 660 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 661 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst 662 store i32 %val, ptr addrspace(1) %out2 663 ret void 664} 665 666define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { 667; IR-LABEL: @atomic_sub_i32( 668; IR-NEXT: entry: 669; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 670; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 671; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 672; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 673; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 674; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 675; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 676; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 677; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 678; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 679; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 680; IR: 10: 681; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 682; IR-NEXT: br label [[TMP12]] 683; IR: 12: 684; IR-NEXT: ret void 685; 686entry: 687 %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst 688 ret void 689} 690 691define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 692; IR-LABEL: @atomic_sub_i32_ret( 693; IR-NEXT: entry: 694; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 695; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 696; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 697; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 698; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 699; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 700; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 701; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 702; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 703; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 704; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 705; IR: 10: 706; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 707; IR-NEXT: br label [[TMP12]] 708; IR: 12: 709; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] 710; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) 711; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] 712; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] 713; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 714; IR-NEXT: ret void 715; 716entry: 717 %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst 718 store i32 %val, ptr addrspace(1) %out2 719 ret void 720} 721 722define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { 723; IR-LABEL: @atomic_sub_i32_addr64( 724; IR-NEXT: entry: 725; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 726; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 727; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 728; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 729; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 730; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 731; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 732; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 733; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 734; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 735; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 736; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 737; IR: 10: 738; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 739; IR-NEXT: br label [[TMP12]] 740; IR: 12: 741; IR-NEXT: ret void 742; 743entry: 744 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 745 %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst 746 ret void 747} 748 749define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 750; IR-LABEL: @atomic_sub_i32_ret_addr64( 751; IR-NEXT: entry: 752; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 753; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 754; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 755; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 756; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 757; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 758; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 759; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) 760; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 761; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] 762; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 763; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] 764; IR: 10: 765; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4 766; IR-NEXT: br label [[TMP12]] 767; IR: 12: 768; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] 769; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) 770; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] 771; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] 772; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 773; IR-NEXT: ret void 774; 775entry: 776 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 777 %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst 778 store i32 %val, ptr addrspace(1) %out2 779 ret void 780} 781 782define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { 783; IR-LABEL: @atomic_max_i32_offset( 784; IR-NEXT: entry: 785; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 786; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 787; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 788; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 789; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 790; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 791; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 792; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 793; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 794; IR: 7: 795; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 796; IR-NEXT: br label [[TMP9]] 797; IR: 9: 798; IR-NEXT: ret void 799; 800entry: 801 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 802 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst 803 ret void 804} 805 806define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 807; IR-LABEL: @atomic_max_i32_ret_offset( 808; IR-NEXT: entry: 809; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 810; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 811; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 812; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 813; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 814; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 815; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 816; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 817; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 818; IR: 7: 819; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 820; IR-NEXT: br label [[TMP9]] 821; IR: 9: 822; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 823; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 824; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] 825; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] 826; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 827; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 828; IR-NEXT: ret void 829; 830entry: 831 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 832 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 833 store i32 %val, ptr addrspace(1) %out2 834 ret void 835} 836 837define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { 838; IR-LABEL: @atomic_max_i32_addr64_offset( 839; IR-NEXT: entry: 840; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 841; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 842; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 843; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 844; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 845; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 846; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 847; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 848; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 849; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 850; IR: 7: 851; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 852; IR-NEXT: br label [[TMP9]] 853; IR: 9: 854; IR-NEXT: ret void 855; 856entry: 857 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 858 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 859 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 860 ret void 861} 862 863define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 864; IR-LABEL: @atomic_max_i32_ret_addr64_offset( 865; IR-NEXT: entry: 866; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 867; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 868; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 869; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 870; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 871; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 872; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 873; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 874; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 875; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 876; IR: 7: 877; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 878; IR-NEXT: br label [[TMP9]] 879; IR: 9: 880; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 881; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 882; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] 883; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] 884; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 885; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 886; IR-NEXT: ret void 887; 888entry: 889 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 890 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 891 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 892 store i32 %val, ptr addrspace(1) %out2 893 ret void 894} 895 896define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { 897; IR-LABEL: @atomic_max_i32( 898; IR-NEXT: entry: 899; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 900; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 901; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 902; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 903; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 904; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 905; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 906; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 907; IR: 7: 908; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 909; IR-NEXT: br label [[TMP9]] 910; IR: 9: 911; IR-NEXT: ret void 912; 913entry: 914 %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst 915 ret void 916} 917 918define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 919; IR-LABEL: @atomic_max_i32_ret( 920; IR-NEXT: entry: 921; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 922; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 923; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 924; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 925; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 926; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 927; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 928; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 929; IR: 7: 930; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 931; IR-NEXT: br label [[TMP9]] 932; IR: 9: 933; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 934; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 935; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] 936; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] 937; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 938; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 939; IR-NEXT: ret void 940; 941entry: 942 %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst 943 store i32 %val, ptr addrspace(1) %out2 944 ret void 945} 946 947define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { 948; IR-LABEL: @atomic_max_i32_addr64( 949; IR-NEXT: entry: 950; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 951; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 952; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 953; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 954; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 955; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 956; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 957; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 958; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 959; IR: 7: 960; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 961; IR-NEXT: br label [[TMP9]] 962; IR: 9: 963; IR-NEXT: ret void 964; 965entry: 966 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 967 %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst 968 ret void 969} 970 971define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 972; IR-LABEL: @atomic_max_i32_ret_addr64( 973; IR-NEXT: entry: 974; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 975; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 976; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 977; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 978; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 979; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 980; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 981; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 982; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 983; IR: 7: 984; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 985; IR-NEXT: br label [[TMP9]] 986; IR: 9: 987; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 988; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 989; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] 990; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] 991; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 992; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 993; IR-NEXT: ret void 994; 995entry: 996 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 997 %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst 998 store i32 %val, ptr addrspace(1) %out2 999 ret void 1000} 1001 1002define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { 1003; IR-LABEL: @atomic_umax_i32_offset( 1004; IR-NEXT: entry: 1005; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 1006; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1007; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1008; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1009; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1010; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1011; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1012; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1013; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1014; IR: 7: 1015; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1016; IR-NEXT: br label [[TMP9]] 1017; IR: 9: 1018; IR-NEXT: ret void 1019; 1020entry: 1021 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 1022 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 1023 ret void 1024} 1025 1026define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 1027; IR-LABEL: @atomic_umax_i32_ret_offset( 1028; IR-NEXT: entry: 1029; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 1030; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1031; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1032; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1033; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1034; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1035; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1036; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1037; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1038; IR: 7: 1039; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1040; IR-NEXT: br label [[TMP9]] 1041; IR: 9: 1042; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 1043; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 1044; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] 1045; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] 1046; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 1047; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 1048; IR-NEXT: ret void 1049; 1050entry: 1051 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 1052 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 1053 store i32 %val, ptr addrspace(1) %out2 1054 ret void 1055} 1056 1057define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { 1058; IR-LABEL: @atomic_umax_i32_addr64_offset( 1059; IR-NEXT: entry: 1060; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 1061; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 1062; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1063; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1064; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1065; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1066; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1067; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1068; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1069; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1070; IR: 7: 1071; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1072; IR-NEXT: br label [[TMP9]] 1073; IR: 9: 1074; IR-NEXT: ret void 1075; 1076entry: 1077 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 1078 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 1079 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 1080 ret void 1081} 1082 1083define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 1084; IR-LABEL: @atomic_umax_i32_ret_addr64_offset( 1085; IR-NEXT: entry: 1086; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 1087; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 1088; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1089; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1090; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1091; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1092; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1093; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1094; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1095; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1096; IR: 7: 1097; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1098; IR-NEXT: br label [[TMP9]] 1099; IR: 9: 1100; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 1101; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 1102; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] 1103; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] 1104; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 1105; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 1106; IR-NEXT: ret void 1107; 1108entry: 1109 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 1110 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 1111 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 1112 store i32 %val, ptr addrspace(1) %out2 1113 ret void 1114} 1115 1116define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { 1117; IR-LABEL: @atomic_umax_i32( 1118; IR-NEXT: entry: 1119; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1120; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1121; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1122; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1123; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1124; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1125; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1126; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1127; IR: 7: 1128; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1129; IR-NEXT: br label [[TMP9]] 1130; IR: 9: 1131; IR-NEXT: ret void 1132; 1133entry: 1134 %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst 1135 ret void 1136} 1137 1138define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 1139; IR-LABEL: @atomic_umax_i32_ret( 1140; IR-NEXT: entry: 1141; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1142; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1143; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1144; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1145; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1146; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1147; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1148; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1149; IR: 7: 1150; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1151; IR-NEXT: br label [[TMP9]] 1152; IR: 9: 1153; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 1154; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 1155; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] 1156; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] 1157; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 1158; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 1159; IR-NEXT: ret void 1160; 1161entry: 1162 %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst 1163 store i32 %val, ptr addrspace(1) %out2 1164 ret void 1165} 1166 1167define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { 1168; IR-LABEL: @atomic_umax_i32_addr64( 1169; IR-NEXT: entry: 1170; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 1171; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1172; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1173; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1174; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1175; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1176; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1177; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1178; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1179; IR: 7: 1180; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1181; IR-NEXT: br label [[TMP9]] 1182; IR: 9: 1183; IR-NEXT: ret void 1184; 1185entry: 1186 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 1187 %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst 1188 ret void 1189} 1190 1191define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 1192; IR-LABEL: @atomic_umax_i32_ret_addr64( 1193; IR-NEXT: entry: 1194; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 1195; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1196; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1197; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1198; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1199; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1200; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1201; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1202; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1203; IR: 7: 1204; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1205; IR-NEXT: br label [[TMP9]] 1206; IR: 9: 1207; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 1208; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 1209; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] 1210; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] 1211; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 1212; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 1213; IR-NEXT: ret void 1214; 1215entry: 1216 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 1217 %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst 1218 store i32 %val, ptr addrspace(1) %out2 1219 ret void 1220} 1221 1222define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { 1223; IR-LABEL: @atomic_min_i32_offset( 1224; IR-NEXT: entry: 1225; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 1226; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1227; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1228; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1229; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1230; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1231; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1232; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1233; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1234; IR: 7: 1235; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1236; IR-NEXT: br label [[TMP9]] 1237; IR: 9: 1238; IR-NEXT: ret void 1239; 1240entry: 1241 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 1242 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 1243 ret void 1244} 1245 1246define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 1247; IR-LABEL: @atomic_min_i32_ret_offset( 1248; IR-NEXT: entry: 1249; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 1250; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1251; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1252; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1253; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1254; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1255; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1256; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1257; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1258; IR: 7: 1259; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1260; IR-NEXT: br label [[TMP9]] 1261; IR: 9: 1262; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 1263; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 1264; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] 1265; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] 1266; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 1267; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 1268; IR-NEXT: ret void 1269; 1270entry: 1271 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 1272 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 1273 store i32 %val, ptr addrspace(1) %out2 1274 ret void 1275} 1276 1277define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { 1278; IR-LABEL: @atomic_min_i32_addr64_offset( 1279; IR-NEXT: entry: 1280; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 1281; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 1282; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1283; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1284; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1285; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1286; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1287; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1288; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1289; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1290; IR: 7: 1291; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1292; IR-NEXT: br label [[TMP9]] 1293; IR: 9: 1294; IR-NEXT: ret void 1295; 1296entry: 1297 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 1298 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 1299 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 1300 ret void 1301} 1302 1303define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 1304; IR-LABEL: @atomic_min_i32_ret_addr64_offset( 1305; IR-NEXT: entry: 1306; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 1307; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 1308; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1309; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1310; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1311; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1312; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1313; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1314; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1315; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1316; IR: 7: 1317; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1318; IR-NEXT: br label [[TMP9]] 1319; IR: 9: 1320; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 1321; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 1322; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] 1323; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] 1324; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 1325; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 1326; IR-NEXT: ret void 1327; 1328entry: 1329 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 1330 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 1331 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst 1332 store i32 %val, ptr addrspace(1) %out2 1333 ret void 1334} 1335 1336define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { 1337; IR-LABEL: @atomic_min_i32( 1338; IR-NEXT: entry: 1339; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1340; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1341; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1342; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1343; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1344; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1345; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1346; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1347; IR: 7: 1348; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1349; IR-NEXT: br label [[TMP9]] 1350; IR: 9: 1351; IR-NEXT: ret void 1352; 1353entry: 1354 %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst 1355 ret void 1356} 1357 1358define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { 1359; IR-LABEL: @atomic_min_i32_ret( 1360; IR-NEXT: entry: 1361; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1362; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1363; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1364; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1365; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1366; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1367; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1368; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1369; IR: 7: 1370; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1371; IR-NEXT: br label [[TMP9]] 1372; IR: 9: 1373; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 1374; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 1375; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] 1376; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] 1377; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 1378; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 1379; IR-NEXT: ret void 1380; 1381entry: 1382 %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst 1383 store i32 %val, ptr addrspace(1) %out2 1384 ret void 1385} 1386 1387define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { 1388; IR-LABEL: @atomic_min_i32_addr64( 1389; IR-NEXT: entry: 1390; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 1391; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1392; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1393; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1394; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1395; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1396; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1397; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1398; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1399; IR: 7: 1400; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1401; IR-NEXT: br label [[TMP9]] 1402; IR: 9: 1403; IR-NEXT: ret void 1404; 1405entry: 1406 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 1407 %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst 1408 ret void 1409} 1410 1411define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { 1412; IR-LABEL: @atomic_min_i32_ret_addr64( 1413; IR-NEXT: entry: 1414; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] 1415; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) 1416; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 1417; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 1418; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 1419; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0) 1420; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) 1421; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 1422; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] 1423; IR: 7: 1424; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 1425; IR-NEXT: br label [[TMP9]] 1426; IR: 9: 1427; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] 1428; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) 1429; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] 1430; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] 1431; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] 1432; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 1433; IR-NEXT: ret void 1434; 1435entry: 1436 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index 1437 %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst 1438 store i32 %val, ptr addrspace(1) %out2 1439 ret void 1440} 1441