; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK

; Each kernel below spins in %bb1 until the loaded value equals the workitem
; id. For the llvm.amdgcn.raw.atomic.buffer.load.* intrinsics the buffer_load
; is emitted inside the loop (between .LBBn_1 and s_cbranch_execnz), i.e. it is
; re-issued every iteration; the non-atomic llvm.amdgcn.raw.buffer.load.i32
; variant (@raw_nonatomic_buffer_load_i32) is instead hoisted above the loop.
; Both SelectionDAG (-global-isel=0) and GlobalISel (-global-isel=1) runs are
; checked against the same CHECK prefix, so they must produce identical code.

; Baseline: i32 atomic buffer load, offset 0, soffset 0, glc (cachepolicy 1).
define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB0_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Same as the baseline: offset 0 / soffset 0 still encode as no offset fields.
define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32_off:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB1_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
; offset 4 and soffset 4: selected as immediate soffset 4 plus offset:4.
define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32_soff:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB2_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 4, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
; cachepolicy 4 selects the dlc bit (instead of glc) on the load.
define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB3_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB3_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 4)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Non-atomic contrast case: the buffer_load is hoisted above .LBB4_1 and the
; comparison happens once, before entering the loop.
define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
; CHECK-LABEL: raw_nonatomic_buffer_load_i32:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
; CHECK-NEXT:    s_mov_b32 s0, 0
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:  .LBB4_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; CHECK-NEXT:    s_or_b32 s0, s1, s0
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; CHECK-NEXT:    s_cbranch_execnz .LBB4_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; i64 result: loaded as buffer_load_b64, compared against the zext'd id.
define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i64:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB5_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %id.zext = zext i32 %id to i64
  br label %bb1
bb1:
  %load = call i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32> %addr, i32 4, i32 0, i32 1)
  %cmp = icmp eq i64 %load, %id.zext
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; <2 x i16> result: a single dword load; the bitcast to i32 is free.
define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_v2i16:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB6_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB6_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32> %addr, i32 0, i32 0, i32 1)
  %bitcast = bitcast <2 x i16> %load to i32
  %cmp = icmp eq i32 %bitcast, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; <4 x i16> result with a lane shuffle: elements 0 and 2 are repacked into one
; dword (v_and + v_lshl_or) before the compare.
define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_v4i16:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB7_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32> %addr, i32 4, i32 0, i32 1)
  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
  %bitcast = bitcast <2 x i16> %shortened to i32
  %cmp = icmp eq i32 %bitcast, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; <4 x i32> result: full dwordx4 load; only element 3 (v4) feeds the compare,
; but the whole load must still be issued each iteration.
define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_v4i32:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB8_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
  %extracted = extractelement <4 x i32> %load, i32 3
  %cmp = icmp eq i32 %extracted, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; ptr result: the loaded 64-bit pointer is dereferenced (flat_load_b32) inside
; the loop; both loads repeat every iteration.
define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_ptr:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB9_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_load_b32 v1, v[1:2]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32> %addr, i32 4, i32 0, i32 1)
  %elem = load i32, ptr %load
  %cmp = icmp eq i32 %elem, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Function Attrs: nounwind readonly
declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
declare i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32>, i32, i32, i32 immarg)
declare <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32>, i32, i32, i32 immarg)
declare <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32>, i32, i32, i32 immarg)
declare <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
declare ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32>, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()