; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll (revision eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK

; Spin-loop test for the i32 struct atomic buffer load.
; %bb1 re-issues the load (vindex = %index, offset = 0, soffset = 0, aux = 1)
; for as long as the loaded value equals the workitem id. The expected code
; keeps the buffer_load_b32 (idxen, glc) inside the loop body .LBB0_1 instead
; of emitting it once before the loop.
define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_atomic_buffer_load_i32:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  .LBB0_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> %addr, i32 %index, i32 0, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Same spin loop as the plain i32 test, but with a constant vindex of 15.
; The expected code materialises the index once before the loop
; (v_dual_mov_b32 v1, 15) while the buffer_load_b32 itself stays inside the
; loop body .LBB1_1.
define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %addr) {
; CHECK-LABEL: struct_atomic_buffer_load_i32_const_idx:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:  .LBB1_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> %addr, i32 15, i32 0, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Spin-loop test for the i32 struct atomic buffer load; the load is expected
; inside the loop body .LBB2_1.
; NOTE(review): the call passes offset = 0 and soffset = 0, i.e. exactly the
; operands of @struct_atomic_buffer_load_i32, so the expected load carries no
; "offset:" modifier despite the _off suffix. Confirm whether a non-zero
; offset was intended; changing it requires regenerating the assertions.
define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_atomic_buffer_load_i32_off:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  .LBB2_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> %addr, i32 %index, i32 0, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Spin-loop test with both offset = 4 and soffset = 4 (aux = 1).
; The expected in-loop buffer_load_b32 carries the soffset operand 4 and the
; "offset:4" modifier in addition to idxen and glc.
define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_atomic_buffer_load_i32_soff:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  .LBB3_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB3_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> %addr, i32 %index, i32 4, i32 4, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
; Spin-loop test with aux = 4 and offset = 4.
; The expected in-loop buffer_load_b32 carries the dlc modifier (rather than
; the glc seen with aux = 1) together with "offset:4".
define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_atomic_buffer_load_i32_dlc:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  .LBB4_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB4_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 4)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Contrast case: the same spin loop, but using the ordinary
; llvm.amdgcn.struct.buffer.load.i32 (offset = 4, soffset = 0, aux = 1).
; Here the expected code issues the buffer_load_b32 and the compare once,
; before the loop label .LBB5_1; the loop body only updates the exec mask.
define amdgpu_kernel void @struct_nonatomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_nonatomic_buffer_load_i32:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0
; CHECK-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT:    s_mov_b32 s0, 0
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT:  .LBB5_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; CHECK-NEXT:    s_or_b32 s0, s1, s0
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Spin-loop test for the i64 struct atomic buffer load (offset = 4, aux = 1).
; The workitem id is zero-extended to i64 for the comparison. The expected
; code keeps a buffer_load_b64 inside the loop body .LBB6_1 and compares with
; v_cmp_ne_u64.
define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_atomic_buffer_load_i64:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v2, s6
; CHECK-NEXT:  .LBB6_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB6_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %id.zext = zext i32 %id to i64
  br label %bb1
bb1:
  %load = call i64 @llvm.amdgcn.struct.atomic.buffer.load.i64(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 1)
  %cmp = icmp eq i64 %load, %id.zext
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Spin-loop test for the <2 x i16> struct atomic buffer load (offset = 0,
; aux = 1). The vector result is bitcast to i32 before being compared with the
; workitem id; the expected code is a single buffer_load_b32 inside the loop
; body .LBB7_1.
define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_atomic_buffer_load_v2i16:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  .LBB7_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <2 x i16> @llvm.amdgcn.struct.atomic.buffer.load.v2i16(<4 x i32> %addr, i32 %index, i32 0, i32 0, i32 1)
  %bitcast = bitcast <2 x i16> %load to i32
  %cmp = icmp eq i32 %bitcast, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Spin-loop test for the <4 x i16> struct atomic buffer load (offset = 4,
; aux = 1). Only elements 0 and 2 of the result are kept (shufflevector) and
; bitcast to i32 for the comparison. The expected code keeps a buffer_load_b64
; inside the loop body .LBB8_1 and repacks the two halves with
; v_and_b32 / v_lshl_or_b32 before the compare.
define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_atomic_buffer_load_v4i16:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  .LBB8_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <4 x i16> @llvm.amdgcn.struct.atomic.buffer.load.v4i16(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 1)
  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
  %bitcast = bitcast <2 x i16> %shortened to i32
  %cmp = icmp eq i32 %bitcast, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Spin-loop test for the <4 x i32> struct atomic buffer load (offset = 4,
; aux = 1). Only element 3 of the result is compared with the workitem id.
; The expected code still loads all four dwords with buffer_load_b128 v[2:5]
; inside the loop body .LBB9_1 and compares v5.
define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_atomic_buffer_load_v4i32:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  .LBB9_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <4 x i32> @llvm.amdgcn.struct.atomic.buffer.load.v4i32(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 1)
  %extracted = extractelement <4 x i32> %load, i32 3
  %cmp = icmp eq i32 %extracted, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Spin-loop test for the pointer-returning struct atomic buffer load
; (offset = 4, aux = 1). The loaded pointer is dereferenced with an ordinary
; i32 load and that value is compared with the workitem id. The expected code
; keeps the buffer_load_b64 inside the loop body .LBB10_1, followed by a
; flat_load_b32 through the loaded pointer.
define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %index) {
; CHECK-LABEL: struct_atomic_buffer_load_ptr:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_mov_b32 s4, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  .LBB10_1: ; %bb1
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_load_b32 v2, v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT:    s_cbranch_execnz .LBB10_1
; CHECK-NEXT:  ; %bb.2: ; %bb2
; CHECK-NEXT:    s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call ptr @llvm.amdgcn.struct.atomic.buffer.load.ptr(<4 x i32> %addr, i32 %index, i32 4, i32 0, i32 1)
  %elem = load i32, ptr %load
  %cmp = icmp eq i32 %elem, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Declarations for the intrinsics called by the kernels in this file.
; The struct atomic buffer load variants all take
; (<4 x i32>, i32, i32, i32, i32 immarg) and differ only in the return type.
; The names must be spelled "struct.atomic.buffer.load" to match the call
; sites; a "struct.atom.buffer.load" spelling declares functions that nothing
; in this file calls.
declare i32 @llvm.amdgcn.struct.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32, i32 immarg)
declare i64 @llvm.amdgcn.struct.atomic.buffer.load.i64(<4 x i32>, i32, i32, i32, i32 immarg)
declare <2 x i16> @llvm.amdgcn.struct.atomic.buffer.load.v2i16(<4 x i32>, i32, i32, i32, i32 immarg)
declare <4 x i16> @llvm.amdgcn.struct.atomic.buffer.load.v4i16(<4 x i32>, i32, i32, i32, i32 immarg)
declare <4 x i32> @llvm.amdgcn.struct.atomic.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32 immarg)
declare ptr @llvm.amdgcn.struct.atomic.buffer.load.ptr(<4 x i32>, i32, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32>, i32, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()