; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s

; Mixes image-sample and buffer-load VMEM operations in one compute shader.
; The GFX11 checks use the unified "s_waitcnt vmcnt(N)" counter for both kinds,
; while the GFX12/GFX12-GISEL checks show the split counters: "s_wait_loadcnt"
; for the buffer loads and "s_wait_samplecnt" for the image samples, each
; counted independently. The checks below are autogenerated — regenerate with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %descTable1, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 {
; GFX11-LABEL: mixed_vmem_types:
; GFX11:       ; %bb.0: ; %.entry
; GFX11-NEXT:    s_getpc_b64 s[4:5]
; GFX11-NEXT:    s_mov_b32 s0, s3
; GFX11-NEXT:    s_mov_b32 s3, s5
; GFX11-NEXT:    s_mov_b32 s1, s5
; GFX11-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
; GFX11-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    buffer_load_b32 v1, off, s[20:23], 0
; GFX11-NEXT:    buffer_load_b32 v2, off, s[16:19], 0
; GFX11-NEXT:    image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX11-NEXT:    buffer_load_b32 v4, off, s[40:43], 0
; GFX11-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1
; GFX11-NEXT:    s_waitcnt vmcnt(3)
; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_cmp_eq_f32_e64 s1, 1.0, v3
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX11-NEXT:    s_and_b32 s0, s0, s1
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_b32 s0, s0, s2
; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[24:27], 0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: mixed_vmem_types:
; GFX12:       ; %bb.0: ; %.entry
; GFX12-NEXT:    s_getpc_b64 s[4:5]
; GFX12-NEXT:    s_mov_b32 s0, s3
; GFX12-NEXT:    s_sext_i32_i16 s5, s5
; GFX12-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
; GFX12-NEXT:    s_mov_b32 s3, s5
; GFX12-NEXT:    s_mov_b32 s1, s5
; GFX12-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
; GFX12-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
; GFX12-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    buffer_load_b32 v1, off, s[20:23], null
; GFX12-NEXT:    buffer_load_b32 v2, off, s[16:19], null
; GFX12-NEXT:    image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-NEXT:    buffer_load_b32 v4, off, s[40:43], null
; GFX12-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-NEXT:    s_wait_loadcnt 0x2
; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1
; GFX12-NEXT:    s_wait_loadcnt 0x1
; GFX12-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
; GFX12-NEXT:    s_wait_samplecnt 0x1
; GFX12-NEXT:    v_cmp_eq_f32_e64 s1, 1.0, v3
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
; GFX12-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX12-NEXT:    s_and_b32 s0, s0, s1
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_and_b32 s0, s0, s2
; GFX12-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-NEXT:    buffer_store_b32 v0, off, s[24:27], null
; GFX12-NEXT:    s_endpgm
;
; GFX12-GISEL-LABEL: mixed_vmem_types:
; GFX12-GISEL:       ; %bb.0: ; %.entry
; GFX12-GISEL-NEXT:    s_getpc_b64 s[20:21]
; GFX12-GISEL-NEXT:    s_mov_b32 s0, s3
; GFX12-GISEL-NEXT:    s_sext_i32_i16 s21, s21
; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
; GFX12-GISEL-NEXT:    s_mov_b32 s1, s21
; GFX12-GISEL-NEXT:    s_mov_b32 s3, s21
; GFX12-GISEL-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
; GFX12-GISEL-NEXT:    s_clause 0x1
; GFX12-GISEL-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
; GFX12-GISEL-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT:    image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT:    buffer_load_b32 v2, off, s[16:19], null
; GFX12-GISEL-NEXT:    buffer_load_b32 v3, off, s[20:23], null
; GFX12-GISEL-NEXT:    buffer_load_b32 v4, off, s[40:43], null
; GFX12-GISEL-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x2
; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x1
; GFX12-GISEL-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x1
; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s1, 0xac0, v3
; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, s1
; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, s2
; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-GISEL-NEXT:    buffer_store_b32 v0, off, s[24:27], null
; GFX12-GISEL-NEXT:    s_endpgm
.entry:
  ; Rebuild two 64-bit constant-address-space descriptor-table pointers: the
  ; low 32 bits come from the inreg table arguments, the high 32 bits from the
  ; upper half of the program counter (common descriptor-table idiom; the
  ; tables are presumably in the same 4 GiB region as the code — confirm with
  ; the producing frontend).
  %i = call i64 @llvm.amdgcn.s.getpc()
  %extelt.offset = lshr i64 %i, 32
  %.i1 = trunc i64 %extelt.offset to i32
  %.upto0 = insertelement <2 x i32> poison, i32 %descTable1, i64 0
  %i1 = insertelement <2 x i32> %.upto0, i32 %.i1, i64 1
  %i2 = bitcast <2 x i32> %i1 to i64
  %i3 = inttoptr i64 %i2 to ptr addrspace(4)
  %.upto03 = insertelement <2 x i32> poison, i32 %descTable0, i64 0
  %i4 = insertelement <2 x i32> %.upto03, i32 %.i1, i64 1
  %i5 = bitcast <2 x i32> %i4 to i64
  %i6 = inttoptr i64 %i5 to ptr addrspace(4)
  ; Load resource descriptors from the two tables: <4 x i32> buffer
  ; descriptors at various offsets and a <8 x i32>/<4 x i32> image
  ; resource/sampler pair from table 0.
  %i7 = getelementptr i8, ptr addrspace(4) %i6, i64 80
  %i8 = load <4 x i32>, ptr addrspace(4) %i7, align 16
  %i9 = getelementptr i8, ptr addrspace(4) %i3, i64 48
  %i10 = load <4 x i32>, ptr addrspace(4) %i9, align 16
  %i11 = getelementptr i8, ptr addrspace(4) %i6, i64 64
  %i12 = load <4 x i32>, ptr addrspace(4) %i11, align 16
  %i13 = getelementptr i8, ptr addrspace(4) %i6, i64 16
  %i14 = load <4 x i32>, ptr addrspace(4) %i13, align 16
  %i15 = getelementptr i8, ptr addrspace(4) %i6, i64 32
  %i16 = load <8 x i32>, ptr addrspace(4) %i15, align 32
  %i17 = load <4 x i32>, ptr addrspace(4) %i6, align 16
  ; First image sample (coords 0xHBC00 = half -1.0) interleaved with three
  ; buffer loads and a second sample: this interleaving of the two VMEM kinds
  ; is what drives the waitcnt-insertion checks above.
  %i18 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i16, <4 x i32> %i17, i1 false, i32 0, i32 0)
  %i19 = fcmp oeq float %i18, 0.000000e+00
  %i20 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i14, i32 0, i32 0, i32 0)
  %.not = icmp eq i32 %i20, 2752
  %i21 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i12, i32 0, i32 0, i32 0)
  %.not1 = icmp eq i32 %i21, 2752
  %i22 = getelementptr i8, ptr addrspace(4) %i3, i64 16
  %i23 = load <8 x i32>, ptr addrspace(4) %i22, align 32
  %i24 = load <4 x i32>, ptr addrspace(4) %i3, align 16
  %i25 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i23, <4 x i32> %i24, i1 false, i32 0, i32 0)
  %i26 = fcmp oeq float %i25, 1.000000e+00
  %i27 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i10, i32 0, i32 0, i32 0)
  %.not2 = icmp eq i32 %i27, 2752
  ; AND-reduce all five comparisons (select-chains are the short-circuit form
  ; of i1 'and') and store the result as 0/1 to the output buffer.
  %i28 = select i1 %.not2, i1 %i26, i1 false
  %i29 = select i1 %i28, i1 %.not1, i1 false
  %i30 = select i1 %i29, i1 %.not, i1 false
  %narrow2 = select i1 %i30, i1 %i19, i1 false
  %.4 = zext i1 %narrow2 to i32
  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %.4, <4 x i32> %i8, i32 0, i32 0, i32 0)
  ret void
}