; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s

; Mixes image-sample and buffer-load VMEM operations in one compute shader.
; The GFX11 checks use the unified "s_waitcnt vmcnt(N)" counter for both kinds,
; while the GFX12/GFX12-GISEL checks show the split counters: "s_wait_loadcnt"
; for the buffer loads and "s_wait_samplecnt" for the image samples, each
; counted independently. The checks below are autogenerated — regenerate with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %descTable1, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 {
; GFX11-LABEL: mixed_vmem_types:
; GFX11:       ; %bb.0: ; %.entry
; GFX11-NEXT:    s_getpc_b64 s[4:5]
; GFX11-NEXT:    s_mov_b32 s0, s3
; GFX11-NEXT:    s_mov_b32 s3, s5
; GFX11-NEXT:    s_mov_b32 s1, s5
; GFX11-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
; GFX11-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    buffer_load_b32 v1, off, s[20:23], 0
; GFX11-NEXT:    buffer_load_b32 v2, off, s[16:19], 0
; GFX11-NEXT:    image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX11-NEXT:    buffer_load_b32 v4, off, s[40:43], 0
; GFX11-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX11-NEXT:    s_waitcnt vmcnt(4)
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1
; GFX11-NEXT:    s_waitcnt vmcnt(3)
; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_cmp_eq_f32_e64 s1, 1.0, v3
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX11-NEXT:    s_and_b32 s0, s0, s1
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_b32 s0, s0, s2
; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[24:27], 0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: mixed_vmem_types:
; GFX12:       ; %bb.0: ; %.entry
; GFX12-NEXT:    s_getpc_b64 s[4:5]
; GFX12-NEXT:    s_mov_b32 s0, s3
; GFX12-NEXT:    s_sext_i32_i16 s5, s5
; GFX12-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
; GFX12-NEXT:    s_mov_b32 s3, s5
; GFX12-NEXT:    s_mov_b32 s1, s5
; GFX12-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
; GFX12-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
; GFX12-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    buffer_load_b32 v1, off, s[20:23], null
; GFX12-NEXT:    buffer_load_b32 v2, off, s[16:19], null
; GFX12-NEXT:    image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-NEXT:    buffer_load_b32 v4, off, s[40:43], null
; GFX12-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-NEXT:    s_wait_loadcnt 0x2
; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1
; GFX12-NEXT:    s_wait_loadcnt 0x1
; GFX12-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
; GFX12-NEXT:    s_wait_samplecnt 0x1
; GFX12-NEXT:    v_cmp_eq_f32_e64 s1, 1.0, v3
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
; GFX12-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX12-NEXT:    s_and_b32 s0, s0, s1
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_and_b32 s0, s0, s2
; GFX12-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-NEXT:    buffer_store_b32 v0, off, s[24:27], null
; GFX12-NEXT:    s_endpgm
;
; GFX12-GISEL-LABEL: mixed_vmem_types:
; GFX12-GISEL:       ; %bb.0: ; %.entry
; GFX12-GISEL-NEXT:    s_getpc_b64 s[20:21]
; GFX12-GISEL-NEXT:    s_mov_b32 s0, s3
; GFX12-GISEL-NEXT:    s_sext_i32_i16 s21, s21
; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
; GFX12-GISEL-NEXT:    s_mov_b32 s1, s21
; GFX12-GISEL-NEXT:    s_mov_b32 s3, s21
; GFX12-GISEL-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
; GFX12-GISEL-NEXT:    s_clause 0x1
; GFX12-GISEL-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
; GFX12-GISEL-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT:    image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT:    buffer_load_b32 v2, off, s[16:19], null
; GFX12-GISEL-NEXT:    buffer_load_b32 v3, off, s[20:23], null
; GFX12-GISEL-NEXT:    buffer_load_b32 v4, off, s[40:43], null
; GFX12-GISEL-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x2
; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x1
; GFX12-GISEL-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x1
; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s1, 0xac0, v3
; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, s1
; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, s2
; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
; GFX12-GISEL-NEXT:    buffer_store_b32 v0, off, s[24:27], null
; GFX12-GISEL-NEXT:    s_endpgm
.entry:
  ; Rebuild two 64-bit constant-address-space descriptor-table pointers: the
  ; low 32 bits come from the inreg table arguments, the high 32 bits from the
  ; upper half of the program counter (common descriptor-table idiom; the
  ; tables are presumably in the same 4 GiB region as the code — confirm with
  ; the producing frontend).
  %i = call i64 @llvm.amdgcn.s.getpc()
  %extelt.offset = lshr i64 %i, 32
  %.i1 = trunc i64 %extelt.offset to i32
  %.upto0 = insertelement <2 x i32> poison, i32 %descTable1, i64 0
  %i1 = insertelement <2 x i32> %.upto0, i32 %.i1, i64 1
  %i2 = bitcast <2 x i32> %i1 to i64
  %i3 = inttoptr i64 %i2 to ptr addrspace(4)
  %.upto03 = insertelement <2 x i32> poison, i32 %descTable0, i64 0
  %i4 = insertelement <2 x i32> %.upto03, i32 %.i1, i64 1
  %i5 = bitcast <2 x i32> %i4 to i64
  %i6 = inttoptr i64 %i5 to ptr addrspace(4)
  ; Load resource descriptors from the two tables: <4 x i32> buffer
  ; descriptors at various offsets and a <8 x i32>/<4 x i32> image
  ; resource/sampler pair from table 0.
  %i7 = getelementptr i8, ptr addrspace(4) %i6, i64 80
  %i8 = load <4 x i32>, ptr addrspace(4) %i7, align 16
  %i9 = getelementptr i8, ptr addrspace(4) %i3, i64 48
  %i10 = load <4 x i32>, ptr addrspace(4) %i9, align 16
  %i11 = getelementptr i8, ptr addrspace(4) %i6, i64 64
  %i12 = load <4 x i32>, ptr addrspace(4) %i11, align 16
  %i13 = getelementptr i8, ptr addrspace(4) %i6, i64 16
  %i14 = load <4 x i32>, ptr addrspace(4) %i13, align 16
  %i15 = getelementptr i8, ptr addrspace(4) %i6, i64 32
  %i16 = load <8 x i32>, ptr addrspace(4) %i15, align 32
  %i17 = load <4 x i32>, ptr addrspace(4) %i6, align 16
  ; First image sample (coords 0xHBC00 = half -1.0) interleaved with three
  ; buffer loads and a second sample: this interleaving of the two VMEM kinds
  ; is what drives the waitcnt-insertion checks above.
  %i18 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i16, <4 x i32> %i17, i1 false, i32 0, i32 0)
  %i19 = fcmp oeq float %i18, 0.000000e+00
  %i20 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i14, i32 0, i32 0, i32 0)
  %.not = icmp eq i32 %i20, 2752
  %i21 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i12, i32 0, i32 0, i32 0)
  %.not1 = icmp eq i32 %i21, 2752
  %i22 = getelementptr i8, ptr addrspace(4) %i3, i64 16
  %i23 = load <8 x i32>, ptr addrspace(4) %i22, align 32
  %i24 = load <4 x i32>, ptr addrspace(4) %i3, align 16
  %i25 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i23, <4 x i32> %i24, i1 false, i32 0, i32 0)
  %i26 = fcmp oeq float %i25, 1.000000e+00
  %i27 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i10, i32 0, i32 0, i32 0)
  %.not2 = icmp eq i32 %i27, 2752
  ; AND-reduce all five comparisons (select-chains are the short-circuit form
  ; of i1 'and') and store the result as 0/1 to the output buffer.
  %i28 = select i1 %.not2, i1 %i26, i1 false
  %i29 = select i1 %i28, i1 %.not1, i1 false
  %i30 = select i1 %i29, i1 %.not, i1 false
  %narrow2 = select i1 %i30, i1 %i19, i1 false
  %.4 = zext i1 %narrow2 to i32
  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %.4, <4 x i32> %i8, i32 0, i32 0, i32 0)
  ret void
}