; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; TODO: Run the pre-gfx12 targets for global isel as well (only the gfx1200 RUN lines pass -global-isel=1).
; RUN: llc -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1013 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1030 %s
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX11 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s

; Intrinsics under test, written as C-like prototypes:
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)

declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)

; The gfx1012 run is expected to crash with the diagnostic below.
; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
; Arguments are flattened to represent the actual VGPR_A layout, so we have no
; extra moves in the generated kernel.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; PRE-GFX12-LABEL: image_bvh_intersect_ray:
; PRE-GFX12:       ; %bb.0: ; %main_body
; PRE-GFX12-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[0:3]
; PRE-GFX12-NEXT:    s_waitcnt vmcnt(0)
; PRE-GFX12-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: image_bvh_intersect_ray:
; GFX12:       ; %bb.0: ; %main_body
; GFX12-NEXT:    image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[5:7], v[8:10]], s[0:3]
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    ; return to shader part epilog
main_body:
  ; poison is only a placeholder base: all three lanes are written before use.
  %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
  %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
  %ray_inv_dir0 = insertelement <3 x float> poison, float %ray_inv_dir_x, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; a16 form with every operand passed inreg: the checks cover the scalar-to-vector
; register copies and the packing of the half ray_dir/ray_inv_dir components.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GFX10-LABEL: image_bvh_intersect_ray_a16:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_mov_b32 s15, s12
; GFX10-NEXT:    s_mov_b32 s12, s9
; GFX10-NEXT:    s_lshr_b32 s9, s7, 16
; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s9, s8
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v4, s4
; GFX10-NEXT:    v_mov_b32_e32 v5, s5
; GFX10-NEXT:    v_mov_b32_e32 v6, s6
; GFX10-NEXT:    v_mov_b32_e32 v7, s7
; GFX10-NEXT:    s_mov_b32 s14, s11
; GFX10-NEXT:    s_mov_b32 s13, s10
; GFX10-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: image_bvh_intersect_ray_a16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    s_lshr_b32 s2, s7, 16
; GFX11-NEXT:    s_lshr_b32 s3, s5, 16
; GFX11-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s7
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    s_pack_ll_b32_b16 s4, s6, s8
; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4
; GFX11-NEXT:    s_mov_b32 s15, s12
; GFX11-NEXT:    s_mov_b32 s14, s11
; GFX11-NEXT:    s_mov_b32 s13, s10
; GFX11-NEXT:    s_mov_b32 s12, s9
; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
;
; GFX12-SDAG-LABEL: image_bvh_intersect_ray_a16:
; GFX12-SDAG:       ; %bb.0: ; %main_body
; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT:    s_lshr_b32 s2, s7, 16
; GFX12-SDAG-NEXT:    s_lshr_b32 s3, s5, 16
; GFX12-SDAG-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s3, s5, s7
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3
; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s4, s6, s8
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4
; GFX12-SDAG-NEXT:    s_mov_b32 s15, s12
; GFX12-SDAG-NEXT:    s_mov_b32 s14, s11
; GFX12-SDAG-NEXT:    s_mov_b32 s13, s10
; GFX12-SDAG-NEXT:    s_mov_b32 s12, s9
; GFX12-SDAG-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    ; return to shader part epilog
;
; GFX12-GISEL-LABEL: image_bvh_intersect_ray_a16:
; GFX12-GISEL:       ; %bb.0: ; %main_body
; GFX12-GISEL-NEXT:    s_mov_b32 s20, s2
; GFX12-GISEL-NEXT:    s_mov_b32 s22, s4
; GFX12-GISEL-NEXT:    s_pack_ll_b32_b16 s4, s7, s5
; GFX12-GISEL-NEXT:    s_mov_b32 s21, s3
; GFX12-GISEL-NEXT:    s_pack_hh_b32_b16 s5, s7, s5
; GFX12-GISEL-NEXT:    s_pack_ll_b32_b16 s6, s8, s6
; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4
; GFX12-GISEL-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
; GFX12-GISEL-NEXT:    v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
; GFX12-GISEL-NEXT:    s_mov_b32 s16, s9
; GFX12-GISEL-NEXT:    s_mov_b32 s17, s10
; GFX12-GISEL-NEXT:    s_mov_b32 s18, s11
; GFX12-GISEL-NEXT:    s_mov_b32 s19, s12
; GFX12-GISEL-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[16:19] a16
; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT:    ; return to shader part epilog
main_body:
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; Arguments are flattened to represent the actual VGPR_A layout, so we have no
; extra moves in the generated kernel.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; PRE-GFX12-LABEL: image_bvh64_intersect_ray:
; PRE-GFX12:       ; %bb.0: ; %main_body
; PRE-GFX12-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; PRE-GFX12-NEXT:    s_waitcnt vmcnt(0)
; PRE-GFX12-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: image_bvh64_intersect_ray:
; GFX12:       ; %bb.0: ; %main_body
; GFX12-NEXT:    image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[6:8], v[9:11]], s[0:3]
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    ; return to shader part epilog
main_body:
  ; The 64-bit node pointer arrives as two i32 lanes and is reinterpreted as i64.
  %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
  ; poison is only a placeholder base: all three lanes are written before use.
  %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
  %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
  %ray_inv_dir0 = insertelement <3 x float> poison, float %ray_inv_dir_x, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; 64-bit node pointer, a16 form, every operand passed inreg: the checks cover the
; scalar-to-vector register copies and the packing of the half components.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GFX10-LABEL: image_bvh64_intersect_ray_a16:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_mov_b32 s14, s12
; GFX10-NEXT:    s_mov_b32 s12, s10
; GFX10-NEXT:    s_lshr_b32 s10, s8, 16
; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s10, s9
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v4, s4
; GFX10-NEXT:    v_mov_b32_e32 v5, s5
; GFX10-NEXT:    v_mov_b32_e32 v6, s6
; GFX10-NEXT:    v_mov_b32_e32 v7, s7
; GFX10-NEXT:    v_mov_b32_e32 v8, s8
; GFX10-NEXT:    s_mov_b32 s15, s13
; GFX10-NEXT:    s_mov_b32 s13, s11
; GFX10-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[12:15] a16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
; GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
; GFX11-NEXT:    s_lshr_b32 s3, s6, 16
; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s6, s8
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT:    s_lshr_b32 s0, s8, 16
; GFX11-NEXT:    v_mov_b32_e32 v8, s2
; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s7, s9
; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
; GFX11-NEXT:    s_mov_b32 s15, s13
; GFX11-NEXT:    s_mov_b32 s14, s12
; GFX11-NEXT:    s_mov_b32 s13, s11
; GFX11-NEXT:    s_mov_b32 s12, s10
; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
;
; GFX12-SDAG-LABEL: image_bvh64_intersect_ray_a16:
; GFX12-SDAG:       ; %bb.0: ; %main_body
; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
; GFX12-SDAG-NEXT:    s_lshr_b32 s3, s6, 16
; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s1, s6, s8
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1
; GFX12-SDAG-NEXT:    s_lshr_b32 s0, s8, 16
; GFX12-SDAG-NEXT:    v_mov_b32_e32 v8, s2
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s3, s7, s9
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
; GFX12-SDAG-NEXT:    s_mov_b32 s15, s13
; GFX12-SDAG-NEXT:    s_mov_b32 s14, s12
; GFX12-SDAG-NEXT:    s_mov_b32 s13, s11
; GFX12-SDAG-NEXT:    s_mov_b32 s12, s10
; GFX12-SDAG-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    ; return to shader part epilog
;
; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_a16:
; GFX12-GISEL:       ; %bb.0: ; %main_body
; GFX12-GISEL-NEXT:    s_mov_b32 s20, s3
; GFX12-GISEL-NEXT:    s_mov_b32 s21, s4
; GFX12-GISEL-NEXT:    s_pack_ll_b32_b16 s4, s8, s6
; GFX12-GISEL-NEXT:    s_mov_b32 s22, s5
; GFX12-GISEL-NEXT:    s_pack_hh_b32_b16 s5, s8, s6
; GFX12-GISEL-NEXT:    s_pack_ll_b32_b16 s6, s9, s7
; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4
; GFX12-GISEL-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-GISEL-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21
; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6
; GFX12-GISEL-NEXT:    v_mov_b32_e32 v4, s5
; GFX12-GISEL-NEXT:    s_mov_b32 s16, s10
; GFX12-GISEL-NEXT:    s_mov_b32 s17, s11
; GFX12-GISEL-NEXT:    s_mov_b32 s18, s12
; GFX12-GISEL-NEXT:    s_mov_b32 s19, s13
; GFX12-GISEL-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[16:19] a16
; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT:    ; return to shader part epilog
main_body:
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs.
263 264define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { 265; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: 266; GFX1013: ; %bb.0: ; %main_body 267; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 268; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 269; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 270; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 271; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 272; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 273; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 274; GFX1013-NEXT: s_waitcnt lgkmcnt(0) 275; GFX1013-NEXT: v_add_co_u32 v2, s0, s8, v0 276; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0 277; GFX1013-NEXT: v_add_co_u32 v4, s0, s10, v0 278; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s11, 0, s0 279; GFX1013-NEXT: flat_load_dword v0, v[2:3] 280; GFX1013-NEXT: flat_load_dword v1, v[4:5] 281; GFX1013-NEXT: v_mov_b32_e32 v2, 0 282; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 283; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 284; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 285; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 286; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[12:15] 287; GFX1013-NEXT: s_waitcnt vmcnt(0) 288; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 289; GFX1013-NEXT: s_endpgm 290; 291; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: 292; GFX1030: ; %bb.0: ; %main_body 293; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 294; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 295; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 296; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 297; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40c00000 298; GFX1030-NEXT: v_mov_b32_e32 v7, 0x40a00000 299; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 300; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 301; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 302; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 303; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 304; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 305; GFX1030-NEXT: 
v_add_co_u32 v2, s0, s2, v2 306; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 307; GFX1030-NEXT: flat_load_dword v0, v[0:1] 308; GFX1030-NEXT: flat_load_dword v1, v[2:3] 309; GFX1030-NEXT: v_mov_b32_e32 v2, 0 310; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 311; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 312; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] 313; GFX1030-NEXT: s_waitcnt vmcnt(0) 314; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 315; GFX1030-NEXT: s_endpgm 316; 317; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: 318; GFX11: ; %bb.0: ; %main_body 319; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 320; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 321; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 322; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 323; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) 324; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 325; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 326; GFX11-NEXT: s_waitcnt lgkmcnt(0) 327; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 328; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 329; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 330; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 331; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 332; GFX11-NEXT: flat_load_b32 v9, v[0:1] 333; GFX11-NEXT: flat_load_b32 v10, v[2:3] 334; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 335; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 336; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 337; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 338; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 339; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[4:7] 340; GFX11-NEXT: s_waitcnt vmcnt(0) 341; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] 342; GFX11-NEXT: s_endpgm 343; 344; GFX12-SDAG-LABEL: image_bvh_intersect_ray_nsa_reassign: 345; GFX12-SDAG: ; %bb.0: ; %main_body 346; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 
0x24 347; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 348; GFX12-SDAG-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 349; GFX12-SDAG-NEXT: v_mov_b32_e32 v8, 2.0 350; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) 351; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 352; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 4.0 353; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 354; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 355; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 356; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 357; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 358; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 359; GFX12-SDAG-NEXT: flat_load_b32 v9, v[0:1] 360; GFX12-SDAG-NEXT: flat_load_b32 v10, v[2:3] 361; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000 362; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x40c00000 363; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x41000000 364; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000 365; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 366; GFX12-SDAG-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[4:7] 367; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 368; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] 369; GFX12-SDAG-NEXT: s_endpgm 370; 371; GFX12-GISEL-LABEL: image_bvh_intersect_ray_nsa_reassign: 372; GFX12-GISEL: ; %bb.0: ; %main_body 373; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 374; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 375; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000 376; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 377; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 378; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 379; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 380; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 381; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 382; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12 383; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 384; 
GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 385; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 386; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 387; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 388; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 389; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 390; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 391; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 392; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 393; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 394; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 395; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1] 396; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3] 397; GFX12-GISEL-NEXT: s_wait_alu 0xfffe 398; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 399; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 400; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 401; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 402; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] 403; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 404; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] 405; GFX12-GISEL-NEXT: s_endpgm 406main_body: 407 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 408 %gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid 409 %node_ptr = load i32, ptr %gep_node_ptr, align 4 410 %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid 411 %ray_extent = load float, ptr %gep_ray, align 4 412 %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0 413 %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1 414 %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2 415 %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0 416 %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1 417 %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2 
418 %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0 419 %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1 420 %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2 421 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) 422 store <4 x i32> %v, ptr undef 423 ret void 424} 425 426define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { 427; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: 428; GFX1013: ; %bb.0: ; %main_body 429; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 430; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 431; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 432; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 433; GFX1013-NEXT: s_waitcnt lgkmcnt(0) 434; GFX1013-NEXT: v_add_co_u32 v2, s0, s8, v0 435; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0 436; GFX1013-NEXT: v_add_co_u32 v4, s0, s10, v0 437; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s11, 0, s0 438; GFX1013-NEXT: flat_load_dword v0, v[2:3] 439; GFX1013-NEXT: flat_load_dword v1, v[4:5] 440; GFX1013-NEXT: v_mov_b32_e32 v2, 0 441; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 442; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 443; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 444; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 445; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16 446; GFX1013-NEXT: s_waitcnt vmcnt(0) 447; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 448; GFX1013-NEXT: s_endpgm 449; 450; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: 451; GFX1030: ; %bb.0: ; %main_body 452; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 453; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 454; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 455; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 456; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 457; 
GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 458; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 459; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 460; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 461; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2 462; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 463; GFX1030-NEXT: flat_load_dword v0, v[0:1] 464; GFX1030-NEXT: flat_load_dword v1, v[2:3] 465; GFX1030-NEXT: v_mov_b32_e32 v2, 0 466; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 467; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 468; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 469; GFX1030-NEXT: s_waitcnt vmcnt(0) 470; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 471; GFX1030-NEXT: s_endpgm 472; 473; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: 474; GFX11: ; %bb.0: ; %main_body 475; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 476; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 477; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 478; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 479; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 480; GFX11-NEXT: s_waitcnt lgkmcnt(0) 481; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 482; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 483; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 484; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 485; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 486; GFX11-NEXT: flat_load_b32 v6, v[0:1] 487; GFX11-NEXT: flat_load_b32 v7, v[2:3] 488; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 489; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 490; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 491; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 492; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 493; GFX11-NEXT: s_waitcnt vmcnt(0) 494; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] 495; GFX11-NEXT: s_endpgm 496; 497; GFX12-SDAG-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: 498; 
GFX12-SDAG: ; %bb.0: ; %main_body 499; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 500; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 501; GFX12-SDAG-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 502; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 503; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 504; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 505; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 506; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 507; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 508; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 509; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 510; GFX12-SDAG-NEXT: flat_load_b32 v6, v[0:1] 511; GFX12-SDAG-NEXT: flat_load_b32 v7, v[2:3] 512; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x47004400 513; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x46004200 514; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 515; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 516; GFX12-SDAG-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 517; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 518; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] 519; GFX12-SDAG-NEXT: s_endpgm 520; 521; GFX12-GISEL-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: 522; GFX12-GISEL: ; %bb.0: ; %main_body 523; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 524; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 525; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600 526; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 527; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 528; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) 529; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 530; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 531; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 532; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 533; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 534; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 535; GFX12-GISEL-NEXT: 
s_mov_b32 s1, 1.0 536; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 537; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 538; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 539; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 540; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 541; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1] 542; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3] 543; GFX12-GISEL-NEXT: s_wait_alu 0xfffe 544; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 545; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 546; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 547; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 548; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 549; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 550; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] 551; GFX12-GISEL-NEXT: s_endpgm 552main_body: 553 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 554 %gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid 555 %node_ptr = load i32, ptr %gep_node_ptr, align 4 556 %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid 557 %ray_extent = load float, ptr %gep_ray, align 4 558 %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0 559 %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1 560 %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2 561 %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0 562 %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1 563 %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2 564 %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0 565 %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1 566 %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2 567 %v = call <4 x i32> 
@llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, ptr undef
  ret void
}

; Compute-kernel test of the 64-bit node pointer / f32 direction variant
; (llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32).
;
; Only %ray_extent is a run-time value: it is loaded from %p_ray at the index
; given by the workitem id. The node pointer and the three ray vectors
; (origin = 0,1,2; dir = 3,4,5; inv_dir = 6,7,8) are constants.
;
; The node pointer 1111111111111 is 0x102_b36211c7. That is why the checks
; expect 0xb36211c7 for the low dword and "v_bfrev_b32 ..., 4.0" for the high
; dword: 4.0 is 0x40800000, and its bit-reversal is 0x102.
;
; The gfx10 checks expect all address operands in one contiguous range
; v[0:11]. The gfx11 and gfx12 checks expect a bracketed list of separate
; register ranges in the order node_ptr, ray_extent, origin, dir, inv_dir.
;
; The result is stored through an undef pointer, presumably only so that the
; intrinsic call has a user.
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_clause 0x1
; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000
; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000
; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x1
; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000
; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000
; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0
; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7
; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX12-SDAG: ; %bb.0: ; %main_body
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7
; GFX12-SDAG-NEXT: v_bfrev_b32_e32 v10, 4.0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
; GFX12-SDAG-NEXT: flat_load_b32 v11, v[0:1]
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x40c00000
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3]
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX12-GISEL: ; %bb.0: ; %main_body
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0
; GFX12-GISEL-NEXT: s_mov_b32 s4, 0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0xb36211c7
; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000
; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000
; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0
; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000
; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000
; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12
; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v10, 4.0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8
; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1]
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3]
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX12-GISEL-NEXT: s_endpgm
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
  %ray_extent = load float, ptr %gep_ray, align 4
  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
  %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, ptr undef
  ret void
}

; Compute-kernel test of the 64-bit node pointer / f16 direction variant
; (llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16); every expected
; image_bvh64_intersect_ray carries the a16 modifier.
;
; Only %ray_extent is a run-time value: it is loaded from %p_ray at the index
; given by the workitem id. The node pointer 1111111111110 is 0x102_b36211c6
; (low dword 0xb36211c6, high dword 0x102 produced by bit-reversing 4.0).
;
; The half constants are dir = 3,4,5 (0x4200, 0x4400, 0x4500) and
; inv_dir = 6,7,8 (0x4600, 0x4700, 0x4800). The checks expect them packed two
; per dword, with a target-dependent layout:
;   gfx10            0x44004200 0x46004500 0x48004700
;                    (the six halves in order: dir.xyz, then inv_dir.xyz)
;   gfx11, gfx12 SelectionDAG
;                    0x46004200 0x47004400 0x48004500
;                    (per component: dir in the low half, inv_dir in the high)
;   gfx12 GlobalISel 0x42004600 0x44004700 0x45004800
;                    (per component: inv_dir in the low half, dir in the high)
; NOTE(review): the gfx12 GlobalISel packing has the halves swapped relative
; to the gfx12 SelectionDAG packing for the same IR. These lines are
; autogenerated from compiler output, so confirm against the ISA documentation
; which order is intended before relying on either.
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_clause 0x1
; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x1
; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200
; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500
; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0
; GFX11-NEXT: v_mov_b32_e32 v5, 2.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX12-SDAG: ; %bb.0: ; %main_body
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x48004500
; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 1.0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v6, 0xb36211c6
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-SDAG-NEXT: v_bfrev_b32_e32 v7, 4.0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v5, 2.0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
; GFX12-SDAG-NEXT: flat_load_b32 v8, v[0:1]
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x47004400
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x46004200
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX12-GISEL: ; %bb.0: ; %main_body
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0
; GFX12-GISEL-NEXT: s_mov_b32 s4, 0
; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600
; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800
; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, 0xb36211c6
; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v7, 4.0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s8
; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1]
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX12-GISEL-NEXT: s_endpgm
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
  %ray_extent = load float, ptr %gep_ray, align 4
  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
  %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
  %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
  %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
  %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
  %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, ptr undef
  ret void
}

; Workitem id intrinsic; the *_nsa_reassign kernels use it to index their
; input pointers.
declare i32 @llvm.amdgcn.workitem.id.x()