; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s

; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600

; sitofp of an i64 kernel argument to half, stored to %out.
define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 {
; GFX6-LABEL: s_sint_to_fp_i64_to_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_flbit_i32 s0, s3
; GFX6-NEXT:    s_xor_b32 s1, s2, s3
; GFX6-NEXT:    s_add_i32 s0, s0, -1
; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
; GFX6-NEXT:    s_add_i32 s1, s1, 32
; GFX6-NEXT:    s_min_u32 s8, s0, s1
; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
; GFX6-NEXT:    s_min_u32 s0, s0, 1
; GFX6-NEXT:    s_or_b32 s0, s1, s0
; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s0
; GFX6-NEXT:    s_sub_i32 s0, 32, s8
; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_sint_to_fp_i64_to_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_xor_b32 s5, s2, s3
; GFX8-NEXT:    s_flbit_i32 s4, s3
; GFX8-NEXT:    s_ashr_i32 s5, s5, 31
; GFX8-NEXT:    s_add_i32 s4, s4, -1
; GFX8-NEXT:    s_add_i32 s5, s5, 32
; GFX8-NEXT:    s_min_u32 s4, s4, s5
; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
; GFX8-NEXT:    s_min_u32 s2, s2, 1
; GFX8-NEXT:    s_or_b32 s2, s3, s2
; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, s2
; GFX8-NEXT:    s_sub_i32 s2, 32, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_ldexp_f32 v0, v0, s2
; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: s_sint_to_fp_i64_to_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s4, s2, s3
; GFX11-NEXT:    s_cls_i32 s5, s3
; GFX11-NEXT:    s_ashr_i32 s4, s4, 31
; GFX11-NEXT:    s_add_i32 s5, s5, -1
; GFX11-NEXT:    s_add_i32 s4, s4, 32
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_min_u32 s4, s5, s4
; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_min_u32 s2, s2, 1
; GFX11-NEXT:    s_or_b32 s2, s3, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
; GFX11-NEXT:    s_sub_i32 s2, 32, s4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = sitofp i64 %in to half
  store half %result, ptr addrspace(1) %out
  ret void
}

; sitofp to half of an i64 loaded from %in at the workitem-id index; the result
; is stored to %out at the same index.
define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: v_sint_to_fp_i64_to_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX6-NEXT:    v_mov_b32_e32 v2, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
; GFX6-NEXT:    v_ffbh_i32_e32 v5, v4
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, -1, v5
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
; GFX6-NEXT:    v_min_u32_e32 v0, v5, v0
; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, v3
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    buffer_store_short v0, v[1:2], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_sint_to_fp_i64_to_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_xor_b32_e32 v3, v1, v2
; GFX8-NEXT:    v_ffbh_i32_e32 v4, v2
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 32, v3
; GFX8-NEXT:    v_min_u32_e32 v3, v4, v3
; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v3, v[1:2]
; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v3
; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_ldexp_f32 v1, v1, v3
; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v1
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v3
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: v_sint_to_fp_i64_to_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_xor_b32_e32 v3, v0, v1
; GFX11-NEXT:    v_cls_i32_e32 v4, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
; GFX11-NEXT:    v_add_nc_u32_e32 v4, -1, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_nc_u32_e32 v3, 32, v3
; GFX11-NEXT:    v_min_u32_e32 v3, v4, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
  %val = load i64, ptr addrspace(1) %in.gep
  %result = sitofp i64 %val to half
  store half %result, ptr addrspace(1) %out.gep
  ret void
}

; sitofp of an i64 kernel argument to float, stored to %out.
define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 {
; GFX6-LABEL: s_sint_to_fp_i64_to_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_flbit_i32 s0, s3
; GFX6-NEXT:    s_xor_b32 s1, s2, s3
; GFX6-NEXT:    s_add_i32 s0, s0, -1
; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
; GFX6-NEXT:    s_add_i32 s1, s1, 32
; GFX6-NEXT:    s_min_u32 s8, s0, s1
; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
; GFX6-NEXT:    s_min_u32 s0, s0, 1
; GFX6-NEXT:    s_or_b32 s0, s1, s0
; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s0
; GFX6-NEXT:    s_sub_i32 s0, 32, s8
; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_sint_to_fp_i64_to_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_xor_b32 s5, s2, s3
; GFX8-NEXT:    s_flbit_i32 s4, s3
; GFX8-NEXT:    s_ashr_i32 s5, s5, 31
; GFX8-NEXT:    s_add_i32 s4, s4, -1
; GFX8-NEXT:    s_add_i32 s5, s5, 32
; GFX8-NEXT:    s_min_u32 s4, s4, s5
; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
; GFX8-NEXT:    s_min_u32 s2, s2, 1
; GFX8-NEXT:    s_or_b32 s2, s3, s2
; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    s_sub_i32 s0, 32, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_ldexp_f32 v2, v2, s0
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: s_sint_to_fp_i64_to_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s4, s2, s3
; GFX11-NEXT:    s_cls_i32 s5, s3
; GFX11-NEXT:    s_ashr_i32 s4, s4, 31
; GFX11-NEXT:    s_add_i32 s5, s5, -1
; GFX11-NEXT:    s_add_i32 s4, s4, 32
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_min_u32 s4, s5, s4
; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_min_u32 s2, s2, 1
; GFX11-NEXT:    s_or_b32 s2, s3, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
; GFX11-NEXT:    s_sub_i32 s2, 32, s4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = sitofp i64 %in to float
  store float %result, ptr addrspace(1) %out
  ret void
}

; sitofp to float of an i64 loaded from %in at the workitem-id index; the
; result is stored to %out at the same index.
define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: v_sint_to_fp_i64_to_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX6-NEXT:    v_mov_b32_e32 v2, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
; GFX6-NEXT:    v_ffbh_i32_e32 v5, v4
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, -1, v5
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
; GFX6-NEXT:    v_min_u32_e32 v0, v5, v0
; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, v3
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
; GFX6-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_sint_to_fp_i64_to_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v2
; GFX8-NEXT:    v_ffbh_i32_e32 v4, v2
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT:    v_min_u32_e32 v4, v4, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[1:2]
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    v_cvt_f32_i32_e32 v5, v0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
; GFX8-NEXT:    v_ldexp_f32 v2, v5, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: v_sint_to_fp_i64_to_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_xor_b32_e32 v3, v0, v1
; GFX11-NEXT:    v_cls_i32_e32 v4, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
; GFX11-NEXT:    v_add_nc_u32_e32 v4, -1, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_nc_u32_e32 v3, 32, v3
; GFX11-NEXT:    v_min_u32_e32 v3, v4, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %val = load i64, ptr addrspace(1) %in.gep
  %result = sitofp i64 %val to float
  store float %result, ptr addrspace(1) %out.gep
  ret void
}

; sitofp of a <2 x i64> kernel argument to <2 x float>, stored to %out.
define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{
; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_flbit_i32 s4, s11
; GFX6-NEXT:    s_xor_b32 s5, s10, s11
; GFX6-NEXT:    s_flbit_i32 s6, s9
; GFX6-NEXT:    s_xor_b32 s7, s8, s9
; GFX6-NEXT:    s_add_i32 s4, s4, -1
; GFX6-NEXT:    s_ashr_i32 s5, s5, 31
; GFX6-NEXT:    s_add_i32 s6, s6, -1
; GFX6-NEXT:    s_ashr_i32 s7, s7, 31
; GFX6-NEXT:    s_add_i32 s5, s5, 32
; GFX6-NEXT:    s_add_i32 s7, s7, 32
; GFX6-NEXT:    s_min_u32 s12, s4, s5
; GFX6-NEXT:    s_min_u32 s13, s6, s7
; GFX6-NEXT:    s_lshl_b64 s[4:5], s[10:11], s12
; GFX6-NEXT:    s_sub_i32 s10, 32, s12
; GFX6-NEXT:    s_lshl_b64 s[6:7], s[8:9], s13
; GFX6-NEXT:    s_sub_i32 s8, 32, s13
; GFX6-NEXT:    s_min_u32 s4, s4, 1
; GFX6-NEXT:    s_min_u32 s6, s6, 1
; GFX6-NEXT:    s_or_b32 s4, s5, s4
; GFX6-NEXT:    s_or_b32 s5, s7, s6
; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s5
; GFX6-NEXT:    v_ldexp_f32_e64 v1, v0, s10
; GFX6-NEXT:    v_ldexp_f32_e64 v0, v2, s8
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_xor_b32 s7, s2, s3
; GFX8-NEXT:    s_flbit_i32 s6, s3
; GFX8-NEXT:    s_ashr_i32 s7, s7, 31
; GFX8-NEXT:    s_add_i32 s6, s6, -1
; GFX8-NEXT:    s_add_i32 s7, s7, 32
; GFX8-NEXT:    s_min_u32 s6, s6, s7
; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
; GFX8-NEXT:    s_min_u32 s2, s2, 1
; GFX8-NEXT:    s_or_b32 s2, s3, s2
; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, s2
; GFX8-NEXT:    s_xor_b32 s2, s0, s1
; GFX8-NEXT:    s_flbit_i32 s8, s1
; GFX8-NEXT:    s_ashr_i32 s2, s2, 31
; GFX8-NEXT:    s_add_i32 s8, s8, -1
; GFX8-NEXT:    s_add_i32 s2, s2, 32
; GFX8-NEXT:    s_min_u32 s2, s8, s2
; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
; GFX8-NEXT:    s_min_u32 s0, s0, 1
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, s0
; GFX8-NEXT:    s_sub_i32 s0, 32, s6
; GFX8-NEXT:    v_ldexp_f32 v1, v0, s0
; GFX8-NEXT:    s_sub_i32 s0, 32, s2
; GFX8-NEXT:    v_ldexp_f32 v0, v2, s0
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v3, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s7, s2, s3
; GFX11-NEXT:    s_xor_b32 s9, s0, s1
; GFX11-NEXT:    s_cls_i32 s6, s3
; GFX11-NEXT:    s_cls_i32 s8, s1
; GFX11-NEXT:    s_ashr_i32 s7, s7, 31
; GFX11-NEXT:    s_ashr_i32 s9, s9, 31
; GFX11-NEXT:    s_add_i32 s6, s6, -1
; GFX11-NEXT:    s_add_i32 s8, s8, -1
; GFX11-NEXT:    s_add_i32 s7, s7, 32
; GFX11-NEXT:    s_add_i32 s9, s9, 32
; GFX11-NEXT:    s_min_u32 s6, s6, s7
; GFX11-NEXT:    s_min_u32 s7, s8, s9
; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
; GFX11-NEXT:    s_min_u32 s2, s2, 1
; GFX11-NEXT:    s_min_u32 s0, s0, 1
; GFX11-NEXT:    s_or_b32 s2, s3, s2
; GFX11-NEXT:    s_or_b32 s0, s1, s0
; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, s0
; GFX11-NEXT:    s_sub_i32 s0, 32, s6
; GFX11-NEXT:    s_sub_i32 s1, 32, s7
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_ldexp_f32 v1, v0, s0
; GFX11-NEXT:    v_ldexp_f32 v0, v2, s1
; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
; GFX11-NEXT:    s_endpgm
  %result = sitofp <2 x i64> %in to <2 x float>
  store <2 x float> %result, ptr addrspace(1) %out
  ret void
}

; sitofp to <4 x float> of a <4 x i64> loaded from %in at the workitem-id
; index; the result is stored to %out at the same index.
define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX6-NEXT:    v_mov_b32_e32 v9, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
; GFX6-NEXT:    buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
; GFX6-NEXT:    v_lshlrev_b32_e32 v10, 4, v0
; GFX6-NEXT:    v_mov_b32_e32 v11, v9
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(1)
; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
; GFX6-NEXT:    v_ffbh_i32_e32 v9, v4
; GFX6-NEXT:    v_xor_b32_e32 v12, v1, v2
; GFX6-NEXT:    v_ffbh_i32_e32 v13, v2
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_xor_b32_e32 v14, v7, v8
; GFX6-NEXT:    v_ffbh_i32_e32 v15, v8
; GFX6-NEXT:    v_xor_b32_e32 v16, v5, v6
; GFX6-NEXT:    v_ffbh_i32_e32 v17, v6
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
; GFX6-NEXT:    v_add_i32_e32 v9, vcc, -1, v9
; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v12
; GFX6-NEXT:    v_add_i32_e32 v13, vcc, -1, v13
; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
; GFX6-NEXT:    v_add_i32_e32 v15, vcc, -1, v15
; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
; GFX6-NEXT:    v_add_i32_e32 v17, vcc, -1, v17
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
; GFX6-NEXT:    v_add_i32_e32 v12, vcc, 32, v12
; GFX6-NEXT:    v_add_i32_e32 v14, vcc, 32, v14
; GFX6-NEXT:    v_add_i32_e32 v16, vcc, 32, v16
; GFX6-NEXT:    v_min_u32_e32 v0, v9, v0
; GFX6-NEXT:    v_min_u32_e32 v9, v13, v12
; GFX6-NEXT:    v_min_u32_e32 v12, v15, v14
; GFX6-NEXT:    v_min_u32_e32 v13, v17, v16
; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
; GFX6-NEXT:    v_min_u32_e32 v5, 1, v5
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    v_or_b32_e32 v1, v8, v7
; GFX6-NEXT:    v_or_b32_e32 v4, v6, v5
; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, v3
; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, v1
; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v4
; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
; GFX6-NEXT:    v_ldexp_f32_e32 v2, v0, v2
; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
; GFX6-NEXT:    v_ldexp_f32_e32 v0, v4, v12
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[10:11], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s2, v1
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[5:8], v[5:6]
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v10, s1
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_xor_b32_e32 v0, v3, v4
; GFX8-NEXT:    v_xor_b32_e32 v12, v1, v2
; GFX8-NEXT:    v_ffbh_i32_e32 v11, v4
; GFX8-NEXT:    v_ffbh_i32_e32 v13, v2
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_xor_b32_e32 v14, v7, v8
; GFX8-NEXT:    v_xor_b32_e32 v16, v5, v6
; GFX8-NEXT:    v_ffbh_i32_e32 v15, v8
; GFX8-NEXT:    v_ffbh_i32_e32 v17, v6
; GFX8-NEXT:    v_ashrrev_i32_e32 v12, 31, v12
; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, -1, v11
; GFX8-NEXT:    v_add_u32_e32 v13, vcc, -1, v13
; GFX8-NEXT:    v_add_u32_e32 v15, vcc, -1, v15
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, -1, v17
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 32, v12
; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 32, v14
; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 32, v16
; GFX8-NEXT:    v_min_u32_e32 v0, v11, v0
; GFX8-NEXT:    v_min_u32_e32 v11, v13, v12
; GFX8-NEXT:    v_min_u32_e32 v12, v15, v14
; GFX8-NEXT:    v_min_u32_e32 v13, v17, v16
; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v0, v[3:4]
; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v11, v[1:2]
; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v12, v[7:8]
; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v13, v[5:6]
; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    v_or_b32_e32 v1, v8, v7
; GFX8-NEXT:    v_or_b32_e32 v4, v6, v5
; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT:    v_cvt_f32_i32_e32 v5, v1
; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v11
; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v12
; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v13
; GFX8-NEXT:    v_ldexp_f32 v1, v3, v14
; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
; GFX8-NEXT:    v_ldexp_f32 v3, v5, v11
; GFX8-NEXT:    v_ldexp_f32 v2, v4, v12
; GFX8-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_xor_b32_e32 v9, v2, v3
; GFX11-NEXT:    v_xor_b32_e32 v11, v0, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_xor_b32_e32 v13, v6, v7
; GFX11-NEXT:    v_xor_b32_e32 v15, v4, v5
; GFX11-NEXT:    v_cls_i32_e32 v10, v3
; GFX11-NEXT:    v_cls_i32_e32 v12, v1
; GFX11-NEXT:    v_cls_i32_e32 v14, v7
; GFX11-NEXT:    v_cls_i32_e32 v16, v5
; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
; GFX11-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
; GFX11-NEXT:    v_ashrrev_i32_e32 v13, 31, v13
; GFX11-NEXT:    v_ashrrev_i32_e32 v15, 31, v15
; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
; GFX11-NEXT:    v_add_nc_u32_e32 v12, -1, v12
; GFX11-NEXT:    v_add_nc_u32_e32 v14, -1, v14
; GFX11-NEXT:    v_add_nc_u32_e32 v16, -1, v16
; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
; GFX11-NEXT:    v_add_nc_u32_e32 v11, 32, v11
; GFX11-NEXT:    v_add_nc_u32_e32 v13, 32, v13
; GFX11-NEXT:    v_add_nc_u32_e32 v15, 32, v15
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_min_u32_e32 v9, v10, v9
; GFX11-NEXT:    v_min_u32_e32 v10, v12, v11
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_min_u32_e32 v11, v14, v13
; GFX11-NEXT:    v_min_u32_e32 v12, v16, v15
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
; GFX11-NEXT:    v_cvt_f32_i32_e32 v5, v3
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 4, v8
; GFX11-NEXT:    v_ldexp_f32 v3, v2, v9
; GFX11-NEXT:    v_ldexp_f32 v2, v0, v10
; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
; GFX11-NEXT:    v_ldexp_f32 v0, v5, v4
; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
  %out.gep = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid
  %value = load <4 x i64>, ptr addrspace(1) %in.gep
  %result = sitofp <4 x i64> %value to <4 x float>
  store <4 x float> %result, ptr addrspace(1) %out.gep
  ret void
}

; sitofp of a <2 x i64> kernel argument to <2 x half>, stored to %out.
define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{
; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_flbit_i32 s4, s11
; GFX6-NEXT:    s_xor_b32 s5, s10, s11
; GFX6-NEXT:    s_flbit_i32 s6, s9
; GFX6-NEXT:    s_xor_b32 s7, s8, s9
; GFX6-NEXT:    s_add_i32 s4, s4, -1
; GFX6-NEXT:    s_ashr_i32 s5, s5, 31
; GFX6-NEXT:    s_add_i32 s6, s6, -1
; GFX6-NEXT:    s_ashr_i32 s7, s7, 31
; GFX6-NEXT:    s_add_i32 s5, s5, 32
; GFX6-NEXT:    s_add_i32 s7, s7, 32
; GFX6-NEXT:    s_min_u32 s12, s4, s5
; GFX6-NEXT:    s_min_u32 s13, s6, s7
; GFX6-NEXT:    s_lshl_b64 s[4:5], s[10:11], s12
; GFX6-NEXT:    s_sub_i32 s10, 32, s12
; GFX6-NEXT:    s_lshl_b64 s[6:7], s[8:9], s13
; GFX6-NEXT:    s_sub_i32 s8, 32, s13
; GFX6-NEXT:    s_min_u32 s4, s4, 1
; GFX6-NEXT:    s_min_u32 s6, s6, 1
; GFX6-NEXT:    s_or_b32 s4, s5, s4
; GFX6-NEXT:    s_or_b32 s5, s7, s6
; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s10
; GFX6-NEXT:    v_ldexp_f32_e64 v1, v1, s8
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_xor_b32 s7, s2, s3
; GFX8-NEXT:    s_flbit_i32 s6, s3
; GFX8-NEXT:    s_ashr_i32 s7, s7, 31
; GFX8-NEXT:    s_add_i32 s6, s6, -1
; GFX8-NEXT:    s_add_i32 s7, s7, 32
; GFX8-NEXT:    s_min_u32 s6, s6, s7
; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
; GFX8-NEXT:    s_min_u32 s2, s2, 1
; GFX8-NEXT:    s_or_b32 s2, s3, s2
; GFX8-NEXT:    s_xor_b32 s3, s0, s1
; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, s2
; GFX8-NEXT:    s_flbit_i32 s2, s1
; GFX8-NEXT:    s_ashr_i32 s3, s3, 31
; GFX8-NEXT:    s_add_i32 s2, s2, -1
; GFX8-NEXT:    s_add_i32 s3, s3, 32
; GFX8-NEXT:    s_min_u32 s2, s2, s3
; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
; GFX8-NEXT:    s_min_u32 s0, s0, 1
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, s0
; GFX8-NEXT:    s_sub_i32 s6, 32, s6
; GFX8-NEXT:    s_sub_i32 s0, 32, s2
; GFX8-NEXT:    v_ldexp_f32 v0, v0, s6
; GFX8-NEXT:    v_ldexp_f32 v1, v1, s0
; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s7, s2, s3
; GFX11-NEXT:    s_xor_b32 s9, s0, s1
; GFX11-NEXT:    s_cls_i32 s6, s3
; GFX11-NEXT:    s_cls_i32 s8, s1
; GFX11-NEXT:    s_ashr_i32 s7, s7, 31
; GFX11-NEXT:    s_ashr_i32 s9, s9, 31
; GFX11-NEXT:    s_add_i32 s6, s6, -1
; GFX11-NEXT:    s_add_i32 s8, s8, -1
; GFX11-NEXT:    s_add_i32 s7, s7, 32
; GFX11-NEXT:    s_add_i32 s9, s9, 32
; GFX11-NEXT:    s_min_u32 s6, s6, s7
; GFX11-NEXT:    s_min_u32 s7, s8, s9
; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
; GFX11-NEXT:    s_min_u32 s2, s2, 1
; GFX11-NEXT:    s_min_u32 s0, s0, 1
; GFX11-NEXT:    s_or_b32 s2, s3, s2
; GFX11-NEXT:    s_or_b32 s0, s1, s0
; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, s0
; GFX11-NEXT:    s_sub_i32 s0, 32, s6
; GFX11-NEXT:    s_sub_i32 s1, 32, s7
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_ldexp_f32 v0, v0, s0
; GFX11-NEXT:    v_ldexp_f32 v1, v1, s1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT:    s_endpgm
  %result = sitofp <2 x i64> %in to <2 x half>
  store <2 x half> %result, ptr addrspace(1) %out
  ret void
}

794define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 795; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f16: 796; GFX6: ; %bb.0: 797; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 798; GFX6-NEXT: s_mov_b32 s7, 0xf000 799; GFX6-NEXT: s_mov_b32 s6, 0 800; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 801; GFX6-NEXT: v_mov_b32_e32 v9, 0 802; GFX6-NEXT: s_waitcnt lgkmcnt(0) 803; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 804; GFX6-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16 805; GFX6-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64 806; GFX6-NEXT: v_lshlrev_b32_e32 v10, 3, v0 807; GFX6-NEXT: v_mov_b32_e32 v11, v9 808; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 809; GFX6-NEXT: s_waitcnt vmcnt(1) 810; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4 811; GFX6-NEXT: v_ffbh_i32_e32 v9, v4 812; GFX6-NEXT: v_xor_b32_e32 
v12, v1, v2 813; GFX6-NEXT: v_ffbh_i32_e32 v13, v2 814; GFX6-NEXT: s_waitcnt vmcnt(0) 815; GFX6-NEXT: v_xor_b32_e32 v14, v7, v8 816; GFX6-NEXT: v_ffbh_i32_e32 v15, v8 817; GFX6-NEXT: v_xor_b32_e32 v16, v5, v6 818; GFX6-NEXT: v_ffbh_i32_e32 v17, v6 819; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0 820; GFX6-NEXT: v_add_i32_e32 v9, vcc, -1, v9 821; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v12 822; GFX6-NEXT: v_add_i32_e32 v13, vcc, -1, v13 823; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v14 824; GFX6-NEXT: v_add_i32_e32 v15, vcc, -1, v15 825; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v16 826; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v17 827; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0 828; GFX6-NEXT: v_add_i32_e32 v12, vcc, 32, v12 829; GFX6-NEXT: v_add_i32_e32 v14, vcc, 32, v14 830; GFX6-NEXT: v_add_i32_e32 v16, vcc, 32, v16 831; GFX6-NEXT: v_min_u32_e32 v0, v9, v0 832; GFX6-NEXT: v_min_u32_e32 v9, v13, v12 833; GFX6-NEXT: v_min_u32_e32 v12, v15, v14 834; GFX6-NEXT: v_min_u32_e32 v13, v17, v16 835; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0 836; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0 837; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9 838; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 32, v9 839; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v12 840; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12 841; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13 842; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13 843; GFX6-NEXT: v_min_u32_e32 v3, 1, v3 844; GFX6-NEXT: v_min_u32_e32 v0, 1, v0 845; GFX6-NEXT: v_min_u32_e32 v7, 1, v7 846; GFX6-NEXT: v_min_u32_e32 v5, 1, v5 847; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 848; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 849; GFX6-NEXT: v_or_b32_e32 v1, v8, v7 850; GFX6-NEXT: v_or_b32_e32 v4, v6, v5 851; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3 852; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0 853; GFX6-NEXT: v_cvt_f32_i32_e32 v1, v1 854; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v4 855; GFX6-NEXT: v_ldexp_f32_e32 v3, v3, v14 856; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 857; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v9 858; GFX6-NEXT: 
v_ldexp_f32_e32 v2, v4, v12 859; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 860; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 861; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 862; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 863; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 864; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1 865; GFX6-NEXT: v_or_b32_e32 v1, v0, v3 866; GFX6-NEXT: v_or_b32_e32 v0, v2, v4 867; GFX6-NEXT: buffer_store_dwordx2 v[0:1], v[10:11], s[0:3], 0 addr64 868; GFX6-NEXT: s_endpgm 869; 870; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16: 871; GFX8: ; %bb.0: 872; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 873; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 874; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 875; GFX8-NEXT: s_waitcnt lgkmcnt(0) 876; GFX8-NEXT: v_mov_b32_e32 v2, s3 877; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1 878; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc 879; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6] 880; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5 881; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc 882; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6] 883; GFX8-NEXT: v_mov_b32_e32 v10, s1 884; GFX8-NEXT: s_waitcnt vmcnt(1) 885; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4 886; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2 887; GFX8-NEXT: v_ffbh_i32_e32 v11, v4 888; GFX8-NEXT: v_ffbh_i32_e32 v13, v2 889; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0 890; GFX8-NEXT: s_waitcnt vmcnt(0) 891; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8 892; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6 893; GFX8-NEXT: v_ffbh_i32_e32 v15, v8 894; GFX8-NEXT: v_ffbh_i32_e32 v17, v6 895; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12 896; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14 897; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16 898; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11 899; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13 900; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15 901; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17 902; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 903; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12 904; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14 905; 
GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16 906; GFX8-NEXT: v_min_u32_e32 v0, v11, v0 907; GFX8-NEXT: v_min_u32_e32 v11, v13, v12 908; GFX8-NEXT: v_min_u32_e32 v12, v15, v14 909; GFX8-NEXT: v_min_u32_e32 v13, v17, v16 910; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4] 911; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 912; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2] 913; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8] 914; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6] 915; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 916; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 917; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 918; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 919; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 920; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 921; GFX8-NEXT: v_or_b32_e32 v1, v8, v7 922; GFX8-NEXT: v_or_b32_e32 v4, v6, v5 923; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 924; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 925; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 926; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 927; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11 928; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12 929; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13 930; GFX8-NEXT: v_ldexp_f32 v3, v3, v14 931; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 932; GFX8-NEXT: v_ldexp_f32 v1, v1, v11 933; GFX8-NEXT: v_ldexp_f32 v2, v4, v12 934; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 935; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0 936; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 937; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2 938; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9 939; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc 940; GFX8-NEXT: v_or_b32_e32 v2, v4, v3 941; GFX8-NEXT: v_or_b32_e32 v3, v6, v5 942; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 943; GFX8-NEXT: s_endpgm 944; 945; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16: 946; GFX11: ; %bb.0: 947; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 948; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 949; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 950; 
GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 951; GFX11-NEXT: s_waitcnt lgkmcnt(0) 952; GFX11-NEXT: s_clause 0x1 953; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 954; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] 955; GFX11-NEXT: s_waitcnt vmcnt(1) 956; GFX11-NEXT: v_xor_b32_e32 v9, v2, v3 957; GFX11-NEXT: v_xor_b32_e32 v11, v0, v1 958; GFX11-NEXT: s_waitcnt vmcnt(0) 959; GFX11-NEXT: v_xor_b32_e32 v13, v6, v7 960; GFX11-NEXT: v_xor_b32_e32 v15, v4, v5 961; GFX11-NEXT: v_cls_i32_e32 v10, v3 962; GFX11-NEXT: v_cls_i32_e32 v12, v1 963; GFX11-NEXT: v_cls_i32_e32 v14, v7 964; GFX11-NEXT: v_cls_i32_e32 v16, v5 965; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9 966; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11 967; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13 968; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v15 969; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10 970; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12 971; GFX11-NEXT: v_add_nc_u32_e32 v14, -1, v14 972; GFX11-NEXT: v_add_nc_u32_e32 v16, -1, v16 973; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9 974; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11 975; GFX11-NEXT: v_add_nc_u32_e32 v13, 32, v13 976; GFX11-NEXT: v_add_nc_u32_e32 v15, 32, v15 977; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 978; GFX11-NEXT: v_min_u32_e32 v9, v10, v9 979; GFX11-NEXT: v_min_u32_e32 v10, v12, v11 980; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 981; GFX11-NEXT: v_min_u32_e32 v11, v14, v13 982; GFX11-NEXT: v_min_u32_e32 v12, v16, v15 983; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 984; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] 985; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] 986; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 987; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] 988; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] 989; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 990; GFX11-NEXT: v_sub_nc_u32_e32 
v10, 32, v10 991; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 992; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 993; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 994; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 995; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 996; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 997; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 998; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 999; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 1000; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 1001; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 1002; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 1003; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 1004; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 1005; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v8 1006; GFX11-NEXT: v_ldexp_f32 v2, v2, v9 1007; GFX11-NEXT: v_ldexp_f32 v0, v0, v10 1008; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 1009; GFX11-NEXT: v_ldexp_f32 v3, v3, v4 1010; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1011; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 1012; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 1013; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1014; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v1 1015; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 1016; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 1017; GFX11-NEXT: v_pack_b32_f16 v1, v0, v2 1018; GFX11-NEXT: v_pack_b32_f16 v0, v3, v4 1019; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] 1020; GFX11-NEXT: s_endpgm 1021 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1022 %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid 1023 %out.gep = getelementptr <4 x half>, ptr addrspace(1) %out, i32 %tid 1024 %value = load <4 x i64>, ptr addrspace(1) %in.gep 1025 %result = sitofp <4 x i64> %value to <4 x half> 1026 store <4 x half> %result, ptr addrspace(1) %out.gep 1027 ret void 1028} 1029 1030declare i32 @llvm.amdgcn.workitem.id.x() #1 1031 1032attributes #0 = { nounwind } 1033attributes #1 = { nounwind readnone } 1034