; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s

; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600

; uitofp i64 -> half; the i64 operand is passed directly as a kernel argument.
define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 {
; GFX6-LABEL: s_uint_to_fp_i64_to_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_flbit_i32_b32 s0, s3
; GFX6-NEXT:    s_min_u32 s8, s0, 32
; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
; GFX6-NEXT:    s_min_u32 s0, s0, 1
; GFX6-NEXT:    s_or_b32 s0, s1, s0
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
; GFX6-NEXT:    s_sub_i32 s0, 32, s8
; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_uint_to_fp_i64_to_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_flbit_i32_b32 s4, s3
; GFX8-NEXT:    s_min_u32 s4, s4, 32
; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
; GFX8-NEXT:    s_min_u32 s2, s2, 1
; GFX8-NEXT:    s_or_b32 s2, s3, s2
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX8-NEXT:    s_sub_i32 s2, 32, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_ldexp_f32 v0, v0, s2
; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: s_uint_to_fp_i64_to_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clz_i32_u32 s4, s3
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_min_u32 s4, s4, 32
; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_min_u32 s2, s2, 1
; GFX11-NEXT:    s_or_b32 s2, s3, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX11-NEXT:    s_sub_i32 s2, 32, s4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = uitofp i64 %in to half
  store half %result, ptr addrspace(1) %out
  ret void
}

; uitofp i64 -> half; the i64 operand is loaded from %in at index workitem.id.x
; and the result is stored to %out at the same index.
define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: v_uint_to_fp_i64_to_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX6-NEXT:    v_mov_b32_e32 v2, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    buffer_store_short v0, v[1:2], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_uint_to_fp_i64_to_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_ffbh_u32_e32 v3, v2
; GFX8-NEXT:    v_min_u32_e32 v3, 32, v3
; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v3, v[1:2]
; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v3
; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_ldexp_f32 v1, v1, v3
; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v1
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v3
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: v_uint_to_fp_i64_to_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_clz_i32_u32_e32 v3, v1
; GFX11-NEXT:    v_min_u32_e32 v3, 32, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
  %val = load i64, ptr addrspace(1) %in.gep
  %result = uitofp i64 %val to half
  store half %result, ptr addrspace(1) %out.gep
  ret void
}

; uitofp i64 -> float; the i64 operand is passed directly as a kernel argument.
define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 {
; GFX6-LABEL: s_uint_to_fp_i64_to_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_flbit_i32_b32 s0, s3
; GFX6-NEXT:    s_min_u32 s8, s0, 32
; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
; GFX6-NEXT:    s_min_u32 s0, s0, 1
; GFX6-NEXT:    s_or_b32 s0, s1, s0
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
; GFX6-NEXT:    s_sub_i32 s0, 32, s8
; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_uint_to_fp_i64_to_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_flbit_i32_b32 s4, s3
; GFX8-NEXT:    s_min_u32 s4, s4, 32
; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
; GFX8-NEXT:    s_min_u32 s2, s2, 1
; GFX8-NEXT:    s_or_b32 s2, s3, s2
; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    s_sub_i32 s0, 32, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_ldexp_f32 v2, v2, s0
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: s_uint_to_fp_i64_to_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clz_i32_u32 s4, s3
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_min_u32 s4, s4, 32
; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_min_u32 s2, s2, 1
; GFX11-NEXT:    s_or_b32 s2, s3, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX11-NEXT:    s_sub_i32 s2, 32, s4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = uitofp i64 %in to float
  store float %result, ptr addrspace(1) %out
  ret void
}

; uitofp i64 -> float; the i64 operand is loaded from %in at index workitem.id.x
; and the result is stored to %out at the same index.
define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: v_uint_to_fp_i64_to_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX6-NEXT:    v_mov_b32_e32 v2, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
; GFX6-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_uint_to_fp_i64_to_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_ffbh_u32_e32 v0, v2
; GFX8-NEXT:    v_min_u32_e32 v4, 32, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[1:2]
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, v0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
; GFX8-NEXT:    v_ldexp_f32 v2, v5, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: v_uint_to_fp_i64_to_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_clz_i32_u32_e32 v3, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_min_u32_e32 v3, 32, v3
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %val = load i64, ptr addrspace(1) %in.gep
  %result = uitofp i64 %val to float
  store float %result, ptr addrspace(1) %out.gep
  ret void
}

; uitofp <2 x i64> -> <2 x float>; the vector operand is passed directly as a
; kernel argument.
define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0 {
; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_flbit_i32_b32 s8, s3
; GFX6-NEXT:    s_flbit_i32_b32 s9, s1
; GFX6-NEXT:    s_min_u32 s8, s8, 32
; GFX6-NEXT:    s_min_u32 s9, s9, 32
; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
; GFX6-NEXT:    s_sub_i32 s8, 32, s8
; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
; GFX6-NEXT:    s_sub_i32 s9, 32, s9
; GFX6-NEXT:    s_min_u32 s2, s2, 1
; GFX6-NEXT:    s_min_u32 s0, s0, 1
; GFX6-NEXT:    s_or_b32 s2, s3, s2
; GFX6-NEXT:    s_or_b32 s0, s1, s0
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s0
; GFX6-NEXT:    v_ldexp_f32_e64 v1, v0, s8
; GFX6-NEXT:    v_ldexp_f32_e64 v0, v2, s9
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_flbit_i32_b32 s6, s3
; GFX8-NEXT:    s_flbit_i32_b32 s7, s1
; GFX8-NEXT:    s_min_u32 s6, s6, 32
; GFX8-NEXT:    s_min_u32 s7, s7, 32
; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
; GFX8-NEXT:    s_min_u32 s2, s2, 1
; GFX8-NEXT:    s_or_b32 s2, s3, s2
; GFX8-NEXT:    s_min_u32 s0, s0, 1
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s0
; GFX8-NEXT:    s_sub_i32 s0, 32, s6
; GFX8-NEXT:    v_ldexp_f32 v1, v0, s0
; GFX8-NEXT:    s_sub_i32 s0, 32, s7
; GFX8-NEXT:    v_ldexp_f32 v0, v2, s0
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v3, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clz_i32_u32 s6, s3
; GFX11-NEXT:    s_clz_i32_u32 s7, s1
; GFX11-NEXT:    s_min_u32 s6, s6, 32
; GFX11-NEXT:    s_min_u32 s7, s7, 32
; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
; GFX11-NEXT:    s_min_u32 s2, s2, 1
; GFX11-NEXT:    s_min_u32 s0, s0, 1
; GFX11-NEXT:    s_or_b32 s2, s3, s2
; GFX11-NEXT:    s_or_b32 s0, s1, s0
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, s0
; GFX11-NEXT:    s_sub_i32 s0, 32, s6
; GFX11-NEXT:    s_sub_i32 s1, 32, s7
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_ldexp_f32 v1, v0, s0
; GFX11-NEXT:    v_ldexp_f32 v0, v2, s1
; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
; GFX11-NEXT:    s_endpgm
  %result = uitofp <2 x i64> %in to <2 x float>
  store <2 x float> %result, ptr addrspace(1) %out
  ret void
}

; uitofp <4 x i64> -> <4 x float>; the vector operand is loaded from %in at
; index workitem.id.x and the result is stored to %out at the same index.
define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX6-NEXT:    v_mov_b32_e32 v9, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
; GFX6-NEXT:    buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
; GFX6-NEXT:    v_lshlrev_b32_e32 v10, 4, v0
; GFX6-NEXT:    v_mov_b32_e32 v11, v9
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(1)
; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
; GFX6-NEXT:    v_ffbh_u32_e32 v9, v2
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_ffbh_u32_e32 v12, v8
; GFX6-NEXT:    v_ffbh_u32_e32 v13, v6
; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX6-NEXT:    v_min_u32_e32 v9, 32, v9
; GFX6-NEXT:    v_min_u32_e32 v12, 32, v12
; GFX6-NEXT:    v_min_u32_e32 v13, 32, v13
; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
; GFX6-NEXT:    v_min_u32_e32 v5, 1, v5
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    v_or_b32_e32 v1, v8, v7
; GFX6-NEXT:    v_or_b32_e32 v4, v6, v5
; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v4
; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
; GFX6-NEXT:    v_ldexp_f32_e32 v2, v0, v2
; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
; GFX6-NEXT:    v_ldexp_f32_e32 v0, v4, v12
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[10:11], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s2, v1
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[5:8], v[5:6]
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v10, s1
; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_ffbh_u32_e32 v0, v4
; GFX8-NEXT:    v_ffbh_u32_e32 v11, v2
; GFX8-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX8-NEXT:    v_min_u32_e32 v11, 32, v11
; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v0, v[3:4]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_ffbh_u32_e32 v12, v8
; GFX8-NEXT:    v_ffbh_u32_e32 v13, v6
; GFX8-NEXT:    v_min_u32_e32 v12, 32, v12
; GFX8-NEXT:    v_min_u32_e32 v13, 32, v13
; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v11, v[1:2]
; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v12, v[7:8]
; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v13, v[5:6]
; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    v_or_b32_e32 v1, v8, v7
; GFX8-NEXT:    v_or_b32_e32 v4, v6, v5
; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, v1
; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v11
; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v12
; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v13
; GFX8-NEXT:    v_ldexp_f32 v1, v3, v14
; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
; GFX8-NEXT:    v_ldexp_f32 v3, v5, v11
; GFX8-NEXT:    v_ldexp_f32 v2, v4, v12
; GFX8-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v3
; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v7
; GFX11-NEXT:    v_clz_i32_u32_e32 v12, v5
; GFX11-NEXT:    v_min_u32_e32 v9, 32, v9
; GFX11-NEXT:    v_min_u32_e32 v10, 32, v10
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
; GFX11-NEXT:    v_min_u32_e32 v12, 32, v12
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT:    v_cvt_f32_u32_e32 v5, v3
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 4, v8
; GFX11-NEXT:    v_ldexp_f32 v3, v2, v9
; GFX11-NEXT:    v_ldexp_f32 v2, v0, v10
; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
; GFX11-NEXT:    v_ldexp_f32 v0, v5, v4
; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
  %out.gep = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid
  %value = load <4 x i64>, ptr addrspace(1) %in.gep
  %result = uitofp <4 x i64> %value to <4 x float>
  store <4 x float> %result, ptr addrspace(1) %out.gep
  ret void
}

; uitofp <2 x i64> -> <2 x half>; the vector operand is passed directly as a
; kernel argument.
define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0 {
; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_flbit_i32_b32 s4, s11
; GFX6-NEXT:    s_flbit_i32_b32 s5, s9
; GFX6-NEXT:    s_min_u32 s6, s4, 32
; GFX6-NEXT:    s_min_u32 s12, s5, 32
; GFX6-NEXT:    s_lshl_b64 s[4:5], s[10:11], s6
; GFX6-NEXT:    s_sub_i32 s10, 32, s6
; GFX6-NEXT:    s_lshl_b64 s[6:7], s[8:9], s12
; GFX6-NEXT:    s_sub_i32 s8, 32, s12
; GFX6-NEXT:    s_min_u32 s4, s4, 1
; GFX6-NEXT:    s_min_u32 s6, s6, 1
; GFX6-NEXT:    s_or_b32 s4, s5, s4
; GFX6-NEXT:    s_or_b32 s5, s7, s6
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s5
; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s10
; GFX6-NEXT:    v_ldexp_f32_e64 v1, v1, s8
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_flbit_i32_b32 s6, s3
; GFX8-NEXT:    s_flbit_i32_b32 s7, s1
; GFX8-NEXT:    s_min_u32 s6, s6, 32
; GFX8-NEXT:    s_min_u32 s7, s7, 32
; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
; GFX8-NEXT:    s_min_u32 s2, s2, 1
; GFX8-NEXT:    s_min_u32 s0, s0, 1
; GFX8-NEXT:    s_or_b32 s2, s3, s2
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s0
; GFX8-NEXT:    s_sub_i32 s6, 32, s6
; GFX8-NEXT:    s_sub_i32 s0, 32, s7
; GFX8-NEXT:    v_ldexp_f32 v0, v0, s6
; GFX8-NEXT:    v_ldexp_f32 v1, v1, s0
; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clz_i32_u32 s6, s3
; GFX11-NEXT:    s_clz_i32_u32 s7, s1
; GFX11-NEXT:    s_min_u32 s6, s6, 32
; GFX11-NEXT:    s_min_u32 s7, s7, 32
; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
; GFX11-NEXT:    s_min_u32 s2, s2, 1
; GFX11-NEXT:    s_min_u32 s0, s0, 1
; GFX11-NEXT:    s_or_b32 s2, s3, s2
; GFX11-NEXT:    s_or_b32 s0, s1, s0
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, s0
; GFX11-NEXT:    s_sub_i32 s0, 32, s6
; GFX11-NEXT:    s_sub_i32 s1, 32, s7
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_ldexp_f32 v0, v0, s0
; GFX11-NEXT:    v_ldexp_f32 v1, v1, s1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT:    s_endpgm
  %result = uitofp <2 x i64> %in to <2 x half>
  store <2 x half> %result, ptr addrspace(1) %out
  ret void
}

; uitofp <4 x i64> -> <4 x half>; the vector operand is loaded from %in at
; index workitem.id.x and the result is stored to %out at the same index.
define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX6-NEXT:    v_mov_b32_e32 v9, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
; GFX6-NEXT:    buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
; GFX6-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
; GFX6-NEXT:    v_mov_b32_e32 v11, v9
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(1)
; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
; GFX6-NEXT:    v_ffbh_u32_e32 v9, v2
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_ffbh_u32_e32 v12, v8
; GFX6-NEXT:    v_ffbh_u32_e32 v13, v6
; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX6-NEXT:    v_min_u32_e32 v9, 32, v9
; GFX6-NEXT:    v_min_u32_e32 v12, 32, v12
; GFX6-NEXT:    v_min_u32_e32 v13, 32, v13
; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
; GFX6-NEXT:    v_min_u32_e32 v5, 1, v5
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    v_or_b32_e32 v1, v8, v7
; GFX6-NEXT:    v_or_b32_e32 v4, v6, v5
; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v4
; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
; GFX6-NEXT:    v_ldexp_f32_e32 v2, v4, v12
; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v1, v0, v3
; GFX6-NEXT:    v_or_b32_e32 v0, v2, v4
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], v[10:11], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s2, v1
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[5:8], v[5:6]
; GFX8-NEXT:    v_mov_b32_e32 v10, s1
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_ffbh_u32_e32 v0, v4
; GFX8-NEXT:    v_ffbh_u32_e32 v11, v2
; GFX8-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX8-NEXT:    v_min_u32_e32 v11, 32, v11
; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v0, v[3:4]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_ffbh_u32_e32 v12, v8
; GFX8-NEXT:    v_ffbh_u32_e32 v13, v6
; GFX8-NEXT:    v_min_u32_e32 v12, 32, v12
; GFX8-NEXT:    v_min_u32_e32 v13, 32, v13
; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v11, v[1:2]
; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v12, v[7:8]
; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v13, v[5:6]
; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    v_or_b32_e32 v1, v8, v7
; GFX8-NEXT:    v_or_b32_e32 v4, v6, v5
; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v11
; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v12
; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v13
; GFX8-NEXT:    v_ldexp_f32 v3, v3, v14
; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
; GFX8-NEXT:    v_ldexp_f32 v1, v1, v11
; GFX8-NEXT:    v_ldexp_f32 v2, v4, v12
; GFX8-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT:    v_cvt_f16_f32_e32 v4, v0
; GFX8-NEXT:    v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT:    v_cvt_f16_f32_e32 v6, v2
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v9
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v10, vcc
; GFX8-NEXT:    v_or_b32_e32 v2, v4, v3
; GFX8-NEXT:    v_or_b32_e32 v3, v6, v5
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v3
; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v7
; GFX11-NEXT:    v_clz_i32_u32_e32 v12, v5
; GFX11-NEXT:    v_min_u32_e32 v9, 32, v9
; GFX11-NEXT:    v_min_u32_e32 v10, 32, v10
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
; GFX11-NEXT:    v_min_u32_e32 v12, 32, v12
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v8
; GFX11-NEXT:    v_ldexp_f32 v2, v2, v9
; GFX11-NEXT:    v_ldexp_f32 v0, v0, v10
; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
; GFX11-NEXT:    v_ldexp_f32 v3, v3, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v1
; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_pack_b32_f16 v1, v0, v2
; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v4
; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
  %out.gep = getelementptr <4 x half>, ptr addrspace(1) %out, i32 %tid
  %value = load <4 x i64>, ptr addrspace(1) %in.gep
  %result = uitofp <4 x i64> %value to <4 x half>
  store <4 x half> %result, ptr addrspace(1) %out.gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }