; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s

; Checks AMDGPU code generation for `uitofp` conversions that produce half
; (f16) results. The four llc invocations above compile this file for tahiti,
; fiji, and gfx1100 with real-true16 enabled and with it disabled; their output
; is matched against the SI, VI, GFX11-TRUE16 and GFX11-FAKE16 check prefixes
; respectively. The assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Scalar i16 -> half. The expected SI output converts through f32
; (v_cvt_f32_u32 then v_cvt_f16_f32); the expected VI and GFX11 outputs use a
; single v_cvt_f16_u16.
define amdgpu_kernel void @uitofp_i16_to_f16(
; SI-LABEL: uitofp_i16_to_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: uitofp_i16_to_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: uitofp_i16_to_f16:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: uitofp_i16_to_f16:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v0, v0
; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load i16, ptr addrspace(1) %a
  %r.val = uitofp i16 %a.val to half
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Scalar i32 -> half. Every configuration is expected to convert through f32
; (v_cvt_f32_u32 followed by v_cvt_f16_f32).
define amdgpu_kernel void @uitofp_i32_to_f16(
; SI-LABEL: uitofp_i32_to_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: uitofp_i32_to_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: uitofp_i32_to_f16:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: uitofp_i32_to_f16:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load i32, ptr addrspace(1) %a
  %r.val = uitofp i32 %a.val to half
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; f16 = uitofp i64 is in uint_to_fp.i64.ll

; <2 x i16> -> <2 x half>. The expected SI output splits the loaded dword with
; a mask and a shift, converts each element through f32, and recombines the
; results with a shift and an or. The expected VI output uses v_cvt_f16_u16 for
; both elements (an SDWA form for the high word) followed by an or; the GFX11
; outputs use two v_cvt_f16_u16 and a v_pack_b32_f16.
define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
; SI-LABEL: uitofp_v2i16_to_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: uitofp_v2i16_to_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f16_u16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: uitofp_v2i16_to_v2f16:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: uitofp_v2i16_to_v2f16:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v1, v1
; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x i16>, ptr addrspace(1) %a
  %r.val = uitofp <2 x i16> %a.val to <2 x half>
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x i32> -> <2 x half>. Each element is expected to be converted through f32
; on every configuration. The two halves are then combined with a shift and an
; or on SI, an SDWA convert plus an or on VI, and v_pack_b32_f16 on GFX11.
define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
; SI-LABEL: uitofp_v2i32_to_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: uitofp_v2i32_to_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: uitofp_v2i32_to_v2f16:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v2, v0
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: uitofp_v2i32_to_v2f16:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x i32>, ptr addrspace(1) %a
  %r.val = uitofp <2 x i32> %a.val to <2 x half>
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; i1 -> half, where the i1 is the xor of two `fcmp oge` results computed on
; floats loaded from %in0 and %in1. The expected output selects 0 or 1.0 with
; v_cndmask_b32 and then converts that f32 value with v_cvt_f16_f32.
define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; SI-LABEL: s_uint_to_fp_i1_to_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s10
; SI-NEXT:    s_mov_b32 s13, s11
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cmp_le_f32_e32 vcc, 1.0, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 s0, s8
; SI-NEXT:    s_mov_b32 s1, s9
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_i1_to_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s10
; VI-NEXT:    s_mov_b32 s13, s11
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_le_f32_e32 vcc, 1.0, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_mov_b32 s0, s8
; VI-NEXT:    s_mov_b32 s1, s9
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: s_uint_to_fp_i1_to_f16:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_clause 0x1
; GFX11-TRUE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s10
; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s11
; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s8
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s9
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT:    s_xor_b32 s0, s0, vcc_lo
; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s0
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: s_uint_to_fp_i1_to_f16:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_clause 0x1
; GFX11-FAKE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s2, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s3, s7
; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s10
; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s11
; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s8
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s9
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT:    s_xor_b32 s0, s0, vcc_lo
; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
  %a = load float, ptr addrspace(1) %in0
  %b = load float, ptr addrspace(1) %in1
  %acmp = fcmp oge float %a, 0.000000e+00
  %bcmp = fcmp oge float %b, 1.000000e+00
  %result = xor i1 %acmp, %bcmp
  %fp = uitofp i1 %result to half
  store half %fp, ptr addrspace(1) %out
  ret void
}

; f16 = uitofp i64 is in uint_to_fp.i64.ll