; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s

; Code generation tests for 'fptoui' with a half-precision source. Each kernel
; is compiled by the four llc invocations above (tahiti, fiji, and gfx1100 with
; and without real-true16), and the full instruction sequence is checked per
; configuration. The check lines are generated by the script named in the first
; line; regenerate them with that script instead of editing them by hand.

; Scalar half -> i16. The kernel loads a half from %a, converts it, and stores
; the i16 result to %r. In the expected output below, tahiti converts via f32
; (v_cvt_f32_f16 followed by v_cvt_u32_f32), while fiji and both gfx1100
; configurations use a single v_cvt_u16_f16. With real-true16 the conversion
; operates on the 16-bit register halves (v0.l).
define amdgpu_kernel void @fptoui_f16_to_i16(
; SI-LABEL: fptoui_f16_to_i16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fptoui_f16_to_i16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_u16_f16_e32 v0, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fptoui_f16_to_i16:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fptoui_f16_to_i16:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_u16_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = fptoui half %a.val to i16
  store i16 %r.val, ptr addrspace(1) %r
  ret void
}

; Scalar half -> i32. Same load/convert/store shape as the i16 test, with a
; 32-bit result. All four configurations are expected to extend the half to
; f32 (v_cvt_f32_f16) and then convert with v_cvt_u32_f32; the gfx1100 runs
; additionally expect an s_delay_alu between the two dependent conversions.
define amdgpu_kernel void @fptoui_f16_to_i32(
; SI-LABEL: fptoui_f16_to_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fptoui_f16_to_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fptoui_f16_to_i32:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fptoui_f16_to_i32:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = fptoui half %a.val to i32
  store i32 %r.val, ptr addrspace(1) %r
  ret void
}

; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
; test checks code generated for 'i64 = fp_to_uint f32'.
; Scalar half -> i64. The kernel loads a half from %a, converts it, and stores
; the i64 result to %r. Every configuration is expected to extend the half to
; f32 (v_cvt_f32_f16), convert with v_cvt_u32_f32 to form the low dword, and
; materialize the high dword as the constant 0 (v_mov_b32 v1, 0) before a
; single 64-bit store of v[0:1].
define amdgpu_kernel void @fptoui_f16_to_i64(
; SI-LABEL: fptoui_f16_to_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fptoui_f16_to_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fptoui_f16_to_i64:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fptoui_f16_to_i64:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = fptoui half %a.val to i64
  store i64 %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> -> <2 x i16>. The two halves arrive packed in one loaded dword.
; Expected output: tahiti shifts out the upper element, converts both elements
; via f32, and repacks with shift + or; fiji converts with v_cvt_u16_f16 (the
; upper element through the SDWA form) and merges with v_or_b32_sdwa; the
; gfx1100 runs convert each element with v_cvt_u16_f16 and repack with
; v_lshl_or_b32.
define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
; SI-LABEL: fptoui_v2f16_to_v2i16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_u32_f32_e32 v1, v1
; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fptoui_v2f16_to_v2i16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_u16_f16_e32 v1, v0
; VI-NEXT:    v_cvt_u16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i16:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i16:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT:    v_cvt_u16_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT:    v_cvt_u16_f16_e32 v1, v1
; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = fptoui <2 x half> %a.val to <2 x i16>
  store <2 x i16> %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> -> <2 x i32>. Each element is expected to be extended to f32 and
; then converted with v_cvt_u32_f32 on every configuration. fiji reads the
; upper element directly through an SDWA source select, while the other
; configurations first shift it down with v_lshrrev_b32. The two results are
; written with one 64-bit store.
define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
; SI-LABEL: fptoui_v2f16_to_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
; SI-NEXT:    v_cvt_u32_f32_e32 v1, v1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fptoui_v2f16_to_v2i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_u32_f32_e32 v0, v1
; VI-NEXT:    v_cvt_u32_f32_e32 v1, v2
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i32:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i32:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = fptoui <2 x half> %a.val to <2 x i32>
  store <2 x i32> %r.val, ptr addrspace(1) %r
  ret void
}

; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
; test checks code generated for 'i64 = fp_to_uint f32'.
;
; <2 x half> -> <2 x i64>. The expected output converts each element through
; f32 into a 32-bit value (placed in v0 and v2) and fills the high dword of
; each i64 with zero (v1 = 0, v3 = v1) before one 128-bit store of v[0:3].

define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
; SI-LABEL: fptoui_v2f16_to_v2i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
; SI-NEXT:    v_cvt_u32_f32_e32 v2, v1
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_mov_b32_e32 v3, v1
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fptoui_v2f16_to_v2i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT:    v_cvt_u32_f32_e32 v0, v1
; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_mov_b32_e32 v3, v1
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i64:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v1.l
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v2, v2
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i64:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v1
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v2, v2
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = fptoui <2 x half> %a.val to <2 x i64>
  store <2 x i64> %r.val, ptr addrspace(1) %r
  ret void
}

; Scalar half -> i1. Unlike the tests above, the half is passed directly as
; the kernel argument %in rather than loaded through a pointer. The expected
; output contains no conversion to integer: each configuration compares the
; input for equality with 1.0 (v_cmp_eq_f32 after extending to f32 on tahiti,
; v_cmp_eq_f16 on the others), selects 0 or 1 with v_cndmask_b32, and stores
; the result as a byte to %out.
define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
; SI-LABEL: fptoui_f16_to_i1:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 1.0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fptoui_f16_to_i1:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cmp_eq_f16_e64 s[4:5], 1.0, s6
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fptoui_f16_to_i1:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_clause 0x1
; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, 1.0, v0
; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fptoui_f16_to_i1:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_clause 0x1
; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    v_cmp_eq_f16_e64 s2, 1.0, s2
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
; GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT:    s_endpgm
entry:
  %conv = fptoui half %in to i1
  store i1 %conv, ptr addrspace(1) %out
  ret void
}