; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s

; Codegen tests for fptosi with a half (f16) source. Scalar half and <2 x half>
; inputs are converted to i16, i32 and i64, and a scalar half is converted to
; i1. The same IR is compiled four ways by the llc invocations above, using the
; default amdgcn subtarget (SI prefix), fiji (VI prefix), and gfx1100 with
; real-true16 enabled and disabled (GFX11-TRUE16 and GFX11-FAKE16 prefixes).
;
; The check lines are autogenerated. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; half -> i16. The SI checks go through f32 (v_cvt_f32_f16 + v_cvt_i32_f32);
; the other three configurations expect a single v_cvt_i16_f16.
define amdgpu_kernel void @fptosi_f16_to_i16(
; SI-LABEL: fptosi_f16_to_i16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fptosi_f16_to_i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_i16_f16_e32 v0, v0
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fptosi_f16_to_i16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fptosi_f16_to_i16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = fptosi half %a.val to i16
  store i16 %r.val, ptr addrspace(1) %r
  ret void
}

; half -> i32. Every configuration is expected to extend to f32 first and then
; convert with v_cvt_i32_f32.
define amdgpu_kernel void @fptosi_f16_to_i32(
; SI-LABEL: fptosi_f16_to_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fptosi_f16_to_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fptosi_f16_to_i32:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fptosi_f16_to_i32:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = fptosi half %a.val to i32
  store i32 %r.val, ptr addrspace(1) %r
  ret void
}

; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
; test checks code generated for 'i64 = fp_to_sint f32'.

define amdgpu_kernel void @fptosi_f16_to_i64(
; SI-LABEL: fptosi_f16_to_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fptosi_f16_to_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fptosi_f16_to_i64:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fptosi_f16_to_i64:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = fptosi half %a.val to i64
  store i64 %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> -> <2 x i16>. The two 16-bit results are expected to be packed
; into a single dword before the store.
define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
; SI-LABEL: fptosi_v2f16_to_v2i16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_cvt_i32_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fptosi_v2f16_to_v2i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_i16_f16_e32 v1, v0
; VI-NEXT: v_cvt_i16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = fptosi <2 x half> %a.val to <2 x i16>
  store <2 x i16> %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> -> <2 x i32>. Each element is expected to be extended to f32 and
; converted with its own v_cvt_i32_f32.
define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
; SI-LABEL: fptosi_v2f16_to_v2i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: v_cvt_i32_f32_e32 v1, v1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fptosi_v2f16_to_v2i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_i32_f32_e32 v0, v1
; VI-NEXT: v_cvt_i32_f32_e32 v1, v2
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i32:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i32:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = fptosi <2 x half> %a.val to <2 x i32>
  store <2 x i32> %r.val, ptr addrspace(1) %r
  ret void
}

; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
; test checks code generated for 'i64 = fp_to_sint f32'.

define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
; SI-LABEL: fptosi_v2f16_to_v2i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: v_cvt_i32_f32_e32 v2, v2
; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fptosi_v2f16_to_v2i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_i32_f32_e32 v0, v1
; VI-NEXT: v_cvt_i32_f32_e32 v2, v2
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i64:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v1
; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i64:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v1
; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = fptosi <2 x half> %a.val to <2 x i64>
  store <2 x i64> %r.val, ptr addrspace(1) %r
  ret void
}

; half -> i1. The checks expect an equality compare against -1.0 followed by a
; 0/1 select, with no float-to-integer conversion instruction.
define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
; SI-LABEL: fptosi_f16_to_i1:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
; SI-NEXT: v_cmp_eq_f32_e32 vcc, -1.0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fptosi_f16_to_i1:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s6, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], -1.0, s6
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fptosi_f16_to_i1:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fptosi_f16_to_i1:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_endpgm
entry:
  %conv = fptosi half %in to i1
  store i1 %conv, ptr addrspace(1) %out
  ret void
}