; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s

; Instruction-selection checks for fpext with a half (or <2 x half>) source.
; The kernels below are compiled for tahiti, fiji, gfx900 and gfx1100 (with and
; without real-true16); fiji and gfx900 share one set of expectations.
; The assertion comments are machine generated: edit the IR and re-run the
; update script instead of editing them by hand.

; Scalar half -> float. A half is loaded from %a, extended, and stored to %r.
; Every configuration is expected to select a single v_cvt_f32_f16; with
; real-true16 the source operand is the low 16-bit half of the VGPR (v0.l).
define amdgpu_kernel void @fpext_f16_to_f32(
; SI-LABEL: fpext_f16_to_f32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fpext_f16_to_f32:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fpext_f16_to_f32:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fpext_f16_to_f32:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) #0 {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = fpext half %a.val to float
  store float %r.val, ptr addrspace(1) %r
  ret void
}

; Scalar half -> double. The expected code on every configuration converts in
; two steps: v_cvt_f32_f16 followed by v_cvt_f64_f32, then a 64-bit store.
define amdgpu_kernel void @fpext_f16_to_f64(
; SI-LABEL: fpext_f16_to_f64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fpext_f16_to_f64:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fpext_f16_to_f64:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fpext_f16_to_f64:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) #0 {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = fpext half %a.val to double
  store double %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> -> <2 x float>. Both halves arrive in one 32-bit load. The tahiti
; and gfx1100 expectations shift the upper half down with v_lshrrev_b32 before
; converting it; the fiji/gfx900 expectations instead read it in place through
; an SDWA convert (src0_sel WORD_1).
define amdgpu_kernel void @fpext_v2f16_to_v2f32(
; SI-LABEL: fpext_v2f16_to_v2f32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fpext_v2f16_to_v2f32:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v1
; GFX89-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) #0 {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = fpext <2 x half> %a.val to <2 x float>
  store <2 x float> %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> -> <2 x double>. Each element is expected to be widened to f32
; first and then to f64, and the result is written with one 128-bit store.
; Unlike the three kernels above, this definition does not reference attribute
; group #0.
define amdgpu_kernel void @fpext_v2f16_to_v2f64(
; SI-LABEL: fpext_v2f16_to_v2f64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fpext_v2f16_to_v2f64:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v0
; GFX89-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
; GFX89-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f64:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v1.l
; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f64:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v1
; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = fpext <2 x half> %a.val to <2 x double>
  store <2 x double> %r.val, ptr addrspace(1) %r
  ret void
}

; Half -> float where the half comes from a kernel argument rather than a load:
; the i32 argument %a is truncated to i16 and bitcast to half. The expected
; convert takes the SGPR (s2) directly as its source, except with real-true16,
; where the value is first copied into v0.l with v_mov_b16.
; Review note - the function name mentions fneg, but neither the IR body nor
; the expected output contains a negation; confirm whether one was intended.
define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) {
; SI-LABEL: s_fneg_fpext_f16_to_f32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: s_fneg_fpext_f16_to_f32:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s3, 0xf000
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, s2
; GFX89-NEXT:    s_mov_b32 s2, -1
; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: s_fneg_fpext_f16_to_f32:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_clause 0x1
; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: s_fneg_fpext_f16_to_f32:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_clause 0x1
; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT:    s_endpgm
entry:
  %a.trunc = trunc i32 %a to i16
  %a.val = bitcast i16 %a.trunc to half
  %r.val = fpext half %a.val to float
  store float %r.val, ptr addrspace(1) %r
  ret void
}

; fpext of a negated half. The negation is written as `fsub half -0.0, x` and
; has the extension as its only user; every configuration is expected to fold
; it into the convert as a neg source modifier (v_cvt_f32_f16_e64 with -v0).
define amdgpu_kernel void @fneg_fpext_f16_to_f32(
; SI-LABEL: fneg_fpext_f16_to_f32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fneg_fpext_f16_to_f32:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, -v0
; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fneg_fpext_f16_to_f32:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fneg_fpext_f16_to_f32:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0
; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %a.neg = fsub half -0.0, %a.val
  %r.val = fpext half %a.neg to float
  store float %r.val, ptr addrspace(1) %r
  ret void
}

; fpext of llvm.fabs.f16 applied to a loaded half, with the extension as the
; only user of the fabs. Every configuration is expected to fold the fabs into
; the convert as an abs source modifier (v_cvt_f32_f16_e64 with |v0|).
define amdgpu_kernel void @fabs_fpext_f16_to_f32(
; SI-LABEL: fabs_fpext_f16_to_f32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fabs_fpext_f16_to_f32:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: fabs_fpext_f16_to_f32:
; GFX11-TRUE16:       ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0.l|
; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: fabs_fpext_f16_to_f32:
; GFX11-FAKE16:       ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %r.val = fpext half %a.fabs to float
  store float %r.val, ptr addrspace(1) %r
  ret void
}

586define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( 587; SI-LABEL: fneg_fabs_fpext_f16_to_f32: 588; SI: ; %bb.0: ; %entry 589; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 590; SI-NEXT: s_mov_b32 s7, 0xf000 591; SI-NEXT: s_mov_b32 s6, -1 592; SI-NEXT: s_mov_b32 s10, s6 593; SI-NEXT: s_mov_b32 s11, s7 594; SI-NEXT: s_waitcnt lgkmcnt(0) 595; SI-NEXT: s_mov_b32 s8, s2 596; SI-NEXT: s_mov_b32 s9, s3 597; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 598; SI-NEXT: s_mov_b32 s4, s0 599; SI-NEXT: s_mov_b32 s5, s1 600; SI-NEXT: s_waitcnt vmcnt(0) 601; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| 602; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 603; SI-NEXT: s_endpgm 604; 605; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32: 606; GFX89: ; %bb.0: ; %entry 607; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 608; GFX89-NEXT: s_mov_b32 s7, 0xf000 609; GFX89-NEXT: s_mov_b32 s6, -1 610; GFX89-NEXT: s_mov_b32 s10, s6 611; GFX89-NEXT: s_mov_b32 s11, s7 612; GFX89-NEXT: s_waitcnt lgkmcnt(0) 613; GFX89-NEXT: s_mov_b32 s8, s2 614; GFX89-NEXT: s_mov_b32 s9, s3 615; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 616; GFX89-NEXT: s_mov_b32 s4, s0 617; GFX89-NEXT: s_mov_b32 s5, s1 618; GFX89-NEXT: s_waitcnt vmcnt(0) 619; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -|v0| 620; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 621; GFX89-NEXT: s_endpgm 622; 623; GFX11-TRUE16-LABEL: fneg_fabs_fpext_f16_to_f32: 624; GFX11-TRUE16: ; %bb.0: ; %entry 625; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 626; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 627; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 628; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 629; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 630; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) 631; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 632; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 633; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 634; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 635; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 636; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 637; GFX11-TRUE16-NEXT: 
v_cvt_f32_f16_e64 v0, -|v0.l| 638; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 639; GFX11-TRUE16-NEXT: s_endpgm 640; 641; GFX11-FAKE16-LABEL: fneg_fabs_fpext_f16_to_f32: 642; GFX11-FAKE16: ; %bb.0: ; %entry 643; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 644; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 645; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 646; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 647; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 648; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) 649; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 650; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 651; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 652; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 653; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 654; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 655; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0| 656; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 657; GFX11-FAKE16-NEXT: s_endpgm 658 ptr addrspace(1) %r, 659 ptr addrspace(1) %a) { 660entry: 661 %a.val = load half, ptr addrspace(1) %a 662 %a.fabs = call half @llvm.fabs.f16(half %a.val) 663 %a.fneg.fabs = fsub half -0.0, %a.fabs 664 %r.val = fpext half %a.fneg.fabs to float 665 store float %r.val, ptr addrspace(1) %r 666 ret void 667} 668 669; FIXME: Using the source modifier here only wastes code size 670 671define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( 672; SI-LABEL: fneg_multi_use_fpext_f16_to_f32: 673; SI: ; %bb.0: ; %entry 674; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 675; SI-NEXT: s_mov_b32 s7, 0xf000 676; SI-NEXT: s_mov_b32 s6, -1 677; SI-NEXT: s_mov_b32 s10, s6 678; SI-NEXT: s_mov_b32 s11, s7 679; SI-NEXT: s_waitcnt lgkmcnt(0) 680; SI-NEXT: s_mov_b32 s8, s2 681; SI-NEXT: s_mov_b32 s9, s3 682; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 683; SI-NEXT: s_mov_b32 s4, s0 684; SI-NEXT: s_mov_b32 s5, s1 685; SI-NEXT: s_waitcnt vmcnt(0) 686; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 687; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 688; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 689; SI-NEXT: 
s_waitcnt vmcnt(0) 690; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 691; SI-NEXT: s_waitcnt vmcnt(0) 692; SI-NEXT: s_endpgm 693; 694; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32: 695; GFX89: ; %bb.0: ; %entry 696; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 697; GFX89-NEXT: s_mov_b32 s7, 0xf000 698; GFX89-NEXT: s_mov_b32 s6, -1 699; GFX89-NEXT: s_mov_b32 s10, s6 700; GFX89-NEXT: s_mov_b32 s11, s7 701; GFX89-NEXT: s_waitcnt lgkmcnt(0) 702; GFX89-NEXT: s_mov_b32 s8, s2 703; GFX89-NEXT: s_mov_b32 s9, s3 704; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 705; GFX89-NEXT: s_mov_b32 s4, s0 706; GFX89-NEXT: s_mov_b32 s5, s1 707; GFX89-NEXT: s_waitcnt vmcnt(0) 708; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0 709; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0 710; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 711; GFX89-NEXT: s_waitcnt vmcnt(0) 712; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 713; GFX89-NEXT: s_waitcnt vmcnt(0) 714; GFX89-NEXT: s_endpgm 715; 716; GFX11-TRUE16-LABEL: fneg_multi_use_fpext_f16_to_f32: 717; GFX11-TRUE16: ; %bb.0: ; %entry 718; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 719; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 720; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 721; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 722; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 723; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) 724; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 725; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 726; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 727; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 728; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 729; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 730; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l 731; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 732; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 733; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l 734; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc 735; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 736; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 
dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fneg_multi_use_fpext_f16_to_f32:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %a.neg = fsub half -0.0, %a.val
  %r.val = fpext half %a.neg to float
  store volatile float %r.val, ptr addrspace(1) %r
  store volatile half %a.neg, ptr addrspace(1) undef
  ret void
}

; The negated half (written as fsub -0.0, x) has two users: the fpext and an
; fmul. In the GFX89 and GFX11 checks the negation appears as a -v0 source
; modifier on both v_cvt_f32_f16 and v_mul_f16. In the SI checks both operands
; are converted to f32 (one with the -v0 modifier), multiplied with v_mul_f32
; and the product is converted back with v_cvt_f16_f32.
; The CHECK lines below are autogenerated (see the NOTE lines in this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing.
define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
; SI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_mul_f32_e32 v1, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_mov_b32 s10, s6
; GFX89-NEXT: s_mov_b32 s11, s7
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX89-NEXT: v_mul_f16_e64 v0, -v0, v0
; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -v0.l, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %a.neg = fsub half -0.0, %a.val
  %r.val = fpext half %a.neg to float
  %mul = fmul half %a.neg, %a.val
  store volatile float %r.val, ptr addrspace(1) %r
  ; NOTE(review): the address is undef; presumably this store exists only to
  ; give %mul a use - confirm before changing it.
  store volatile half %mul, ptr addrspace(1) undef
  ret void
}

; llvm.fabs.f16 result has two users: the fpext and a direct store of the half.
; In the GFX89 and GFX11 checks the fabs appears as a |v0| source modifier on
; v_cvt_f32_f16, while the stored half is produced separately by
; v_and_b32 with 0x7fff. In the SI checks the v_and_b32 is applied first and
; the masked value is what gets converted.
define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
; SI-LABEL: fabs_multi_use_fpext_f16_to_f32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_mov_b32 s10, s6
; GFX89-NEXT: s_mov_b32 s11, s7
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX89-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fabs_multi_use_fpext_f16_to_f32:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l|
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fabs_multi_use_fpext_f16_to_f32:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %r.val = fpext half %a.fabs to float
  store volatile float %r.val, ptr addrspace(1) %r
  ; NOTE(review): the address is undef; presumably this store exists only to
  ; give %a.fabs a second use - confirm before changing it.
  store volatile half %a.fabs, ptr addrspace(1) undef
  ret void
}

; llvm.fabs.f16 result has two users: the fpext and an fmul. In the GFX89 and
; GFX11 checks the fabs appears as a |v0| source modifier on both
; v_cvt_f32_f16 and v_mul_f16. In the SI checks the value is converted to f32
; once, the multiply is a v_mul_f32 with a |v0| modifier, and the stored float
; is produced by v_and_b32 with 0x7fffffff.
define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
; SI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_mul_f32_e64 v1, |v0|, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_mov_b32 s10, s6
; GFX89-NEXT: s_mov_b32 s11, s7
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX89-NEXT: v_mul_f16_e64 v0, |v0|, v0
; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, |v0.l|, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, |v0.l|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, |v0|, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %r.val = fpext half %a.fabs to float
  %mul = fmul half %a.fabs, %a.val
  store volatile float %r.val, ptr addrspace(1) %r
  ; NOTE(review): the address is undef; presumably this store exists only to
  ; give %mul a use - confirm before changing it.
  store volatile half %mul, ptr addrspace(1) undef
  ret void
}

; -|x| (fabs followed by fsub -0.0, x) has two users: the fpext and a direct
; store of the half. In the GFX89 and GFX11 checks it appears as a -|v0|
; source modifier on v_cvt_f32_f16, while the stored half is produced
; separately by v_or_b32 with 0x8000. In the SI checks the v_or_b32 is applied
; first and the result is what gets converted.
define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
; SI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_mov_b32 s10, s6
; GFX89-NEXT: s_mov_b32 s11, s7
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX89-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x8000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l|
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %a.fneg.fabs = fsub half -0.0, %a.fabs
  %r.val = fpext half %a.fneg.fabs to float
  store volatile float %r.val, ptr addrspace(1) %r
  ; NOTE(review): the address is undef; presumably this store exists only to
  ; give %a.fneg.fabs a second use - confirm before changing it.
  store volatile half %a.fneg.fabs, ptr addrspace(1) undef
  ret void
}

; -|x| (fabs followed by fsub -0.0, x) has two users: the fpext and an fmul.
; In the GFX89 and GFX11 checks it appears as a -|v0| source modifier on both
; v_cvt_f32_f16 and v_mul_f16. In the SI checks the value is converted to f32
; once, the multiply is a v_mul_f32 with a -|v0| modifier, and the stored
; float is produced by v_or_b32 with 0x80000000.
define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
; SI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_mul_f32_e64 v1, -|v0|, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_mov_b32 s10, s6
; GFX89-NEXT: s_mov_b32 s11, s7
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX89-NEXT: v_mul_f16_e64 v0, -|v0|, v0
; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -|v0.l|, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0.l|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, -|v0|, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %a.fabs = call half @llvm.fabs.f16(half %a.val)
  %a.fneg.fabs = fsub half -0.0, %a.fabs
  %r.val = fpext half %a.fneg.fabs to float
  %mul = fmul half %a.fneg.fabs, %a.val
  store volatile float %r.val, ptr addrspace(1) %r
  ; NOTE(review): the address is undef; presumably this store exists only to
  ; give %mul a use - confirm before changing it.
  store volatile half %mul, ptr addrspace(1) undef
  ret void
}

declare half @llvm.fabs.f16(half) #1

attributes #1 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX9: {{.*}}
; VI: {{.*}}