; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s

; Code generation checks for fmul on half, <2 x half> and <4 x half> values,
; compiled for tahiti, fiji, gfx900 and gfx1100. The assertion lines below are
; generated; regenerate them with the script named on the first line instead of
; editing them by hand.

; Scalar half multiply of two volatile-loaded operands. The tahiti checks
; expect both inputs converted to f32, a v_mul_f32 and a conversion back; the
; other targets expect a single v_mul_f16.
define amdgpu_kernel void @fmul_f16(
; SI-LABEL: fmul_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s14, s6
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s15, s7
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fmul_f16:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s14, s6
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s12, s2
; GFX89-NEXT:    s_mov_b32 s13, s3
; GFX89-NEXT:    s_mov_b32 s15, s7
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-LABEL: fmul_f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s6, s10
; GFX11-NEXT:    s_mov_b32 s7, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s2
; GFX11-NEXT:    s_mov_b32 s13, s3
; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s0
; GFX11-NEXT:    s_mov_b32 s9, s1
; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load volatile half, ptr addrspace(1) %a
  %b.val = load volatile half, ptr addrspace(1) %b
  %r.val = fmul half %a.val, %b.val
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Scalar half multiply with the constant 3.0 as the left operand. The checks
; expect the constant as a literal in src0: 0x40400000 (f32 3.0) on tahiti and
; 0x4200 (f16 3.0) on the other targets.
define amdgpu_kernel void @fmul_f16_imm_a(
; SI-LABEL: fmul_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fmul_f16_imm_a:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-LABEL: fmul_f16_imm_a:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %b) {
entry:
  %b.val = load volatile half, ptr addrspace(1) %b
  %r.val = fmul half 3.0, %b.val
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Scalar half multiply with the constant 4.0 as the right operand. The checks
; expect the constant to be moved into src0 and printed as the operand 4.0 on
; every target.
define amdgpu_kernel void @fmul_f16_imm_b(
; SI-LABEL: fmul_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fmul_f16_imm_b:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    v_mul_f16_e32 v0, 4.0, v0
; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-LABEL: fmul_f16_imm_b:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    v_mul_f16_e32 v0, 4.0, v0
; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load volatile half, ptr addrspace(1) %a
  %r.val = fmul half %a.val, 4.0
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> multiply of two loaded vectors. Expected shapes: tahiti multiplies
; each lane in f32 and repacks with shift/or; fiji pairs a v_mul_f16_sdwa (high
; half) with a v_mul_f16 (low half) and ORs them; gfx900 and gfx1100 use one
; v_pk_mul_f16.
define amdgpu_kernel void @fmul_v2f16(
; SI-LABEL: fmul_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s14, s6
; SI-NEXT:    s_mov_b32 s15, s7
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v2, v3, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_mul_f32_e32 v0, v1, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fmul_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s14, s6
; VI-NEXT:    s_mov_b32 s15, s7
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_mul_f16_e32 v0, v1, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fmul_v2f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s14, s6
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s12, s2
; GFX9-NEXT:    s_mov_b32 s13, s3
; GFX9-NEXT:    s_mov_b32 s15, s7
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: fmul_v2f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s6, s10
; GFX11-NEXT:    s_mov_b32 s7, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s2
; GFX11-NEXT:    s_mov_b32 s13, s3
; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT:    buffer_load_b32 v1, off, s[4:7], 0
; GFX11-NEXT:    s_mov_b32 s8, s0
; GFX11-NEXT:    s_mov_b32 s9, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fmul <2 x half> %a.val, %b.val
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> multiply with the constant vector <3.0, 4.0> as the left operand.
; The packed targets expect the constant as 0x44004200 (4.0 in the high half,
; 3.0 in the low half): gfx900 moves it into an SGPR first, gfx1100 uses it as
; a literal operand.
define amdgpu_kernel void @fmul_v2f16_imm_a(
; SI-LABEL: fmul_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 4.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, 0x40400000, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fmul_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    v_mov_b32_e32 v1, 0x4400
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fmul_v2f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: fmul_v2f16_imm_a:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_mul_f16 v0, 0x44004200, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %b) {
entry:
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fmul <2 x half> <half 3.0, half 4.0>, %b.val
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> multiply with the constant vector <4.0, 3.0> as the right operand.
; Same expected shapes as the variant with the constant on the left, with the
; lanes swapped: the packed constant is 0x42004400.
define amdgpu_kernel void @fmul_v2f16_imm_b(
; SI-LABEL: fmul_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 0x40400000, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fmul_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    v_mov_b32_e32 v1, 0x4200
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_mul_f16_e32 v0, 4.0, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fmul_v2f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: fmul_v2f16_imm_b:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_mul_f16 v0, 0x42004400, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = fmul <2 x half> %a.val, <half 4.0, half 3.0>
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; <4 x half> multiply of two loaded vectors. Expected shapes: four f32
; multiplies on tahiti, two sdwa/e32 multiply pairs on fiji, and two
; v_pk_mul_f16 on gfx900 and gfx1100.
define amdgpu_kernel void @fmul_v4f16(
; SI-LABEL: fmul_v4f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s10
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_mov_b32 s13, s11
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s0, s8
; SI-NEXT:    s_mov_b32 s1, s9
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v2
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v3
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_mul_f32_e32 v5, v7, v5
; SI-NEXT:    v_mul_f32_e32 v4, v6, v4
; SI-NEXT:    v_mul_f32_e32 v1, v3, v1
; SI-NEXT:    v_mul_f32_e32 v0, v2, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v5
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v4
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    v_or_b32_e32 v0, v3, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fmul_v4f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s14, s6
; VI-NEXT:    s_mov_b32 s15, s7
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_mul_f16_e32 v1, v3, v1
; VI-NEXT:    v_mul_f16_sdwa v3, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_mul_f16_e32 v0, v2, v0
; VI-NEXT:    v_or_b32_e32 v1, v1, v4
; VI-NEXT:    v_or_b32_e32 v0, v0, v3
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fmul_v4f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s12, s2
; GFX9-NEXT:    s_mov_b32 s13, s3
; GFX9-NEXT:    s_mov_b32 s14, s6
; GFX9-NEXT:    s_mov_b32 s15, s7
; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_mul_f16 v1, v3, v1
; GFX9-NEXT:    v_pk_mul_f16 v0, v2, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: fmul_v4f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, s10
; GFX11-NEXT:    s_mov_b32 s7, s11
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s2
; GFX11-NEXT:    s_mov_b32 s13, s3
; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT:    buffer_load_b64 v[2:3], off, s[12:15], 0
; GFX11-NEXT:    s_mov_b32 s8, s0
; GFX11-NEXT:    s_mov_b32 s9, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_mul_f16 v1, v3, v1
; GFX11-NEXT:    v_pk_mul_f16 v0, v2, v0
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load <4 x half>, ptr addrspace(1) %a
  %b.val = load <4 x half>, ptr addrspace(1) %b
  %r.val = fmul <4 x half> %a.val, %b.val
  store <4 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; <4 x half> multiply with the constant vector <8.0, 2.0, 3.0, 4.0> as the left
; operand. For the 2.0 lane the tahiti and fiji checks expect an add of the
; value to itself (v_add_f32 / v_add_f16_sdwa) instead of a multiply. The
; packed targets expect the constants 0x40004800 (lanes 1:0) and 0x44004200
; (lanes 3:2).
define amdgpu_kernel void @fmul_v4f16_imm_a(
; SI-LABEL: fmul_v4f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v3, 0x40400000, v3
; SI-NEXT:    v_mul_f32_e32 v2, 0x41000000, v2
; SI-NEXT:    v_mul_f32_e32 v1, 4.0, v1
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v1, v3, v1
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fmul_v4f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_mul_f16_e32 v1, 0x4200, v1
; VI-NEXT:    v_add_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_mul_f16_e32 v0, 0x4800, v0
; VI-NEXT:    v_or_b32_e32 v1, v1, v2
; VI-NEXT:    v_or_b32_e32 v0, v0, v3
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fmul_v4f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s2, 0x44004200
; GFX9-NEXT:    s_mov_b32 s3, 0x40004800
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_mul_f16 v1, v1, s2
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s3
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: fmul_v4f16_imm_a:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_mul_f16 v1, 0x44004200, v1
; GFX11-NEXT:    v_pk_mul_f16 v0, 0x40004800, v0
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %b) {
entry:
  %b.val = load <4 x half>, ptr addrspace(1) %b
  %r.val = fmul <4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, %b.val
  store <4 x half> %r.val, ptr addrspace(1) %r
  ret void
}