; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s

; Overview
; --------
; Checks the machine code llc emits for the IR `fsub` instruction on `half`
; and `<2 x half>` operands. The same six kernels are compiled for four CPUs
; (tahiti, fiji, gfx900, gfx1100); each run line above selects the check
; prefixes that apply to that CPU. The fiji and gfx900 runs share one common
; prefix for the kernels whose expected output is identical on both.
;
; The check lines inside each function are tool-generated (see the first line
; of this file); regenerate them with the script rather than editing by hand.

; Scalar case: r = a - b, both operands read with volatile loads.
; Expected output for tahiti widens both inputs to f32, subtracts with
; v_sub_f32 and narrows the result back to f16; the other three CPUs are
; expected to use a single v_sub_f16.
define amdgpu_kernel void @fsub_f16(
; SI-LABEL: fsub_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s14, s6
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s15, s7
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_sub_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fsub_f16:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s14, s6
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s12, s2
; GFX89-NEXT:    s_mov_b32 s13, s3
; GFX89-NEXT:    s_mov_b32 s15, s7
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-LABEL: fsub_f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s6, s10
; GFX11-NEXT:    s_mov_b32 s7, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s2
; GFX11-NEXT:    s_mov_b32 s13, s3
; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s0
; GFX11-NEXT:    s_mov_b32 s9, s1
; GFX11-NEXT:    v_sub_f16_e32 v0, v0, v1
; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load volatile half, ptr addrspace(1) %a
  %b.val = load volatile half, ptr addrspace(1) %b
  %r.val = fsub half %a.val, %b.val
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Scalar case with a constant left-hand side: r = 1.0 - b.
; On every CPU the expected subtract takes 1.0 directly as its first source
; operand (f32 subtract on tahiti, f16 subtract elsewhere).
define amdgpu_kernel void @fsub_f16_imm_a(
; SI-LABEL: fsub_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_sub_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fsub_f16_imm_a:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    v_sub_f16_e32 v0, 1.0, v0
; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-LABEL: fsub_f16_imm_a:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    v_sub_f16_e32 v0, 1.0, v0
; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %b) {
entry:
  %b.val = load volatile half, ptr addrspace(1) %b
  %r.val = fsub half 1.0, %b.val
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Scalar case with a constant right-hand side: r = a - 2.0.
; The expected output on every CPU is an add of the negated constant (-2.0)
; rather than a subtract.
define amdgpu_kernel void @fsub_f16_imm_b(
; SI-LABEL: fsub_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e32 v0, -2.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: fsub_f16_imm_b:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    v_add_f16_e32 v0, -2.0, v0
; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX11-LABEL: fsub_f16_imm_b:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    v_add_f16_e32 v0, -2.0, v0
; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load volatile half, ptr addrspace(1) %a
  %r.val = fsub half %a.val, 2.0
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Two-element vector case: r = a - b on <2 x half> (non-volatile loads).
; Expected output differs per CPU:
;   - tahiti: each 16-bit element is split out, widened to f32, subtracted,
;     narrowed, and the two results are re-packed with a shift and an OR.
;   - fiji: the high element uses v_sub_f16_sdwa, the low element v_sub_f16,
;     and the two are combined with an OR.
;   - gfx900 and gfx1100: one v_pk_add_f16 with neg_lo/neg_hi set on the
;     second source operand.
define amdgpu_kernel void @fsub_v2f16(
; SI-LABEL: fsub_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s14, s6
; SI-NEXT:    s_mov_b32 s15, s7
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_sub_f32_e32 v2, v3, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_sub_f32_e32 v0, v1, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fsub_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s14, s6
; VI-NEXT:    s_mov_b32 s15, s7
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_f16_e32 v0, v1, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fsub_v2f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s14, s6
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s12, s2
; GFX9-NEXT:    s_mov_b32 s13, s3
; GFX9-NEXT:    s_mov_b32 s15, s7
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: fsub_v2f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s6, s10
; GFX11-NEXT:    s_mov_b32 s7, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s2
; GFX11-NEXT:    s_mov_b32 s13, s3
; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT:    buffer_load_b32 v1, off, s[4:7], 0
; GFX11-NEXT:    s_mov_b32 s8, s0
; GFX11-NEXT:    s_mov_b32 s9, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fsub <2 x half> %a.val, %b.val
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; Vector case with a constant left-hand side: r = <1.0, 2.0> - b.
; Expected output per CPU:
;   - tahiti: per-element f32 subtracts with 1.0 (low) and 2.0 (high) as the
;     first source operand.
;   - fiji: 0x4000 (half 2.0) is moved into a VGPR for the high-element SDWA
;     subtract; the low element subtracts from the inline constant 1.0.
;   - gfx900: the packed constant 0x40003c00 is placed in an SGPR and added to
;     the loaded value with neg_lo/neg_hi applied to the loaded operand.
;   - gfx1100: the packed constant appears as a literal first operand of
;     v_pk_add_f16, with the neg modifiers on the loaded (second) operand.
define amdgpu_kernel void @fsub_v2f16_imm_a(
; SI-LABEL: fsub_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_sub_f32_e32 v1, 2.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_sub_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fsub_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    v_mov_b32_e32 v1, 0x4000
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_sub_f16_e32 v0, 1.0, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fsub_v2f16_imm_a:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s0, 0x40003c00
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0]
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: fsub_v2f16_imm_a:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %b) {
entry:
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fsub <2 x half> <half 1.0, half 2.0>, %b.val
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; Vector case with a constant right-hand side: r = a - <2.0, 1.0>.
; On every CPU the expected output adds the negated constants instead of
; subtracting:
;   - tahiti: f32 adds of -2.0 (low element) and -1.0 (high element).
;   - fiji: 0xbc00 (half -1.0) in a VGPR for the high-element SDWA add, and
;     the inline constant -2.0 for the low element.
;   - gfx900 / gfx1100: a plain v_pk_add_f16 (no neg modifiers) with the
;     packed constant 0xbc00c000, held in an SGPR on gfx900 and used as a
;     literal operand on gfx1100.
define amdgpu_kernel void @fsub_v2f16_imm_b(
; SI-LABEL: fsub_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e32 v1, -1.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_add_f32_e32 v0, -2.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fsub_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    v_mov_b32_e32 v1, 0xbc00
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_add_f16_e32 v0, -2.0, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fsub_v2f16_imm_b:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s0, 0xbc00c000
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_add_f16 v0, v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: fsub_v2f16_imm_b:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v0, 0xbc00c000, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = fsub <2 x half> %a.val, <half 2.0, half 1.0>
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}