; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s

; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs

; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
; for correctness.
; Intrinsic declarations.
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; Single-use fmul feeding an fadd; the checks shared by all run configurations
; expect exactly one v_fma_f64 and no separate multiply.
define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_f64_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_f64_0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, ptr addrspace(1) %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; The fmul has two fadd users; the checks expect both to become v_fma_f64
; (the multiply is folded into each user, no standalone v_mul_f64).
define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_f64_0_2use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[6:7], v[2:3], v[4:5], v[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_f64_0_2use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[4:5], v[0:1], v[2:3], v[4:5]
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2
  %d = load volatile double, ptr addrspace(1) %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, ptr addrspace(1) %gep.out.0
  store volatile double %fma1, ptr addrspace(1) %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; Commuted form of the first test: the fmul is the second fadd operand.
define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_f64_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_f64_1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, ptr addrspace(1) %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; The checks expect the subtrahend to appear as a negated addend operand of
; the v_fma_f64 rather than as a separate negate or subtract.
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_0_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -v[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_0_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, ptr addrspace(1) %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; Two fsub users of the same fmul; the checks expect two v_fma_f64, each with
; a negated addend, and no standalone v_mul_f64.
define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_f64_0_2use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[6:7], v[2:3], v[4:5], -v[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -v[8:9]
; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[4:5], v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2
  %d = load volatile double, ptr addrspace(1) %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, ptr addrspace(1) %gep.out.0
  store volatile double %fma1, ptr addrspace(1) %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; The fmul is the subtrahend; the checks expect the negation to be applied to
; the first multiplicand of the v_fma_f64.
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_1_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], v[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_1_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, ptr addrspace(1) %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; The fmul is the subtrahend of two fsubs; the checks expect two v_fma_f64
; with a negated first multiplicand and no standalone v_mul_f64.
define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_1_f64_2use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], v[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], v[8:9]
; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], v[4:5]
; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[6:7]
; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2
  %d = load volatile double, ptr addrspace(1) %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, ptr addrspace(1) %gep.out.0
  store volatile double %fma1, ptr addrspace(1) %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; The negation is written as "fsub -0.0, %mul"; the checks expect both the
; first multiplicand and the addend of the v_fma_f64 to be negated.
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_2_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], -v[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_2_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, ptr addrspace(1) %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; The negated product has two fsub users; the checks expect two v_fma_f64,
; each with negated multiplicand and addend.
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], -v[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], -v[8:9]
; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[6:7]
; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2
  %d = load volatile double, ptr addrspace(1) %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, ptr addrspace(1) %gep.out.0
  store volatile double %fma1, ptr addrspace(1) %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; %mul is used once through its negation and once directly; the checks expect
; one v_fma_f64 with negated multiplicand and addend, and one with only the
; addend negated.
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], -v[6:7]
; SI-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -v[8:9]
; SI-NEXT:    buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
; GFX11-NEXT:    global_store_b64 v8, v[4:5], s[0:1] dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.out.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %gep.out.1 = getelementptr double, ptr addrspace(1) %gep.out.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0
  %b = load volatile double, ptr addrspace(1) %gep.1
  %c = load volatile double, ptr addrspace(1) %gep.2
  %d = load volatile double, ptr addrspace(1) %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, ptr addrspace(1) %gep.out.0
  store volatile double %fma1, ptr addrspace(1) %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
; The NOFMA check prefixes expect the unfused sequence (v_mul_f64, v_fma_f64,
; v_add_f64 with a negated operand); the FMA check prefixes expect two chained
; v_fma_f64. NOTE(review): the FMA run lines differ from the NOFMA ones by the
; f32 denormal mode, -enable-no-infs-fp-math and -enable-unsafe-fp-math;
; which of these enables the fold for f64 is not visible here.
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; SI-NOFMA:       ; %bb.0:
; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-NOFMA-NEXT:    s_mov_b32 s6, 0
; SI-NOFMA-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NOFMA-NEXT:    v_mov_b32_e32 v1, 0
; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NOFMA-NEXT:    v_mul_f64 v[8:9], v[8:9], v[10:11]
; SI-NOFMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
; SI-NOFMA-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
; SI-NOFMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NOFMA-NEXT:    s_endpgm
;
; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; SI-FMA:       ; %bb.0:
; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-FMA-NEXT:    s_mov_b32 s6, 0
; SI-FMA-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-FMA-NEXT:    v_mov_b32_e32 v1, 0
; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-FMA-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-FMA-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], -v[6:7]
; SI-FMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
; SI-FMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-FMA-NEXT:    s_endpgm
;
; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; GFX11-NOFMA:       ; %bb.0:
; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NOFMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NOFMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
; GFX11-NOFMA-NEXT:    s_endpgm
;
; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; GFX11-FMA:       ; %bb.0:
; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5]
; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
; GFX11-FMA-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
  %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4
  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid

  %x = load volatile double, ptr addrspace(1) %gep.0
  %y = load volatile double, ptr addrspace(1) %gep.1
  %z = load volatile double, ptr addrspace(1) %gep.2
  %u = load volatile double, ptr addrspace(1) %gep.3
  %v = load volatile double, ptr addrspace(1) %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, ptr addrspace(1) %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
; SI-NOFMA:       ; %bb.0:
; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-NOFMA-NEXT:    s_mov_b32 s6, 0
; SI-NOFMA-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NOFMA-NEXT:    v_mov_b32_e32 v1, 0
; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; SI-NOFMA-NEXT:    s_waitcnt
vmcnt(0) 786; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc 787; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 788; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc 789; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 790; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc 791; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 792; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 793; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 794; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7] 795; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11] 796; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] 797; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] 798; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 799; SI-NOFMA-NEXT: s_endpgm 800; 801; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: 802; SI-FMA: ; %bb.0: 803; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 804; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 805; SI-FMA-NEXT: s_mov_b32 s6, 0 806; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 807; SI-FMA-NEXT: v_mov_b32_e32 v1, 0 808; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) 809; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3] 810; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc 811; SI-FMA-NEXT: s_waitcnt vmcnt(0) 812; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc 813; SI-FMA-NEXT: s_waitcnt vmcnt(0) 814; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc 815; SI-FMA-NEXT: s_waitcnt vmcnt(0) 816; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc 817; SI-FMA-NEXT: s_waitcnt vmcnt(0) 818; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc 819; SI-FMA-NEXT: s_waitcnt vmcnt(0) 820; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7] 821; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[8:9], v[10:11], v[2:3] 822; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[4:5], v[6:7], v[2:3] 823; SI-FMA-NEXT: 
buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 824; SI-FMA-NEXT: s_endpgm 825; 826; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: 827; GFX11-NOFMA: ; %bb.0: 828; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 829; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 830; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 831; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 832; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 833; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc 834; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 835; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc 836; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 837; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc 838; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 839; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc 840; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 841; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc 842; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 843; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] 844; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 845; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] 846; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 847; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] 848; GFX11-NOFMA-NEXT: s_endpgm 849; 850; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: 851; GFX11-FMA: ; %bb.0: 852; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 853; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 854; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 855; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 856; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 857; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc 858; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 859; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc 860; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 861; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] 
offset:16 glc dlc 862; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 863; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc 864; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 865; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc 866; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 867; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1] 868; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 869; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1] 870; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] 871; GFX11-FMA-NEXT: s_endpgm 872 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 873 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid 874 %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 875 %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2 876 %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3 877 %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4 878 %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid 879 880 %x = load volatile double, ptr addrspace(1) %gep.0 881 %y = load volatile double, ptr addrspace(1) %gep.1 882 %z = load volatile double, ptr addrspace(1) %gep.2 883 %u = load volatile double, ptr addrspace(1) %gep.3 884 %v = load volatile double, ptr addrspace(1) %gep.4 885 886 ; nsz flag is needed since this combine may change sign of zero 887 %tmp0 = fmul nsz double %u, %v 888 %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 889 %tmp2 = fsub nsz double %x, %tmp1 890 891 store double %tmp2, ptr addrspace(1) %gep.out 892 ret void 893} 894 895; 896; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y) 897; 898 899define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, 900; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y: 901; SI-NOFMA: ; %bb.0: 902; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 903; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 904; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 905; 
SI-NOFMA-NEXT: s_mov_b32 s6, -1 906; SI-NOFMA-NEXT: s_mov_b32 s14, s6 907; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 908; SI-NOFMA-NEXT: s_mov_b32 s12, s2 909; SI-NOFMA-NEXT: s_mov_b32 s13, s3 910; SI-NOFMA-NEXT: s_mov_b32 s15, s7 911; SI-NOFMA-NEXT: s_mov_b32 s10, s6 912; SI-NOFMA-NEXT: s_mov_b32 s11, s7 913; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc 914; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 915; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc 916; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 917; SI-NOFMA-NEXT: s_mov_b32 s4, s0 918; SI-NOFMA-NEXT: s_mov_b32 s5, s1 919; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 920; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 921; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 922; SI-NOFMA-NEXT: s_endpgm 923; 924; SI-FMA-LABEL: test_f32_mul_add_x_one_y: 925; SI-FMA: ; %bb.0: 926; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 927; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 928; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 929; SI-FMA-NEXT: s_mov_b32 s6, -1 930; SI-FMA-NEXT: s_mov_b32 s14, s6 931; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) 932; SI-FMA-NEXT: s_mov_b32 s12, s2 933; SI-FMA-NEXT: s_mov_b32 s13, s3 934; SI-FMA-NEXT: s_mov_b32 s15, s7 935; SI-FMA-NEXT: s_mov_b32 s10, s6 936; SI-FMA-NEXT: s_mov_b32 s11, s7 937; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc 938; SI-FMA-NEXT: s_waitcnt vmcnt(0) 939; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc 940; SI-FMA-NEXT: s_waitcnt vmcnt(0) 941; SI-FMA-NEXT: s_mov_b32 s4, s0 942; SI-FMA-NEXT: s_mov_b32 s5, s1 943; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 944; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 945; SI-FMA-NEXT: s_endpgm 946; 947; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y: 948; GFX11-NOFMA: ; %bb.0: 949; GFX11-NOFMA-NEXT: s_clause 0x1 950; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 951; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 952; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 953; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 954; GFX11-NOFMA-NEXT: 
global_load_b32 v1, v0, s[2:3] glc dlc 955; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 956; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc 957; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 958; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 959; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 960; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 961; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] 962; GFX11-NOFMA-NEXT: s_endpgm 963; 964; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y: 965; GFX11-FMA: ; %bb.0: 966; GFX11-FMA-NEXT: s_clause 0x1 967; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 968; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 969; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 970; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 971; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 972; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 973; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc 974; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 975; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 976; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] 977; GFX11-FMA-NEXT: s_endpgm 978 ptr addrspace(1) %in1, 979 ptr addrspace(1) %in2) { 980 %x = load volatile float, ptr addrspace(1) %in1 981 %y = load volatile float, ptr addrspace(1) %in2 982 %a = fadd float %x, 1.0 983 %m = fmul float %a, %y 984 store float %m, ptr addrspace(1) %out 985 ret void 986} 987 988define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, 989; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one: 990; SI-NOFMA: ; %bb.0: 991; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 992; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 993; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 994; SI-NOFMA-NEXT: s_mov_b32 s6, -1 995; SI-NOFMA-NEXT: s_mov_b32 s14, s6 996; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 997; SI-NOFMA-NEXT: s_mov_b32 s12, s2 998; SI-NOFMA-NEXT: s_mov_b32 s13, s3 999; SI-NOFMA-NEXT: s_mov_b32 s15, s7 1000; SI-NOFMA-NEXT: s_mov_b32 s10, s6 1001; SI-NOFMA-NEXT: s_mov_b32 s11, s7 1002; SI-NOFMA-NEXT: buffer_load_dword v0, off, 
s[12:15], 0 glc 1003; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 1004; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc 1005; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 1006; SI-NOFMA-NEXT: s_mov_b32 s4, s0 1007; SI-NOFMA-NEXT: s_mov_b32 s5, s1 1008; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 1009; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 1010; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1011; SI-NOFMA-NEXT: s_endpgm 1012; 1013; SI-FMA-LABEL: test_f32_mul_y_add_x_one: 1014; SI-FMA: ; %bb.0: 1015; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1016; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1017; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 1018; SI-FMA-NEXT: s_mov_b32 s6, -1 1019; SI-FMA-NEXT: s_mov_b32 s14, s6 1020; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) 1021; SI-FMA-NEXT: s_mov_b32 s12, s2 1022; SI-FMA-NEXT: s_mov_b32 s13, s3 1023; SI-FMA-NEXT: s_mov_b32 s15, s7 1024; SI-FMA-NEXT: s_mov_b32 s10, s6 1025; SI-FMA-NEXT: s_mov_b32 s11, s7 1026; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc 1027; SI-FMA-NEXT: s_waitcnt vmcnt(0) 1028; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc 1029; SI-FMA-NEXT: s_waitcnt vmcnt(0) 1030; SI-FMA-NEXT: s_mov_b32 s4, s0 1031; SI-FMA-NEXT: s_mov_b32 s5, s1 1032; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 1033; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1034; SI-FMA-NEXT: s_endpgm 1035; 1036; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one: 1037; GFX11-NOFMA: ; %bb.0: 1038; GFX11-NOFMA-NEXT: s_clause 0x1 1039; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1040; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1041; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 1042; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 1043; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 1044; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 1045; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc 1046; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 1047; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 1048; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 1049; 
GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 1050; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] 1051; GFX11-NOFMA-NEXT: s_endpgm 1052; 1053; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one: 1054; GFX11-FMA: ; %bb.0: 1055; GFX11-FMA-NEXT: s_clause 0x1 1056; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1057; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1058; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 1059; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 1060; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 1061; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 1062; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc 1063; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 1064; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 1065; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] 1066; GFX11-FMA-NEXT: s_endpgm 1067 ptr addrspace(1) %in1, 1068 ptr addrspace(1) %in2) { 1069 %x = load volatile float, ptr addrspace(1) %in1 1070 %y = load volatile float, ptr addrspace(1) %in2 1071 %a = fadd float %x, 1.0 1072 %m = fmul float %y, %a 1073 store float %m, ptr addrspace(1) %out 1074 ret void 1075} 1076 1077define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, 1078; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y: 1079; SI-NOFMA: ; %bb.0: 1080; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1081; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1082; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 1083; SI-NOFMA-NEXT: s_mov_b32 s6, -1 1084; SI-NOFMA-NEXT: s_mov_b32 s14, s6 1085; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 1086; SI-NOFMA-NEXT: s_mov_b32 s12, s2 1087; SI-NOFMA-NEXT: s_mov_b32 s13, s3 1088; SI-NOFMA-NEXT: s_mov_b32 s15, s7 1089; SI-NOFMA-NEXT: s_mov_b32 s10, s6 1090; SI-NOFMA-NEXT: s_mov_b32 s11, s7 1091; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 1092; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 1093; SI-NOFMA-NEXT: s_mov_b32 s4, s0 1094; SI-NOFMA-NEXT: s_mov_b32 s5, s1 1095; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) 1096; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 1097; SI-NOFMA-NEXT: 
s_waitcnt vmcnt(0) 1098; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 1099; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1100; SI-NOFMA-NEXT: s_endpgm 1101; 1102; SI-FMA-LABEL: test_f32_mul_add_x_negone_y: 1103; SI-FMA: ; %bb.0: 1104; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1105; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1106; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 1107; SI-FMA-NEXT: s_mov_b32 s6, -1 1108; SI-FMA-NEXT: s_mov_b32 s14, s6 1109; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) 1110; SI-FMA-NEXT: s_mov_b32 s12, s2 1111; SI-FMA-NEXT: s_mov_b32 s13, s3 1112; SI-FMA-NEXT: s_mov_b32 s15, s7 1113; SI-FMA-NEXT: s_mov_b32 s10, s6 1114; SI-FMA-NEXT: s_mov_b32 s11, s7 1115; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 1116; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 1117; SI-FMA-NEXT: s_mov_b32 s4, s0 1118; SI-FMA-NEXT: s_mov_b32 s5, s1 1119; SI-FMA-NEXT: s_waitcnt vmcnt(0) 1120; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 1121; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1122; SI-FMA-NEXT: s_endpgm 1123; 1124; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y: 1125; GFX11-NOFMA: ; %bb.0: 1126; GFX11-NOFMA-NEXT: s_clause 0x1 1127; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1128; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1129; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 1130; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 1131; GFX11-NOFMA-NEXT: s_clause 0x1 1132; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] 1133; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] 1134; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) 1135; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 1136; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 1137; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 1138; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 1139; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] 1140; GFX11-NOFMA-NEXT: s_endpgm 1141; 1142; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y: 1143; GFX11-FMA: ; %bb.0: 1144; GFX11-FMA-NEXT: s_clause 0x1 1145; GFX11-FMA-NEXT: s_load_b128 s[0:3], 
s[4:5], 0x24 1146; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1147; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 1148; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 1149; GFX11-FMA-NEXT: s_clause 0x1 1150; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] 1151; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] 1152; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 1153; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 1154; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 1155; GFX11-FMA-NEXT: s_endpgm 1156 ptr addrspace(1) %in1, 1157 ptr addrspace(1) %in2) { 1158 %x = load float, ptr addrspace(1) %in1 1159 %y = load float, ptr addrspace(1) %in2 1160 %a = fadd float %x, -1.0 1161 %m = fmul float %a, %y 1162 store float %m, ptr addrspace(1) %out 1163 ret void 1164} 1165 1166define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, 1167; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone: 1168; SI-NOFMA: ; %bb.0: 1169; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1170; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1171; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 1172; SI-NOFMA-NEXT: s_mov_b32 s6, -1 1173; SI-NOFMA-NEXT: s_mov_b32 s14, s6 1174; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 1175; SI-NOFMA-NEXT: s_mov_b32 s12, s2 1176; SI-NOFMA-NEXT: s_mov_b32 s13, s3 1177; SI-NOFMA-NEXT: s_mov_b32 s15, s7 1178; SI-NOFMA-NEXT: s_mov_b32 s10, s6 1179; SI-NOFMA-NEXT: s_mov_b32 s11, s7 1180; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 1181; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 1182; SI-NOFMA-NEXT: s_mov_b32 s4, s0 1183; SI-NOFMA-NEXT: s_mov_b32 s5, s1 1184; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) 1185; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 1186; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 1187; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 1188; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1189; SI-NOFMA-NEXT: s_endpgm 1190; 1191; SI-FMA-LABEL: test_f32_mul_y_add_x_negone: 1192; SI-FMA: ; %bb.0: 1193; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1194; SI-FMA-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0xd 1195; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 1196; SI-FMA-NEXT: s_mov_b32 s6, -1 1197; SI-FMA-NEXT: s_mov_b32 s14, s6 1198; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) 1199; SI-FMA-NEXT: s_mov_b32 s12, s2 1200; SI-FMA-NEXT: s_mov_b32 s13, s3 1201; SI-FMA-NEXT: s_mov_b32 s15, s7 1202; SI-FMA-NEXT: s_mov_b32 s10, s6 1203; SI-FMA-NEXT: s_mov_b32 s11, s7 1204; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 1205; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 1206; SI-FMA-NEXT: s_mov_b32 s4, s0 1207; SI-FMA-NEXT: s_mov_b32 s5, s1 1208; SI-FMA-NEXT: s_waitcnt vmcnt(0) 1209; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 1210; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1211; SI-FMA-NEXT: s_endpgm 1212; 1213; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone: 1214; GFX11-NOFMA: ; %bb.0: 1215; GFX11-NOFMA-NEXT: s_clause 0x1 1216; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1217; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1218; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 1219; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 1220; GFX11-NOFMA-NEXT: s_clause 0x1 1221; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] 1222; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] 1223; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) 1224; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 1225; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 1226; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 1227; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 1228; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] 1229; GFX11-NOFMA-NEXT: s_endpgm 1230; 1231; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone: 1232; GFX11-FMA: ; %bb.0: 1233; GFX11-FMA-NEXT: s_clause 0x1 1234; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1235; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1236; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 1237; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 1238; GFX11-FMA-NEXT: s_clause 0x1 1239; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] 1240; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] 1241; GFX11-FMA-NEXT: s_waitcnt 
vmcnt(0) 1242; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 1243; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 1244; GFX11-FMA-NEXT: s_endpgm 1245 ptr addrspace(1) %in1, 1246 ptr addrspace(1) %in2) { 1247 %x = load float, ptr addrspace(1) %in1 1248 %y = load float, ptr addrspace(1) %in2 1249 %a = fadd float %x, -1.0 1250 %m = fmul float %y, %a 1251 store float %m, ptr addrspace(1) %out 1252 ret void 1253} 1254 1255define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, 1256; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y: 1257; SI-NOFMA: ; %bb.0: 1258; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1259; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1260; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 1261; SI-NOFMA-NEXT: s_mov_b32 s6, -1 1262; SI-NOFMA-NEXT: s_mov_b32 s14, s6 1263; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 1264; SI-NOFMA-NEXT: s_mov_b32 s12, s2 1265; SI-NOFMA-NEXT: s_mov_b32 s13, s3 1266; SI-NOFMA-NEXT: s_mov_b32 s15, s7 1267; SI-NOFMA-NEXT: s_mov_b32 s10, s6 1268; SI-NOFMA-NEXT: s_mov_b32 s11, s7 1269; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 1270; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 1271; SI-NOFMA-NEXT: s_mov_b32 s4, s0 1272; SI-NOFMA-NEXT: s_mov_b32 s5, s1 1273; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) 1274; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 1275; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 1276; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 1277; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1278; SI-NOFMA-NEXT: s_endpgm 1279; 1280; SI-FMA-LABEL: test_f32_mul_sub_one_x_y: 1281; SI-FMA: ; %bb.0: 1282; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1283; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1284; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 1285; SI-FMA-NEXT: s_mov_b32 s6, -1 1286; SI-FMA-NEXT: s_mov_b32 s14, s6 1287; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) 1288; SI-FMA-NEXT: s_mov_b32 s12, s2 1289; SI-FMA-NEXT: s_mov_b32 s13, s3 1290; SI-FMA-NEXT: s_mov_b32 s15, s7 1291; SI-FMA-NEXT: s_mov_b32 s10, s6 1292; SI-FMA-NEXT: 
s_mov_b32 s11, s7 1293; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 1294; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 1295; SI-FMA-NEXT: s_mov_b32 s4, s0 1296; SI-FMA-NEXT: s_mov_b32 s5, s1 1297; SI-FMA-NEXT: s_waitcnt vmcnt(0) 1298; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 1299; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1300; SI-FMA-NEXT: s_endpgm 1301; 1302; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y: 1303; GFX11-NOFMA: ; %bb.0: 1304; GFX11-NOFMA-NEXT: s_clause 0x1 1305; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1306; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1307; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 1308; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 1309; GFX11-NOFMA-NEXT: s_clause 0x1 1310; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] 1311; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] 1312; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) 1313; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1 1314; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 1315; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 1316; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 1317; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] 1318; GFX11-NOFMA-NEXT: s_endpgm 1319; 1320; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y: 1321; GFX11-FMA: ; %bb.0: 1322; GFX11-FMA-NEXT: s_clause 0x1 1323; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1324; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1325; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 1326; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 1327; GFX11-FMA-NEXT: s_clause 0x1 1328; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] 1329; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] 1330; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 1331; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2 1332; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 1333; GFX11-FMA-NEXT: s_endpgm 1334 ptr addrspace(1) %in1, 1335 ptr addrspace(1) %in2) { 1336 %x = load float, ptr addrspace(1) %in1 1337 %y = load float, ptr addrspace(1) %in2 1338 %s = fsub float 1.0, %x 1339 %m = fmul float %s, 
%y 1340 store float %m, ptr addrspace(1) %out 1341 ret void 1342} 1343 1344define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, 1345; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x: 1346; SI-NOFMA: ; %bb.0: 1347; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1348; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1349; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 1350; SI-NOFMA-NEXT: s_mov_b32 s6, -1 1351; SI-NOFMA-NEXT: s_mov_b32 s14, s6 1352; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 1353; SI-NOFMA-NEXT: s_mov_b32 s12, s2 1354; SI-NOFMA-NEXT: s_mov_b32 s13, s3 1355; SI-NOFMA-NEXT: s_mov_b32 s15, s7 1356; SI-NOFMA-NEXT: s_mov_b32 s10, s6 1357; SI-NOFMA-NEXT: s_mov_b32 s11, s7 1358; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 1359; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 1360; SI-NOFMA-NEXT: s_mov_b32 s4, s0 1361; SI-NOFMA-NEXT: s_mov_b32 s5, s1 1362; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) 1363; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 1364; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) 1365; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 1366; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1367; SI-NOFMA-NEXT: s_endpgm 1368; 1369; SI-FMA-LABEL: test_f32_mul_y_sub_one_x: 1370; SI-FMA: ; %bb.0: 1371; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1372; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1373; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 1374; SI-FMA-NEXT: s_mov_b32 s6, -1 1375; SI-FMA-NEXT: s_mov_b32 s14, s6 1376; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) 1377; SI-FMA-NEXT: s_mov_b32 s12, s2 1378; SI-FMA-NEXT: s_mov_b32 s13, s3 1379; SI-FMA-NEXT: s_mov_b32 s15, s7 1380; SI-FMA-NEXT: s_mov_b32 s10, s6 1381; SI-FMA-NEXT: s_mov_b32 s11, s7 1382; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 1383; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 1384; SI-FMA-NEXT: s_mov_b32 s4, s0 1385; SI-FMA-NEXT: s_mov_b32 s5, s1 1386; SI-FMA-NEXT: s_waitcnt vmcnt(0) 1387; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 1388; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 
1389; SI-FMA-NEXT: s_endpgm 1390; 1391; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x: 1392; GFX11-NOFMA: ; %bb.0: 1393; GFX11-NOFMA-NEXT: s_clause 0x1 1394; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1395; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1396; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 1397; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) 1398; GFX11-NOFMA-NEXT: s_clause 0x1 1399; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] 1400; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] 1401; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) 1402; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1 1403; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) 1404; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 1405; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 1406; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] 1407; GFX11-NOFMA-NEXT: s_endpgm 1408; 1409; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x: 1410; GFX11-FMA: ; %bb.0: 1411; GFX11-FMA-NEXT: s_clause 0x1 1412; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1413; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1414; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 1415; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX11-FMA-NEXT: s_clause 0x1 1417; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] 1418; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] 1419; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 1420; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2 1421; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 1422; GFX11-FMA-NEXT: s_endpgm 1423 ptr addrspace(1) %in1, 1424 ptr addrspace(1) %in2) { 1425 %x = load float, ptr addrspace(1) %in1 1426 %y = load float, ptr addrspace(1) %in2 1427 %s = fsub float 1.0, %x 1428 %m = fmul float %y, %s 1429 store float %m, ptr addrspace(1) %out 1430 ret void 1431} 1432 1433define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, 1434; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: 1435; SI-NOFMA: ; %bb.0: 1436; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1437; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 
; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; SI-NOFMA-NEXT:    v_sub_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT:    s_endpgm
;
; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y:
; SI-FMA:       ; %bb.0:
; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-FMA-NEXT:    s_mov_b32 s6, -1
; SI-FMA-NEXT:    s_mov_b32 s14, s6
; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT:    s_mov_b32 s12, s2
; SI-FMA-NEXT:    s_mov_b32 s13, s3
; SI-FMA-NEXT:    s_mov_b32 s15, s7
; SI-FMA-NEXT:    s_mov_b32 s10, s6
; SI-FMA-NEXT:    s_mov_b32 s11, s7
; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-FMA-NEXT:    s_mov_b32 s4, s0
; SI-FMA-NEXT:    s_mov_b32 s5, s1
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    v_fma_f32 v0, -v0, v1, -v1
; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT:    s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y:
; GFX11-NOFMA:       ; %bb.0:
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT:    v_sub_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NOFMA-NEXT:    s_endpgm
;
; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y:
; GFX11-FMA:       ; %bb.0:
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    v_fma_f32 v1, -v1, v2, -v2
; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-FMA-NEXT:    s_endpgm
                                        ptr addrspace(1) %in1,
                                        ptr addrspace(1) %in2) {
  %x = load float, ptr addrspace(1) %in1
  %y = load float, ptr addrspace(1) %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, ptr addrspace(1) %out
  ret void
}

; (fmul y, (fsub -1.0, x)): the fsub result is the right-hand multiply operand.
; The FMA check blocks expect one v_fma_f32 computing (-x * y) - y; the NOFMA
; check blocks expect a separate v_sub_f32 followed by v_mul_f32.
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x:
; SI-NOFMA:       ; %bb.0:
; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; SI-NOFMA-NEXT:    v_sub_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v1, v0
; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT:    s_endpgm
;
; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x:
; SI-FMA:       ; %bb.0:
; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-FMA-NEXT:    s_mov_b32 s6, -1
; SI-FMA-NEXT:    s_mov_b32 s14, s6
; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT:    s_mov_b32 s12, s2
; SI-FMA-NEXT:    s_mov_b32 s13, s3
; SI-FMA-NEXT:    s_mov_b32 s15, s7
; SI-FMA-NEXT:    s_mov_b32 s10, s6
; SI-FMA-NEXT:    s_mov_b32 s11, s7
; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-FMA-NEXT:    s_mov_b32 s4, s0
; SI-FMA-NEXT:    s_mov_b32 s5, s1
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    v_fma_f32 v0, -v0, v1, -v1
; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT:    s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x:
; GFX11-NOFMA:       ; %bb.0:
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT:    v_sub_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NOFMA-NEXT:    s_endpgm
;
; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x:
; GFX11-FMA:       ; %bb.0:
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    v_fma_f32 v1, -v1, v2, -v2
; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-FMA-NEXT:    s_endpgm
                                        ptr addrspace(1) %in1,
                                        ptr addrspace(1) %in2) {
  %x = load float, ptr addrspace(1) %in1
  %y = load float, ptr addrspace(1) %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, ptr addrspace(1) %out
  ret void
}

; (fmul (fsub x, 1.0), y): the FMA check blocks expect one v_fma_f32 computing
; (x * y) - y; the NOFMA check blocks expect v_add_f32 with -1.0 followed by
; v_mul_f32.
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y:
; SI-NOFMA:       ; %bb.0:
; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; SI-NOFMA-NEXT:    v_add_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT:    s_endpgm
;
; SI-FMA-LABEL: test_f32_mul_sub_x_one_y:
; SI-FMA:       ; %bb.0:
; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-FMA-NEXT:    s_mov_b32 s6, -1
; SI-FMA-NEXT:    s_mov_b32 s14, s6
; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT:    s_mov_b32 s12, s2
; SI-FMA-NEXT:    s_mov_b32 s13, s3
; SI-FMA-NEXT:    s_mov_b32 s15, s7
; SI-FMA-NEXT:    s_mov_b32 s10, s6
; SI-FMA-NEXT:    s_mov_b32 s11, s7
; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-FMA-NEXT:    s_mov_b32 s4, s0
; SI-FMA-NEXT:    s_mov_b32 s5, s1
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, -v1
; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT:    s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y:
; GFX11-NOFMA:       ; %bb.0:
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NOFMA-NEXT:    s_endpgm
;
; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y:
; GFX11-FMA:       ; %bb.0:
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-FMA-NEXT:    s_endpgm
                                        ptr addrspace(1) %in1,
                                        ptr addrspace(1) %in2) {
  %x = load float, ptr addrspace(1) %in1
  %y = load float, ptr addrspace(1) %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, ptr addrspace(1) %out
  ret void
}

; (fmul y, (fsub x, 1.0)): commuted form of test_f32_mul_sub_x_one_y. The FMA
; check blocks expect the same v_fma_f32 computing (x * y) - y; the NOFMA check
; blocks differ only in the v_mul_f32 operand order.
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one:
; SI-NOFMA:       ; %bb.0:
; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; SI-NOFMA-NEXT:    v_add_f32_e32 v0, -1.0, v0
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v1, v0
; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT:    s_endpgm
;
; SI-FMA-LABEL: test_f32_mul_y_sub_x_one:
; SI-FMA:       ; %bb.0:
; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-FMA-NEXT:    s_mov_b32 s6, -1
; SI-FMA-NEXT:    s_mov_b32 s14, s6
; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT:    s_mov_b32 s12, s2
; SI-FMA-NEXT:    s_mov_b32 s13, s3
; SI-FMA-NEXT:    s_mov_b32 s15, s7
; SI-FMA-NEXT:    s_mov_b32 s10, s6
; SI-FMA-NEXT:    s_mov_b32 s11, s7
; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-FMA-NEXT:    s_mov_b32 s4, s0
; SI-FMA-NEXT:    s_mov_b32 s5, s1
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, -v1
; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT:    s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one:
; GFX11-NOFMA:       ; %bb.0:
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NOFMA-NEXT:    s_endpgm
;
; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one:
; GFX11-FMA:       ; %bb.0:
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-FMA-NEXT:    s_endpgm
                                        ptr addrspace(1) %in1,
                                        ptr addrspace(1) %in2) {
  %x = load float, ptr addrspace(1) %in1
  %y = load float, ptr addrspace(1) %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, ptr addrspace(1) %out
  ret void
}

; (fmul (fsub x, -1.0), y): the FMA check blocks expect one fused operation
; computing (x * y) + y (v_fma_f32 on SI, v_fmac_f32 on GFX11); the NOFMA check
; blocks expect v_add_f32 with 1.0 followed by v_mul_f32.
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y:
; SI-NOFMA:       ; %bb.0:
; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; SI-NOFMA-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT:    s_endpgm
;
; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y:
; SI-FMA:       ; %bb.0:
; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-FMA-NEXT:    s_mov_b32 s6, -1
; SI-FMA-NEXT:    s_mov_b32 s14, s6
; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT:    s_mov_b32 s12, s2
; SI-FMA-NEXT:    s_mov_b32 s13, s3
; SI-FMA-NEXT:    s_mov_b32 s15, s7
; SI-FMA-NEXT:    s_mov_b32 s10, s6
; SI-FMA-NEXT:    s_mov_b32 s11, s7
; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-FMA-NEXT:    s_mov_b32 s4, s0
; SI-FMA-NEXT:    s_mov_b32 s5, s1
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, v1
; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT:    s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y:
; GFX11-NOFMA:       ; %bb.0:
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NOFMA-NEXT:    s_endpgm
;
; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y:
; GFX11-FMA:       ; %bb.0:
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[0:1]
; GFX11-FMA-NEXT:    s_endpgm
                                        ptr addrspace(1) %in1,
                                        ptr addrspace(1) %in2) {
  %x = load float, ptr addrspace(1) %in1
  %y = load float, ptr addrspace(1) %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, ptr addrspace(1) %out
  ret void
}

; (fmul y, (fsub x, -1.0)): commuted form of test_f32_mul_sub_x_negone_y. The
; FMA check blocks expect the same fused (x * y) + y; the NOFMA check blocks
; differ only in the v_mul_f32 operand order.
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone:
; SI-NOFMA:       ; %bb.0:
; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-NOFMA-NEXT:    s_mov_b32 s6, -1
; SI-NOFMA-NEXT:    s_mov_b32 s14, s6
; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
; SI-NOFMA-NEXT:    s_mov_b32 s15, s7
; SI-NOFMA-NEXT:    s_mov_b32 s10, s6
; SI-NOFMA-NEXT:    s_mov_b32 s11, s7
; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT:    s_mov_b32 s4, s0
; SI-NOFMA-NEXT:    s_mov_b32 s5, s1
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; SI-NOFMA-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    v_mul_f32_e32 v0, v1, v0
; SI-NOFMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT:    s_endpgm
;
; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone:
; SI-FMA:       ; %bb.0:
; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-FMA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-FMA-NEXT:    s_mov_b32 s7, 0xf000
; SI-FMA-NEXT:    s_mov_b32 s6, -1
; SI-FMA-NEXT:    s_mov_b32 s14, s6
; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT:    s_mov_b32 s12, s2
; SI-FMA-NEXT:    s_mov_b32 s13, s3
; SI-FMA-NEXT:    s_mov_b32 s15, s7
; SI-FMA-NEXT:    s_mov_b32 s10, s6
; SI-FMA-NEXT:    s_mov_b32 s11, s7
; SI-FMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-FMA-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-FMA-NEXT:    s_mov_b32 s4, s0
; SI-FMA-NEXT:    s_mov_b32 s5, s1
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    v_fma_f32 v0, v0, v1, v1
; SI-FMA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-FMA-NEXT:    s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone:
; GFX11-NOFMA:       ; %bb.0:
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NOFMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT:    s_clause 0x1
; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NOFMA-NEXT:    s_endpgm
;
; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone:
; GFX11-FMA:       ; %bb.0:
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FMA-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    s_clause 0x1
; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[0:1]
; GFX11-FMA-NEXT:    s_endpgm
                                        ptr addrspace(1) %in1,
                                        ptr addrspace(1) %in2) {
  %x = load float, ptr addrspace(1) %in1
  %y = load float, ptr addrspace(1) %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, ptr addrspace(1) %out
  ret void
}

;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;

; f32 interpolation x*t + y*(1.0 - t). The FMA check blocks expect two fused
; operations, (-t * y) + y and then (x * t) + that result. The NOFMA check
; blocks expect a separate v_sub_f32 and v_mul_f32 followed by one
; multiply-accumulate (v_mac_f32 on SI, v_fmac_f32 on GFX11).
define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
; SI-NOFMA-LABEL: test_f32_interp:
; SI-NOFMA:       ; %bb.0:
; SI-NOFMA-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NOFMA-NEXT:    s_mov_b32 s11, 0xf000
; SI-NOFMA-NEXT:    s_mov_b32 s10, -1
; SI-NOFMA-NEXT:    s_mov_b32 s14, s10
; SI-NOFMA-NEXT:    s_mov_b32 s15, s11
; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NOFMA-NEXT:    s_mov_b32 s16, s4
; SI-NOFMA-NEXT:    s_mov_b32 s17, s5
; SI-NOFMA-NEXT:    s_mov_b32 s4, s6
; SI-NOFMA-NEXT:    s_mov_b32 s5, s7
; SI-NOFMA-NEXT:    s_mov_b32 s6, s10
; SI-NOFMA-NEXT:    s_mov_b32 s7, s11
; SI-NOFMA-NEXT:    s_mov_b32 s12, s2
; SI-NOFMA-NEXT:    s_mov_b32 s13, s3
; SI-NOFMA-NEXT:    s_mov_b32 s18, s10
; SI-NOFMA-NEXT:    s_mov_b32 s19, s11
; SI-NOFMA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NOFMA-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NOFMA-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; SI-NOFMA-NEXT:    s_mov_b32 s8, s0
; SI-NOFMA-NEXT:    s_mov_b32 s9, s1
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(2)
; SI-NOFMA-NEXT:    v_sub_f32_e32 v3, 1.0, v0
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; SI-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v3
; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT:    v_mac_f32_e32 v1, v2, v0
; SI-NOFMA-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; SI-NOFMA-NEXT:    s_endpgm
;
; SI-FMA-LABEL: test_f32_interp:
; SI-FMA:       ; %bb.0:
; SI-FMA-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-FMA-NEXT:    s_mov_b32 s11, 0xf000
; SI-FMA-NEXT:    s_mov_b32 s10, -1
; SI-FMA-NEXT:    s_mov_b32 s18, s10
; SI-FMA-NEXT:    s_mov_b32 s19, s11
; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT:    s_mov_b32 s16, s4
; SI-FMA-NEXT:    s_mov_b32 s17, s5
; SI-FMA-NEXT:    s_mov_b32 s14, s10
; SI-FMA-NEXT:    s_mov_b32 s12, s2
; SI-FMA-NEXT:    s_mov_b32 s13, s3
; SI-FMA-NEXT:    s_mov_b32 s15, s11
; SI-FMA-NEXT:    s_mov_b32 s4, s6
; SI-FMA-NEXT:    s_mov_b32 s5, s7
; SI-FMA-NEXT:    s_mov_b32 s6, s10
; SI-FMA-NEXT:    s_mov_b32 s7, s11
; SI-FMA-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; SI-FMA-NEXT:    buffer_load_dword v1, off, s[4:7], 0
; SI-FMA-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; SI-FMA-NEXT:    s_mov_b32 s8, s0
; SI-FMA-NEXT:    s_mov_b32 s9, s1
; SI-FMA-NEXT:    s_waitcnt vmcnt(1)
; SI-FMA-NEXT:    v_fma_f32 v0, -v1, v0, v0
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    v_fma_f32 v0, v2, v1, v0
; SI-FMA-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-FMA-NEXT:    s_endpgm
;
; GFX11-NOFMA-LABEL: test_f32_interp:
; GFX11-NOFMA:       ; %bb.0:
; GFX11-NOFMA-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT:    s_clause 0x2
; GFX11-NOFMA-NEXT:    global_load_b32 v1, v0, s[6:7]
; GFX11-NOFMA-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-NOFMA-NEXT:    global_load_b32 v3, v0, s[2:3]
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NOFMA-NEXT:    v_sub_f32_e32 v4, 1.0, v1
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v2, v2, v4
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    v_fmac_f32_e32 v2, v3, v1
; GFX11-NOFMA-NEXT:    global_store_b32 v0, v2, s[0:1]
; GFX11-NOFMA-NEXT:    s_endpgm
;
; GFX11-FMA-LABEL: test_f32_interp:
; GFX11-FMA:       ; %bb.0:
; GFX11-FMA-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    s_clause 0x2
; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[4:5]
; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[6:7]
; GFX11-FMA-NEXT:    global_load_b32 v3, v0, s[2:3]
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(1)
; GFX11-FMA-NEXT:    v_fma_f32 v1, -v2, v1, v1
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT:    v_fmac_f32_e32 v1, v3, v2
; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-FMA-NEXT:    s_endpgm
                           ptr addrspace(1) %in1,
                           ptr addrspace(1) %in2,
                           ptr addrspace(1) %in3) {
  %x = load float, ptr addrspace(1) %in1
  %y = load float, ptr addrspace(1) %in2
  %t = load float, ptr addrspace(1) %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, ptr addrspace(1) %out
  ret void
}

; f64 variant of the interpolation test: x*t + y*(1.0 - t) on double. The FMA
; check blocks expect two v_fma_f64; the GFX11 NOFMA block expects v_add_f64,
; v_mul_f64 and a single v_fma_f64.
; NOTE(review): unlike test_f32_interp, this function has no SI-NOFMA check
; block. Presumably the two runs sharing that prefix (tahiti and verde)
; generate different code here, so the update script emitted no common
; assertions; confirm, and consider giving those runs distinct prefixes.
define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
; SI-FMA-LABEL: test_f64_interp:
; SI-FMA:       ; %bb.0:
; SI-FMA-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-FMA-NEXT:    s_mov_b32 s11, 0xf000
; SI-FMA-NEXT:    s_mov_b32 s10, -1
; SI-FMA-NEXT:    s_mov_b32 s18, s10
; SI-FMA-NEXT:    s_mov_b32 s19, s11
; SI-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; SI-FMA-NEXT:    s_mov_b32 s16, s4
; SI-FMA-NEXT:    s_mov_b32 s17, s5
; SI-FMA-NEXT:    s_mov_b32 s4, s6
; SI-FMA-NEXT:    s_mov_b32 s5, s7
; SI-FMA-NEXT:    s_mov_b32 s6, s10
; SI-FMA-NEXT:    s_mov_b32 s7, s11
; SI-FMA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
; SI-FMA-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-FMA-NEXT:    s_mov_b32 s14, s10
; SI-FMA-NEXT:    s_mov_b32 s12, s2
; SI-FMA-NEXT:    s_mov_b32 s13, s3
; SI-FMA-NEXT:    s_mov_b32 s15, s11
; SI-FMA-NEXT:    buffer_load_dwordx2 v[4:5], off, s[12:15], 0
; SI-FMA-NEXT:    s_mov_b32 s8, s0
; SI-FMA-NEXT:    s_mov_b32 s9, s1
; SI-FMA-NEXT:    s_waitcnt vmcnt(1)
; SI-FMA-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
; SI-FMA-NEXT:    s_waitcnt vmcnt(0)
; SI-FMA-NEXT:    v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
; SI-FMA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-FMA-NEXT:    s_endpgm
;
; GFX11-NOFMA-LABEL: test_f64_interp:
; GFX11-NOFMA:       ; %bb.0:
; GFX11-NOFMA-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NOFMA-NEXT:    v_mov_b32_e32 v8, 0
; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT:    s_clause 0x2
; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v8, s[6:7]
; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v8, s[4:5]
; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v8, s[2:3]
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NOFMA-NEXT:    v_add_f64 v[6:7], -v[0:1], 1.0
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NOFMA-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
; GFX11-NOFMA-NEXT:    global_store_b64 v8, v[0:1], s[0:1]
; GFX11-NOFMA-NEXT:    s_endpgm
;
; GFX11-FMA-LABEL: test_f64_interp:
; GFX11-FMA:       ; %bb.0:
; GFX11-FMA-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FMA-NEXT:    v_mov_b32_e32 v6, 0
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    s_clause 0x2
; GFX11-FMA-NEXT:    global_load_b64 v[0:1], v6, s[4:5]
; GFX11-FMA-NEXT:    global_load_b64 v[2:3], v6, s[6:7]
; GFX11-FMA-NEXT:    global_load_b64 v[4:5], v6, s[2:3]
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(1)
; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
; GFX11-FMA-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
; GFX11-FMA-NEXT:    s_endpgm
                           ptr addrspace(1) %in1,
                           ptr addrspace(1) %in2,
                           ptr addrspace(1) %in3) {
  %x = load double, ptr addrspace(1) %in1
  %y = load double, ptr addrspace(1) %in2
  %t = load double, ptr addrspace(1) %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, ptr addrspace(1) %out
  ret void
}

; Make sure negative constant cancels out fneg
; fma(-2.0, fneg(a), b): the checks expect a fused multiply-add of a by +2.0
; with no source negation modifier and no separate negate instruction.
; NOTE(review): %in is unused and both volatile loads read through %out; the
; kernel also uses attribute group #0 (nounwind readnone) although it loads and
; stores memory. Confirm both are intentional before changing the IR, since the
; assertions are autogenerated from it.
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: fma_neg_2.0_neg_a_b_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_fma_f32 v2, v2, 2.0, v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fmac_f32_e32 v2, 2.0, v1
; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %r1 = load volatile float, ptr addrspace(1) %gep.0
  %r2 = load volatile float, ptr addrspace(1) %gep.1

  %r1.fneg = fneg float %r1

  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, ptr addrspace(1) %gep.out
  ret void
}

; fma(2.0, fneg(a), b): the checks expect the fneg to be folded into the
; constant, giving a fused multiply-add of a by -2.0 with no separate negate
; instruction.
; NOTE(review): as in fma_neg_2.0_neg_a_b_f32, %in is unused, the loads read
; through %out, and attribute group #0 (nounwind readnone) is applied to a
; kernel that loads and stores; confirm this is intentional.
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: fma_2.0_neg_a_b_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_fma_f32 v2, v2, -2.0, v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: fma_2.0_neg_a_b_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fmac_f32_e32 v2, -2.0, v1
; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid

  %r1 = load volatile float, ptr addrspace(1) %gep.0
  %r2 = load volatile float, ptr addrspace(1) %gep.1

  %r1.fneg = fneg float %r1

  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, ptr addrspace(1) %gep.out
  ret void
}

; <4 x float> fma(c, fneg(a), fneg(b)) with fast-math flags on the fneg and fma
; operations. The checks expect four per-lane v_fma_f32 instructions carrying
; both negations as source modifiers, with no separate negate instructions.
; %gep.2 is two vectors past %gep.1, which matches the offset of 48 bytes on
; the third load in the checks.
define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 {
; SI-LABEL: fma_neg_b_c_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v12, 4, v0
; SI-NEXT:    v_mov_b32_e32 v13, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[12:13], s[4:7], 0 addr64
; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64 offset:16
; SI-NEXT:    buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:48
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_fma_f32 v3, v11, -v3, -v7
; SI-NEXT:    v_fma_f32 v2, v10, -v2, -v6
; SI-NEXT:    v_fma_f32 v1, v9, -v1, -v5
; SI-NEXT:    v_fma_f32 v0, v8, -v0, -v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[12:13], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX11-LABEL: fma_neg_b_c_v4f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 4, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    global_load_b128 v[0:3], v12, s[2:3] offset:16
; GFX11-NEXT:    global_load_b128 v[4:7], v12, s[2:3]
; GFX11-NEXT:    global_load_b128 v[8:11], v12, s[2:3] offset:48
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_fma_f32 v3, v11, -v7, -v3
; GFX11-NEXT:    v_fma_f32 v2, v10, -v6, -v2
; GFX11-NEXT:    v_fma_f32 v1, v9, -v5, -v1
; GFX11-NEXT:    v_fma_f32 v0, v8, -v4, -v0
; GFX11-NEXT:    global_store_b128 v12, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr <4 x float>, ptr addrspace(1) %gep.0, i32 1
  %gep.2 = getelementptr <4 x float>, ptr addrspace(1) %gep.1, i32 2
  %gep.out = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid

  %tmp0 = load <4 x float>, ptr addrspace(1) %gep.0
  %tmp1 = load <4 x float>, ptr addrspace(1) %gep.1
  %tmp2 = load <4 x float>, ptr addrspace(1) %gep.2

  %fneg0 = fneg fast <4 x float> %tmp0
  %fneg1 = fneg fast <4 x float> %tmp1
  %fma0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %tmp2, <4 x float> %fneg0, <4 x float> %fneg1)

  store <4 x float> %fma0, ptr addrspace(1) %gep.out
  ret void
}

; Attribute groups referenced by the declarations and kernels in this file.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }