1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-MAD %s 6; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-MAD %s 7 8; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX940-FMA %s 9; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX10-FMA %s 10; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-FMA %s 11 12declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 13declare float @llvm.fabs.f32(float) nounwind readnone 14 15define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { 16; GFX6-LABEL: madak_f32: 17; GFX6: ; %bb.0: 18; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 19; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 20; GFX6-NEXT: s_mov_b32 s7, 0xf000 21; GFX6-NEXT: s_mov_b32 s6, 0 22; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 23; GFX6-NEXT: s_waitcnt lgkmcnt(0) 24; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 25; GFX6-NEXT: v_mov_b32_e32 v1, 0 26; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] 27; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 28; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 29; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 30; GFX6-NEXT: s_waitcnt vmcnt(0) 31; GFX6-NEXT: v_madak_f32 v2, v2, v3, 0x41200000 32; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 
addr64 33; GFX6-NEXT: s_endpgm 34; 35; GFX8-LABEL: madak_f32: 36; GFX8: ; %bb.0: 37; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 38; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 39; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 40; GFX8-NEXT: s_waitcnt lgkmcnt(0) 41; GFX8-NEXT: v_mov_b32_e32 v1, s3 42; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 43; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 44; GFX8-NEXT: v_mov_b32_e32 v3, s5 45; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 46; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 47; GFX8-NEXT: flat_load_dword v5, v[0:1] 48; GFX8-NEXT: flat_load_dword v2, v[2:3] 49; GFX8-NEXT: v_mov_b32_e32 v1, s1 50; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 51; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 52; GFX8-NEXT: s_waitcnt vmcnt(0) 53; GFX8-NEXT: v_madak_f32 v2, v5, v2, 0x41200000 54; GFX8-NEXT: flat_store_dword v[0:1], v2 55; GFX8-NEXT: s_endpgm 56; 57; GFX9-LABEL: madak_f32: 58; GFX9: ; %bb.0: 59; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 60; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 61; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 62; GFX9-NEXT: s_waitcnt lgkmcnt(0) 63; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 64; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 65; GFX9-NEXT: s_waitcnt vmcnt(0) 66; GFX9-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 67; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 68; GFX9-NEXT: s_endpgm 69; 70; GFX10-MAD-LABEL: madak_f32: 71; GFX10-MAD: ; %bb.0: 72; GFX10-MAD-NEXT: s_clause 0x1 73; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 74; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 75; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 76; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) 77; GFX10-MAD-NEXT: s_clause 0x1 78; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] 79; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] 80; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) 81; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 82; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] 83; GFX10-MAD-NEXT: s_endpgm 84; 85; 
GFX11-MAD-LABEL: madak_f32: 86; GFX11-MAD: ; %bb.0: 87; GFX11-MAD-NEXT: s_clause 0x1 88; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 89; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 90; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 91; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 92; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 93; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) 94; GFX11-MAD-NEXT: s_clause 0x1 95; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] 96; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5] 97; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) 98; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2 99; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 100; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 101; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] 102; GFX11-MAD-NEXT: s_endpgm 103; 104; GFX940-FMA-LABEL: madak_f32: 105; GFX940-FMA: ; %bb.0: 106; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 107; GFX940-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 108; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 109; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 110; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) 111; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] 112; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] 113; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 114; GFX940-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 115; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 116; GFX940-FMA-NEXT: s_endpgm 117; 118; GFX10-FMA-LABEL: madak_f32: 119; GFX10-FMA: ; %bb.0: 120; GFX10-FMA-NEXT: s_clause 0x1 121; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 122; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 123; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 124; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) 125; GFX10-FMA-NEXT: s_clause 0x1 126; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] 127; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] 128; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) 129; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 130; GFX10-FMA-NEXT: global_store_dword v0, 
v1, s[0:1] 131; GFX10-FMA-NEXT: s_endpgm 132; 133; GFX11-FMA-LABEL: madak_f32: 134; GFX11-FMA: ; %bb.0: 135; GFX11-FMA-NEXT: s_clause 0x1 136; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 137; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 138; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 139; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 140; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 141; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 142; GFX11-FMA-NEXT: s_clause 0x1 143; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] 144; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] 145; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 146; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 147; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 148; GFX11-FMA-NEXT: s_endpgm 149 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 150 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid 151 %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid 152 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 153 154 %a = load float, ptr addrspace(1) %in.a.gep, align 4 155 %b = load float, ptr addrspace(1) %in.b.gep, align 4 156 157 %mul = fmul float %a, %b ; both multiplicands are loaded per work-item 158 %madak = fadd float %mul, 10.0 ; 10.0 is the 0x41200000 literal seen in the checks 159 store float %madak, ptr addrspace(1) %out.gep, align 4 160 ret void 161} 162 163; Make sure the literal is only folded into madak when it has one use. Folding is a code size 164; optimization, and if we fold the same immediate into multiple instructions, we'll undo 165; it. NOTE(review): the gfx1010 and gfx1100 (-fp-contract=fast) checks for the next function show the literal folded into both instructions - confirm this is expected. 
166define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { 167; GFX6-LABEL: madak_2_use_f32: 168; GFX6: ; %bb.0: 169; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 170; GFX6-NEXT: s_mov_b32 s7, 0xf000 171; GFX6-NEXT: s_mov_b32 s6, 0 172; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 173; GFX6-NEXT: v_mov_b32_e32 v1, 0 174; GFX6-NEXT: s_waitcnt lgkmcnt(0) 175; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 176; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc 177; GFX6-NEXT: s_waitcnt vmcnt(0) 178; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc 179; GFX6-NEXT: s_waitcnt vmcnt(0) 180; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc 181; GFX6-NEXT: s_waitcnt vmcnt(0) 182; GFX6-NEXT: v_mov_b32_e32 v5, 0x41200000 183; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 184; GFX6-NEXT: v_madak_f32 v3, v2, v3, 0x41200000 185; GFX6-NEXT: v_mac_f32_e32 v5, v2, v4 186; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 187; GFX6-NEXT: s_waitcnt vmcnt(0) 188; GFX6-NEXT: buffer_store_dword v5, v[0:1], s[4:7], 0 addr64 offset:4 189; GFX6-NEXT: s_waitcnt vmcnt(0) 190; GFX6-NEXT: s_endpgm 191; 192; GFX8-LABEL: madak_2_use_f32: 193; GFX8: ; %bb.0: 194; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 195; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0 196; GFX8-NEXT: s_waitcnt lgkmcnt(0) 197; GFX8-NEXT: v_mov_b32_e32 v1, s3 198; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v6 199; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 200; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 201; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 202; GFX8-NEXT: v_add_u32_e32 v4, vcc, 8, v0 203; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 204; GFX8-NEXT: flat_load_dword v7, v[0:1] glc 205; GFX8-NEXT: s_waitcnt vmcnt(0) 206; GFX8-NEXT: flat_load_dword v8, v[2:3] glc 207; GFX8-NEXT: s_waitcnt vmcnt(0) 208; GFX8-NEXT: flat_load_dword v4, v[4:5] glc 209; GFX8-NEXT: s_waitcnt vmcnt(0) 210; GFX8-NEXT: v_mov_b32_e32 v1, s1 211; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, s0, v6 212; GFX8-NEXT: v_mov_b32_e32 v5, 0x41200000 213; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 214; GFX8-NEXT: v_madak_f32 v6, v7, v8, 0x41200000 215; GFX8-NEXT: v_mac_f32_e32 v5, v7, v4 216; GFX8-NEXT: flat_store_dword v[0:1], v6 217; GFX8-NEXT: s_waitcnt vmcnt(0) 218; GFX8-NEXT: flat_store_dword v[2:3], v5 219; GFX8-NEXT: s_waitcnt vmcnt(0) 220; GFX8-NEXT: s_endpgm 221; 222; GFX9-LABEL: madak_2_use_f32: 223; GFX9: ; %bb.0: 224; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 225; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 226; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000 227; GFX9-NEXT: s_waitcnt lgkmcnt(0) 228; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc 229; GFX9-NEXT: s_waitcnt vmcnt(0) 230; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc 231; GFX9-NEXT: s_waitcnt vmcnt(0) 232; GFX9-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc 233; GFX9-NEXT: s_waitcnt vmcnt(0) 234; GFX9-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 235; GFX9-NEXT: v_mac_f32_e32 v4, v1, v3 236; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 237; GFX9-NEXT: s_waitcnt vmcnt(0) 238; GFX9-NEXT: global_store_dword v0, v4, s[2:3] offset:4 239; GFX9-NEXT: s_waitcnt vmcnt(0) 240; GFX9-NEXT: s_endpgm 241; 242; GFX10-MAD-LABEL: madak_2_use_f32: 243; GFX10-MAD: ; %bb.0: 244; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 245; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 246; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) 247; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 248; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) 249; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc 250; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) 251; GFX10-MAD-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc 252; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) 253; GFX10-MAD-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 254; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v3, 0x41200000 255; GFX10-MAD-NEXT: global_store_dword v0, v2, s[0:1] 256; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 257; GFX10-MAD-NEXT: 
global_store_dword v0, v1, s[2:3] offset:4 258; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 259; GFX10-MAD-NEXT: s_endpgm 260; 261; GFX11-MAD-LABEL: madak_2_use_f32: 262; GFX11-MAD: ; %bb.0: 263; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 264; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 265; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 266; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 267; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) 268; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 269; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) 270; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc 271; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) 272; GFX11-MAD-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc 273; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) 274; GFX11-MAD-NEXT: v_mul_f32_e32 v2, v1, v2 275; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 276; GFX11-MAD-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_add_f32 v2, 0x41200000, v2 277; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 278; GFX11-MAD-NEXT: global_store_b32 v0, v2, s[0:1] dlc 279; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0 280; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc 281; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0 282; GFX11-MAD-NEXT: s_endpgm 283; 284; GFX940-FMA-LABEL: madak_2_use_f32: 285; GFX940-FMA: ; %bb.0: 286; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 287; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 288; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 289; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000 290; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) 291; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 292; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 293; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1 294; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 295; GFX940-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1 296; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 297; GFX940-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 298; 
GFX940-FMA-NEXT: v_fmac_f32_e32 v4, v1, v3 299; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 300; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 301; GFX940-FMA-NEXT: global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1 302; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 303; GFX940-FMA-NEXT: s_endpgm 304; 305; GFX10-FMA-LABEL: madak_2_use_f32: 306; GFX10-FMA: ; %bb.0: 307; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 308; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 309; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) 310; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 311; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) 312; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc 313; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) 314; GFX10-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc 315; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) 316; GFX10-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 317; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000 318; GFX10-FMA-NEXT: global_store_dword v0, v2, s[0:1] 319; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 320; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] offset:4 321; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 322; GFX10-FMA-NEXT: s_endpgm 323; 324; GFX11-FMA-LABEL: madak_2_use_f32: 325; GFX11-FMA: ; %bb.0: 326; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 327; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 328; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 329; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 330; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 331; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 332; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 333; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc 334; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 335; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc 336; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 337; GFX11-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 338; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000 339; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] dlc 340; 
GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0 341; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc 342; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0 343; GFX11-FMA-NEXT: s_endpgm 344 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 345 346 %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid 347 %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1 348 %in.gep.2 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 2 349 350 %out.gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid 351 %out.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1 ; NOTE(review): derived from %in.gep.0, not %out.gep.0, so the second store writes to %in + 4 (the checks store via s[2:3] offset:4) - possibly a typo; changing it requires regenerating the assertions 352 353 %a = load volatile float, ptr addrspace(1) %in.gep.0, align 4 354 %b = load volatile float, ptr addrspace(1) %in.gep.1, align 4 355 %c = load volatile float, ptr addrspace(1) %in.gep.2, align 4 356 357 %mul0 = fmul float %a, %b 358 %mul1 = fmul float %a, %c 359 %madak0 = fadd float %mul0, 10.0 360 %madak1 = fadd float %mul1, 10.0 ; second use of the same 10.0 constant 361 362 store volatile float %madak0, ptr addrspace(1) %out.gep.0, align 4 363 store volatile float %madak1, ptr addrspace(1) %out.gep.1, align 4 364 ret void 365} 366 367define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 { 368; GFX6-LABEL: madak_m_inline_imm_f32: 369; GFX6: ; %bb.0: 370; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 371; GFX6-NEXT: s_mov_b32 s7, 0xf000 372; GFX6-NEXT: s_mov_b32 s6, 0 373; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 374; GFX6-NEXT: v_mov_b32_e32 v1, 0 375; GFX6-NEXT: s_waitcnt lgkmcnt(0) 376; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 377; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 378; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 379; GFX6-NEXT: s_waitcnt vmcnt(0) 380; GFX6-NEXT: v_madak_f32 v2, 4.0, v2, 0x41200000 381; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 382; GFX6-NEXT: s_endpgm 383; 384; GFX8-LABEL: madak_m_inline_imm_f32: 385; GFX8: ; %bb.0: 386; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 387; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 388; GFX8-NEXT: s_waitcnt lgkmcnt(0) 389; GFX8-NEXT: v_mov_b32_e32 v1, s3 390; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 391; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 392; GFX8-NEXT: flat_load_dword v3, v[0:1] 393; GFX8-NEXT: v_mov_b32_e32 v1, s1 394; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 395; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 396; GFX8-NEXT: s_waitcnt vmcnt(0) 397; GFX8-NEXT: v_madak_f32 v2, 4.0, v3, 0x41200000 398; GFX8-NEXT: flat_store_dword v[0:1], v2 399; GFX8-NEXT: s_endpgm 400; 401; GFX9-LABEL: madak_m_inline_imm_f32: 402; GFX9: ; %bb.0: 403; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 404; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 405; GFX9-NEXT: s_waitcnt lgkmcnt(0) 406; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 407; GFX9-NEXT: s_waitcnt vmcnt(0) 408; GFX9-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 409; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 410; GFX9-NEXT: s_endpgm 411; 412; GFX10-MAD-LABEL: madak_m_inline_imm_f32: 413; GFX10-MAD: ; %bb.0: 414; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 415; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 416; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) 417; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] 418; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) 419; GFX10-MAD-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 420; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] 421; GFX10-MAD-NEXT: s_endpgm 422; 423; GFX11-MAD-LABEL: madak_m_inline_imm_f32: 424; GFX11-MAD: ; %bb.0: 425; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 426; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 427; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 428; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 429; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) 430; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] 431; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) 432; GFX11-MAD-NEXT: v_mul_f32_e32 v1, 4.0, v1 433; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 434; GFX11-MAD-NEXT: global_store_b32 
v0, v1, s[0:1] 435; GFX11-MAD-NEXT: s_endpgm 436; 437; GFX940-FMA-LABEL: madak_m_inline_imm_f32: 438; GFX940-FMA: ; %bb.0: 439; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 440; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 441; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 442; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) 443; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] 444; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 445; GFX940-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 446; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 447; GFX940-FMA-NEXT: s_endpgm 448; 449; GFX10-FMA-LABEL: madak_m_inline_imm_f32: 450; GFX10-FMA: ; %bb.0: 451; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 452; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 453; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) 454; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] 455; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) 456; GFX10-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 457; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] 458; GFX10-FMA-NEXT: s_endpgm 459; 460; GFX11-FMA-LABEL: madak_m_inline_imm_f32: 461; GFX11-FMA: ; %bb.0: 462; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 463; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 464; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 465; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 466; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 467; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] 468; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 469; GFX11-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 470; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 471; GFX11-FMA-NEXT: s_endpgm 472 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 473 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid 474 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 475 476 %a = load float, ptr addrspace(1) %in.a.gep, align 4 477 478 %mul = fmul float 4.0, %a ; constant multiplicand; the checks keep it as a 4.0 source operand 479 %madak = fadd float %mul, 10.0 ; 10.0 is the 0x41200000 literal seen in the checks 480 store float %madak, ptr addrspace(1) %out.gep, align 4 481 ret void 
482} 483 484; Make sure nothing weird happens with a value that is also allowed as 485; an inline immediate. 486define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { 487; GFX6-LABEL: madak_inline_imm_f32: 488; GFX6: ; %bb.0: 489; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 490; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 491; GFX6-NEXT: s_mov_b32 s7, 0xf000 492; GFX6-NEXT: s_mov_b32 s6, 0 493; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 494; GFX6-NEXT: s_waitcnt lgkmcnt(0) 495; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 496; GFX6-NEXT: v_mov_b32_e32 v1, 0 497; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] 498; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 499; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 500; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 501; GFX6-NEXT: s_waitcnt vmcnt(0) 502; GFX6-NEXT: v_mad_f32 v2, v2, v3, 4.0 503; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 504; GFX6-NEXT: s_endpgm 505; 506; GFX8-LABEL: madak_inline_imm_f32: 507; GFX8: ; %bb.0: 508; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 509; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 510; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 511; GFX8-NEXT: s_waitcnt lgkmcnt(0) 512; GFX8-NEXT: v_mov_b32_e32 v1, s3 513; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 514; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 515; GFX8-NEXT: v_mov_b32_e32 v3, s5 516; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 517; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 518; GFX8-NEXT: flat_load_dword v5, v[0:1] 519; GFX8-NEXT: flat_load_dword v2, v[2:3] 520; GFX8-NEXT: v_mov_b32_e32 v1, s1 521; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 522; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 523; GFX8-NEXT: s_waitcnt vmcnt(0) 524; GFX8-NEXT: v_mad_f32 v2, v5, v2, 4.0 525; GFX8-NEXT: flat_store_dword v[0:1], v2 526; GFX8-NEXT: s_endpgm 527; 528; GFX9-LABEL: madak_inline_imm_f32: 529; GFX9: ; %bb.0: 530; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 531; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 532; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 533; GFX9-NEXT: s_waitcnt lgkmcnt(0) 534; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 535; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 536; GFX9-NEXT: s_waitcnt vmcnt(0) 537; GFX9-NEXT: v_mad_f32 v1, v1, v2, 4.0 538; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 539; GFX9-NEXT: s_endpgm 540; 541; GFX10-MAD-LABEL: madak_inline_imm_f32: 542; GFX10-MAD: ; %bb.0: 543; GFX10-MAD-NEXT: s_clause 0x1 544; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 545; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 546; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 547; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) 548; GFX10-MAD-NEXT: s_clause 0x1 549; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] 550; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] 551; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) 552; GFX10-MAD-NEXT: v_mad_f32 v1, v1, v2, 4.0 553; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] 554; GFX10-MAD-NEXT: s_endpgm 555; 556; GFX11-MAD-LABEL: madak_inline_imm_f32: 557; GFX11-MAD: ; %bb.0: 558; GFX11-MAD-NEXT: s_clause 0x1 559; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 560; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 561; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 562; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 563; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 564; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) 565; GFX11-MAD-NEXT: s_clause 0x1 566; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] 567; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5] 568; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) 569; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2 570; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 571; GFX11-MAD-NEXT: v_add_f32_e32 v1, 4.0, v1 572; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] 573; GFX11-MAD-NEXT: s_endpgm 574; 575; GFX940-FMA-LABEL: madak_inline_imm_f32: 576; GFX940-FMA: ; %bb.0: 577; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 578; 
GFX940-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 579; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 580; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 581; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) 582; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] 583; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] 584; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 585; GFX940-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 586; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 587; GFX940-FMA-NEXT: s_endpgm 588; 589; GFX10-FMA-LABEL: madak_inline_imm_f32: 590; GFX10-FMA: ; %bb.0: 591; GFX10-FMA-NEXT: s_clause 0x1 592; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 593; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 594; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 595; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) 596; GFX10-FMA-NEXT: s_clause 0x1 597; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] 598; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] 599; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) 600; GFX10-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 601; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] 602; GFX10-FMA-NEXT: s_endpgm 603; 604; GFX11-FMA-LABEL: madak_inline_imm_f32: 605; GFX11-FMA: ; %bb.0: 606; GFX11-FMA-NEXT: s_clause 0x1 607; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 608; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 609; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 610; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 611; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 612; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 613; GFX11-FMA-NEXT: s_clause 0x1 614; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] 615; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] 616; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 617; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 618; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 619; GFX11-FMA-NEXT: s_endpgm 620 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 621 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid 622 %in.b.gep = 
getelementptr float, ptr addrspace(1) %in.b, i32 %tid 623 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 624 625 %a = load float, ptr addrspace(1) %in.a.gep, align 4 626 %b = load float, ptr addrspace(1) %in.b.gep, align 4 627 628 %mul = fmul float %a, %b 629 %madak = fadd float %mul, 4.0 ; the checks show 4.0 as a plain operand of v_mad_f32/v_fma_f32 (or v_add_f32), not a madak/fmaak literal 630 store float %madak, ptr addrspace(1) %out.gep, align 4 631 ret void 632} 633 634; We can't use an SGPR when forming madak on the older targets: the tahiti, tonga, gfx900 and gfx940 checks below materialize the constant with v_mov_b32 and use v_mac_f32/v_fmac_f32, while the gfx1010 checks and the gfx1100 -fp-contract=fast checks pass the SGPR directly to v_madak_f32/v_fmaak_f32. 635define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 { 636; GFX6-LABEL: s_v_madak_f32: 637; GFX6: ; %bb.0: 638; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 639; GFX6-NEXT: s_load_dword s8, s[4:5], 0xd 640; GFX6-NEXT: s_mov_b32 s7, 0xf000 641; GFX6-NEXT: s_mov_b32 s6, 0 642; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 643; GFX6-NEXT: s_waitcnt lgkmcnt(0) 644; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 645; GFX6-NEXT: v_mov_b32_e32 v1, 0 646; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 647; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 648; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 649; GFX6-NEXT: s_waitcnt vmcnt(0) 650; GFX6-NEXT: v_mac_f32_e32 v3, s8, v2 651; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 652; GFX6-NEXT: s_endpgm 653; 654; GFX8-LABEL: s_v_madak_f32: 655; GFX8: ; %bb.0: 656; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 657; GFX8-NEXT: s_load_dword s4, s[4:5], 0x34 658; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 659; GFX8-NEXT: s_waitcnt lgkmcnt(0) 660; GFX8-NEXT: v_mov_b32_e32 v1, s3 661; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 662; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 663; GFX8-NEXT: flat_load_dword v3, v[0:1] 664; GFX8-NEXT: v_mov_b32_e32 v1, s1 665; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 666; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 667; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 668; GFX8-NEXT: s_waitcnt vmcnt(0) 669; GFX8-NEXT: v_mac_f32_e32 v2, s4, v3 670; GFX8-NEXT: flat_store_dword v[0:1], v2 671; GFX8-NEXT: s_endpgm 672; 673; 
GFX9-LABEL: s_v_madak_f32: 674; GFX9: ; %bb.0: 675; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 676; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 677; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 678; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 679; GFX9-NEXT: s_waitcnt lgkmcnt(0) 680; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 681; GFX9-NEXT: s_waitcnt vmcnt(0) 682; GFX9-NEXT: v_mac_f32_e32 v2, s6, v1 683; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 684; GFX9-NEXT: s_endpgm 685; 686; GFX10-MAD-LABEL: s_v_madak_f32: 687; GFX10-MAD: ; %bb.0: 688; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 689; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 690; GFX10-MAD-NEXT: s_load_dword s4, s[4:5], 0x34 691; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) 692; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] 693; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) 694; GFX10-MAD-NEXT: v_madak_f32 v1, s4, v1, 0x41200000 695; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] 696; GFX10-MAD-NEXT: s_endpgm 697; 698; GFX11-MAD-LABEL: s_v_madak_f32: 699; GFX11-MAD: ; %bb.0: 700; GFX11-MAD-NEXT: s_clause 0x1 701; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 702; GFX11-MAD-NEXT: s_load_b32 s4, s[4:5], 0x34 703; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 704; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 705; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 706; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) 707; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] 708; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) 709; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1 710; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 711; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] 712; GFX11-MAD-NEXT: s_endpgm 713; 714; GFX940-FMA-LABEL: s_v_madak_f32: 715; GFX940-FMA: ; %bb.0: 716; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 717; GFX940-FMA-NEXT: s_load_dword s6, s[4:5], 0x34 718; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 719; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 720; GFX940-FMA-NEXT: v_mov_b32_e32 
v2, 0x41200000 721; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) 722; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] 723; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 724; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s6, v1 725; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 726; GFX940-FMA-NEXT: s_endpgm 727; 728; GFX10-FMA-LABEL: s_v_madak_f32: 729; GFX10-FMA: ; %bb.0: 730; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 731; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 732; GFX10-FMA-NEXT: s_load_dword s4, s[4:5], 0x34 733; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) 734; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] 735; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) 736; GFX10-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 737; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] 738; GFX10-FMA-NEXT: s_endpgm 739; 740; GFX11-FMA-LABEL: s_v_madak_f32: 741; GFX11-FMA: ; %bb.0: 742; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 743; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 744; GFX11-FMA-NEXT: s_load_b32 s4, s[4:5], 0x34 745; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 746; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 747; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 748; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] 749; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 750; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 751; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 752; GFX11-FMA-NEXT: s_endpgm 753 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 754 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid 755 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 756 757 %a = load float, ptr addrspace(1) %in.a.gep, align 4 758 759 %mul = fmul float %a, %b ; %a is loaded per work-item, %b is the float kernel argument (an SGPR operand in the checks) 760 %madak = fadd float %mul, 10.0 ; 10.0 is the 0x41200000 constant seen in the checks 761 store float %madak, ptr addrspace(1) %out.gep, align 4 762 ret void 763} 764 765define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 { 766; GFX6-LABEL: v_s_madak_f32: 767; GFX6: ; %bb.0: 768; 
GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 769; GFX6-NEXT: s_mov_b32 s3, 0xf000 770; GFX6-NEXT: s_mov_b32 s2, 0 771; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 772; GFX6-NEXT: v_mov_b32_e32 v1, 0 773; GFX6-NEXT: s_waitcnt lgkmcnt(0) 774; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 775; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 776; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 777; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 778; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 779; GFX6-NEXT: v_mac_f32_e32 v3, s6, v2 780; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 781; GFX6-NEXT: s_endpgm 782; 783; GFX8-LABEL: v_s_madak_f32: 784; GFX8: ; %bb.0: 785; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 786; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 787; GFX8-NEXT: s_waitcnt lgkmcnt(0) 788; GFX8-NEXT: v_mov_b32_e32 v1, s1 789; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 790; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 791; GFX8-NEXT: flat_load_dword v3, v[0:1] 792; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 793; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c 794; GFX8-NEXT: s_waitcnt lgkmcnt(0) 795; GFX8-NEXT: v_mov_b32_e32 v1, s1 796; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 797; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 798; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 799; GFX8-NEXT: s_waitcnt vmcnt(0) 800; GFX8-NEXT: v_mac_f32_e32 v2, s2, v3 801; GFX8-NEXT: flat_store_dword v[0:1], v2 802; GFX8-NEXT: s_endpgm 803; 804; GFX9-LABEL: v_s_madak_f32: 805; GFX9: ; %bb.0: 806; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 807; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 808; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 809; GFX9-NEXT: s_waitcnt lgkmcnt(0) 810; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 811; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 812; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 813; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 814; GFX9-NEXT: v_mac_f32_e32 v2, s2, v1 815; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 816; GFX9-NEXT: s_endpgm 817; 818; 
GFX10-MAD-LABEL: v_s_madak_f32: 819; GFX10-MAD: ; %bb.0: 820; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 821; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 822; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) 823; GFX10-MAD-NEXT: global_load_dword v1, v0, s[0:1] 824; GFX10-MAD-NEXT: s_clause 0x1 825; GFX10-MAD-NEXT: s_load_dword s2, s[4:5], 0x2c 826; GFX10-MAD-NEXT: s_waitcnt_depctr 0xffe3 827; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 828; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 829; GFX10-MAD-NEXT: v_madak_f32 v1, s2, v1, 0x41200000 830; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] 831; GFX10-MAD-NEXT: s_endpgm 832; 833; GFX11-MAD-LABEL: v_s_madak_f32: 834; GFX11-MAD: ; %bb.0: 835; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 836; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 837; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 838; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 839; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) 840; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[0:1] 841; GFX11-MAD-NEXT: s_clause 0x1 842; GFX11-MAD-NEXT: s_load_b32 s2, s[4:5], 0x2c 843; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 844; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 845; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1 846; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 847; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 848; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] 849; GFX11-MAD-NEXT: s_endpgm 850; 851; GFX940-FMA-LABEL: v_s_madak_f32: 852; GFX940-FMA: ; %bb.0: 853; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 854; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 855; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 856; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 857; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) 858; GFX940-FMA-NEXT: global_load_dword v1, v0, s[0:1] 859; GFX940-FMA-NEXT: s_load_dword s2, s[4:5], 0x2c 860; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 861; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 862; GFX940-FMA-NEXT: 
v_fmac_f32_e32 v2, s2, v1 863; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 864; GFX940-FMA-NEXT: s_endpgm 865; 866; GFX10-FMA-LABEL: v_s_madak_f32: 867; GFX10-FMA: ; %bb.0: 868; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 869; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 870; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) 871; GFX10-FMA-NEXT: global_load_dword v1, v0, s[0:1] 872; GFX10-FMA-NEXT: s_clause 0x1 873; GFX10-FMA-NEXT: s_load_dword s2, s[4:5], 0x2c 874; GFX10-FMA-NEXT: s_waitcnt_depctr 0xffe3 875; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 876; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 877; GFX10-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000 878; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] 879; GFX10-FMA-NEXT: s_endpgm 880; 881; GFX11-FMA-LABEL: v_s_madak_f32: 882; GFX11-FMA: ; %bb.0: 883; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 884; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 885; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 886; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 887; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 888; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[0:1] 889; GFX11-FMA-NEXT: s_clause 0x1 890; GFX11-FMA-NEXT: s_load_b32 s2, s[4:5], 0x2c 891; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 892; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 893; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000 894; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 895; GFX11-FMA-NEXT: s_endpgm 896 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 897 %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid 898 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 899 900 %b = load float, ptr addrspace(1) %in.b.gep, align 4 901 902 %mul = fmul float %a, %b 903 %madak = fadd float %mul, 10.0 904 store float %madak, ptr addrspace(1) %out.gep, align 4 905 ret void 906} 907 908define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 { 909; GFX6-LABEL: 
s_s_madak_f32: 910; GFX6: ; %bb.0: 911; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 912; GFX6-NEXT: v_mov_b32_e32 v0, 0x41200000 913; GFX6-NEXT: s_mov_b32 s7, 0xf000 914; GFX6-NEXT: s_mov_b32 s6, -1 915; GFX6-NEXT: s_waitcnt lgkmcnt(0) 916; GFX6-NEXT: v_mov_b32_e32 v1, s3 917; GFX6-NEXT: s_mov_b32 s4, s0 918; GFX6-NEXT: s_mov_b32 s5, s1 919; GFX6-NEXT: v_mac_f32_e32 v0, s2, v1 920; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 921; GFX6-NEXT: s_endpgm 922; 923; GFX8-LABEL: s_s_madak_f32: 924; GFX8: ; %bb.0: 925; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 926; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 927; GFX8-NEXT: s_waitcnt lgkmcnt(0) 928; GFX8-NEXT: v_mov_b32_e32 v0, s3 929; GFX8-NEXT: v_mac_f32_e32 v2, s2, v0 930; GFX8-NEXT: v_mov_b32_e32 v0, s0 931; GFX8-NEXT: v_mov_b32_e32 v1, s1 932; GFX8-NEXT: flat_store_dword v[0:1], v2 933; GFX8-NEXT: s_endpgm 934; 935; GFX9-LABEL: s_s_madak_f32: 936; GFX9: ; %bb.0: 937; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 938; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 939; GFX9-NEXT: v_mov_b32_e32 v0, 0 940; GFX9-NEXT: s_waitcnt lgkmcnt(0) 941; GFX9-NEXT: v_mov_b32_e32 v2, s3 942; GFX9-NEXT: v_mac_f32_e32 v1, s2, v2 943; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 944; GFX9-NEXT: s_endpgm 945; 946; GFX10-MAD-LABEL: s_s_madak_f32: 947; GFX10-MAD: ; %bb.0: 948; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 949; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0 950; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) 951; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3 952; GFX10-MAD-NEXT: v_madak_f32 v0, s2, v0, 0x41200000 953; GFX10-MAD-NEXT: global_store_dword v1, v0, s[0:1] 954; GFX10-MAD-NEXT: s_endpgm 955; 956; GFX11-MAD-LABEL: s_s_madak_f32: 957; GFX11-MAD: ; %bb.0: 958; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 959; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) 960; GFX11-MAD-NEXT: v_mul_f32_e64 v0, s2, s3 961; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 962; GFX11-MAD-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 0x41200000, v0 963; 
GFX11-MAD-NEXT: global_store_b32 v1, v0, s[0:1] 964; GFX11-MAD-NEXT: s_endpgm 965; 966; GFX940-FMA-LABEL: s_s_madak_f32: 967; GFX940-FMA: ; %bb.0: 968; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 969; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000 970; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 971; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) 972; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s3 973; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s2, v2 974; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 975; GFX940-FMA-NEXT: s_endpgm 976; 977; GFX10-FMA-LABEL: s_s_madak_f32: 978; GFX10-FMA: ; %bb.0: 979; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 980; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0 981; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) 982; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3 983; GFX10-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000 984; GFX10-FMA-NEXT: global_store_dword v1, v0, s[0:1] 985; GFX10-FMA-NEXT: s_endpgm 986; 987; GFX11-FMA-LABEL: s_s_madak_f32: 988; GFX11-FMA: ; %bb.0: 989; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 990; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 991; GFX11-FMA-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 992; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 993; GFX11-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000 994; GFX11-FMA-NEXT: global_store_b32 v1, v0, s[0:1] 995; GFX11-FMA-NEXT: s_endpgm 996 %mul = fmul float %a, %b 997 %madak = fadd float %mul, 10.0 998 store float %madak, ptr addrspace(1) %out, align 4 999 ret void 1000} 1001 1002define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { 1003; GFX6-LABEL: no_madak_src0_modifier_f32: 1004; GFX6: ; %bb.0: 1005; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1006; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1007; GFX6-NEXT: s_mov_b32 s7, 0xf000 1008; GFX6-NEXT: s_mov_b32 s6, 0 1009; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1010; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX6-NEXT: s_mov_b64 s[4:5], 
s[2:3] 1012; GFX6-NEXT: v_mov_b32_e32 v1, 0 1013; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] 1014; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1015; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 1016; GFX6-NEXT: s_mov_b32 s4, 0x41200000 1017; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1018; GFX6-NEXT: s_waitcnt vmcnt(0) 1019; GFX6-NEXT: v_mad_f32 v2, |v2|, v3, s4 1020; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1021; GFX6-NEXT: s_endpgm 1022; 1023; GFX8-LABEL: no_madak_src0_modifier_f32: 1024; GFX8: ; %bb.0: 1025; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1026; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1027; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1028; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1029; GFX8-NEXT: v_mov_b32_e32 v1, s3 1030; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1031; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1032; GFX8-NEXT: v_mov_b32_e32 v3, s5 1033; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1034; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1035; GFX8-NEXT: flat_load_dword v5, v[0:1] 1036; GFX8-NEXT: flat_load_dword v2, v[2:3] 1037; GFX8-NEXT: v_mov_b32_e32 v1, s1 1038; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 1039; GFX8-NEXT: s_mov_b32 s0, 0x41200000 1040; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1041; GFX8-NEXT: s_waitcnt vmcnt(0) 1042; GFX8-NEXT: v_mad_f32 v2, |v5|, v2, s0 1043; GFX8-NEXT: flat_store_dword v[0:1], v2 1044; GFX8-NEXT: s_endpgm 1045; 1046; GFX9-LABEL: no_madak_src0_modifier_f32: 1047; GFX9: ; %bb.0: 1048; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1049; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1050; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1051; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1052; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1053; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 1054; GFX9-NEXT: s_mov_b32 s2, 0x41200000 1055; GFX9-NEXT: s_waitcnt vmcnt(0) 1056; GFX9-NEXT: v_mad_f32 v1, |v1|, v2, s2 1057; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1058; GFX9-NEXT: s_endpgm 1059; 1060; 
GFX10-MAD-LABEL: no_madak_src0_modifier_f32: 1061; GFX10-MAD: ; %bb.0: 1062; GFX10-MAD-NEXT: s_clause 0x1 1063; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1064; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1065; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1066; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) 1067; GFX10-MAD-NEXT: s_clause 0x1 1068; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] 1069; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] 1070; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) 1071; GFX10-MAD-NEXT: v_mad_f32 v1, |v1|, v2, 0x41200000 1072; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] 1073; GFX10-MAD-NEXT: s_endpgm 1074; 1075; GFX11-MAD-LABEL: no_madak_src0_modifier_f32: 1076; GFX11-MAD: ; %bb.0: 1077; GFX11-MAD-NEXT: s_clause 0x1 1078; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1079; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1080; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1081; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 1082; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1083; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) 1084; GFX11-MAD-NEXT: s_clause 0x1 1085; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] 1086; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5] 1087; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) 1088; GFX11-MAD-NEXT: v_mul_f32_e64 v1, |v1|, v2 1089; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 1090; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 1091; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] 1092; GFX11-MAD-NEXT: s_endpgm 1093; 1094; GFX940-FMA-LABEL: no_madak_src0_modifier_f32: 1095; GFX940-FMA: ; %bb.0: 1096; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1097; GFX940-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1098; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1099; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1100; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) 1101; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] 1102; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] 1103; GFX940-FMA-NEXT: s_mov_b32 s2, 
0x41200000 1104; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 1105; GFX940-FMA-NEXT: v_fma_f32 v1, |v1|, v2, s2 1106; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 1107; GFX940-FMA-NEXT: s_endpgm 1108; 1109; GFX10-FMA-LABEL: no_madak_src0_modifier_f32: 1110; GFX10-FMA: ; %bb.0: 1111; GFX10-FMA-NEXT: s_clause 0x1 1112; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1113; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1114; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1115; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX10-FMA-NEXT: s_clause 0x1 1117; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] 1118; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] 1119; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) 1120; GFX10-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000 1121; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] 1122; GFX10-FMA-NEXT: s_endpgm 1123; 1124; GFX11-FMA-LABEL: no_madak_src0_modifier_f32: 1125; GFX11-FMA: ; %bb.0: 1126; GFX11-FMA-NEXT: s_clause 0x1 1127; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1128; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1129; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1130; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 1131; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1132; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 1133; GFX11-FMA-NEXT: s_clause 0x1 1134; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] 1135; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] 1136; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 1137; GFX11-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000 1138; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 1139; GFX11-FMA-NEXT: s_endpgm 1140 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 1141 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid 1142 %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid 1143 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1144 1145 %a = load float, ptr addrspace(1) %in.a.gep, align 4 1146 %b = load float, ptr addrspace(1) 
%in.b.gep, align 4 1147 1148 %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone 1149 1150 %mul = fmul float %a.fabs, %b 1151 %madak = fadd float %mul, 10.0 1152 store float %madak, ptr addrspace(1) %out.gep, align 4 1153 ret void 1154} 1155 1156define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { 1157; GFX6-LABEL: no_madak_src1_modifier_f32: 1158; GFX6: ; %bb.0: 1159; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1160; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1161; GFX6-NEXT: s_mov_b32 s7, 0xf000 1162; GFX6-NEXT: s_mov_b32 s6, 0 1163; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1164; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1165; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1166; GFX6-NEXT: v_mov_b32_e32 v1, 0 1167; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] 1168; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1169; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 1170; GFX6-NEXT: s_mov_b32 s4, 0x41200000 1171; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1172; GFX6-NEXT: s_waitcnt vmcnt(0) 1173; GFX6-NEXT: v_mad_f32 v2, v2, |v3|, s4 1174; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1175; GFX6-NEXT: s_endpgm 1176; 1177; GFX8-LABEL: no_madak_src1_modifier_f32: 1178; GFX8: ; %bb.0: 1179; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1180; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1181; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1182; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1183; GFX8-NEXT: v_mov_b32_e32 v1, s3 1184; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1185; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1186; GFX8-NEXT: v_mov_b32_e32 v3, s5 1187; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1188; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1189; GFX8-NEXT: flat_load_dword v5, v[0:1] 1190; GFX8-NEXT: flat_load_dword v2, v[2:3] 1191; GFX8-NEXT: v_mov_b32_e32 v1, s1 1192; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 1193; GFX8-NEXT: s_mov_b32 s0, 0x41200000 
1194; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1195; GFX8-NEXT: s_waitcnt vmcnt(0) 1196; GFX8-NEXT: v_mad_f32 v2, v5, |v2|, s0 1197; GFX8-NEXT: flat_store_dword v[0:1], v2 1198; GFX8-NEXT: s_endpgm 1199; 1200; GFX9-LABEL: no_madak_src1_modifier_f32: 1201; GFX9: ; %bb.0: 1202; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1203; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1204; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1205; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1206; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1207; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 1208; GFX9-NEXT: s_mov_b32 s2, 0x41200000 1209; GFX9-NEXT: s_waitcnt vmcnt(0) 1210; GFX9-NEXT: v_mad_f32 v1, v1, |v2|, s2 1211; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1212; GFX9-NEXT: s_endpgm 1213; 1214; GFX10-MAD-LABEL: no_madak_src1_modifier_f32: 1215; GFX10-MAD: ; %bb.0: 1216; GFX10-MAD-NEXT: s_clause 0x1 1217; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1218; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1219; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1220; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) 1221; GFX10-MAD-NEXT: s_clause 0x1 1222; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] 1223; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] 1224; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) 1225; GFX10-MAD-NEXT: v_mad_f32 v1, v1, |v2|, 0x41200000 1226; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] 1227; GFX10-MAD-NEXT: s_endpgm 1228; 1229; GFX11-MAD-LABEL: no_madak_src1_modifier_f32: 1230; GFX11-MAD: ; %bb.0: 1231; GFX11-MAD-NEXT: s_clause 0x1 1232; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1233; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1234; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1235; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 1236; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1237; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) 1238; GFX11-MAD-NEXT: s_clause 0x1 1239; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] 1240; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5] 1241; 
GFX11-MAD-NEXT: s_waitcnt vmcnt(0) 1242; GFX11-MAD-NEXT: v_mul_f32_e64 v1, v1, |v2| 1243; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) 1244; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 1245; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] 1246; GFX11-MAD-NEXT: s_endpgm 1247; 1248; GFX940-FMA-LABEL: no_madak_src1_modifier_f32: 1249; GFX940-FMA: ; %bb.0: 1250; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1251; GFX940-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1252; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1253; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1254; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) 1255; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] 1256; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] 1257; GFX940-FMA-NEXT: s_mov_b32 s2, 0x41200000 1258; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) 1259; GFX940-FMA-NEXT: v_fma_f32 v1, v1, |v2|, s2 1260; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 1261; GFX940-FMA-NEXT: s_endpgm 1262; 1263; GFX10-FMA-LABEL: no_madak_src1_modifier_f32: 1264; GFX10-FMA: ; %bb.0: 1265; GFX10-FMA-NEXT: s_clause 0x1 1266; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1267; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1268; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1269; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) 1270; GFX10-FMA-NEXT: s_clause 0x1 1271; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] 1272; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] 1273; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) 1274; GFX10-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000 1275; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] 1276; GFX10-FMA-NEXT: s_endpgm 1277; 1278; GFX11-FMA-LABEL: no_madak_src1_modifier_f32: 1279; GFX11-FMA: ; %bb.0: 1280; GFX11-FMA-NEXT: s_clause 0x1 1281; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1282; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1283; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1284; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) 1285; GFX11-FMA-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 1286; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX11-FMA-NEXT: s_clause 0x1 1288; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] 1289; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] 1290; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) 1291; GFX11-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000 1292; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] 1293; GFX11-FMA-NEXT: s_endpgm 1294 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 1295 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid 1296 %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid 1297 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid 1298 1299 %a = load float, ptr addrspace(1) %in.a.gep, align 4 1300 %b = load float, ptr addrspace(1) %in.b.gep, align 4 1301 1302 %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone 1303 1304 %mul = fmul float %a, %b.fabs 1305 %madak = fadd float %mul, 10.0 1306 store float %madak, ptr addrspace(1) %out.gep, align 4 1307 ret void 1308} 1309 1310; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10 1311; because the implicit immediate already uses the constant bus. 1312; On GFX10+ we can use two scalar operands. 
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
; GFX6-LABEL: madak_constant_bus_violation:
; GFX6:       ; %bb.0: ; %bb
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_cmp_lg_u32 s0, 0
; GFX6-NEXT:    s_cbranch_scc1 .LBB9_2
; GFX6-NEXT:  ; %bb.1: ; %bb3
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mov_b32_e32 v0, 0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:  .LBB9_2: ; %bb4
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt expcnt(0)
; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0x12
; GFX6-NEXT:    v_mov_b32_e32 v1, 0x42280000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mac_f32_e64 v1, s0, 0.5
; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: madak_constant_bus_violation:
; GFX8:       ; %bb.0: ; %bb
; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
; GFX8-NEXT:    s_cbranch_scc1 .LBB9_2
; GFX8-NEXT:  ; %bb.1: ; %bb3
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    flat_store_dword v[0:1], v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:  .LBB9_2: ; %bb4
; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x48
; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42280000
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mac_f32_e64 v1, s0, 0.5
; GFX8-NEXT:    v_mul_f32_e32 v0, v1, v0
; GFX8-NEXT:    flat_store_dword v[0:1], v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: madak_constant_bus_violation:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
; GFX9-NEXT:    s_cbranch_scc1 .LBB9_2
; GFX9-NEXT:  ; %bb.1: ; %bb3
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    global_store_dword v[0:1], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB9_2: ; %bb4
; GFX9-NEXT:    global_load_dword v0, v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x48
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42280000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mac_f32_e64 v1, s0, 0.5
; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
; GFX9-NEXT:    global_store_dword v[0:1], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-MAD-LABEL: madak_constant_bus_violation:
; GFX10-MAD:       ; %bb.0: ; %bb
; GFX10-MAD-NEXT:    s_load_dword s0, s[4:5], 0x24
; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-MAD-NEXT:    s_cmp_lg_u32 s0, 0
; GFX10-MAD-NEXT:    s_cbranch_scc1 .LBB9_2
; GFX10-MAD-NEXT:  ; %bb.1: ; %bb3
; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-MAD-NEXT:    global_store_dword v[0:1], v0, off
; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-MAD-NEXT:  .LBB9_2: ; %bb4
; GFX10-MAD-NEXT:    global_load_dword v0, v[0:1], off glc dlc
; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
; GFX10-MAD-NEXT:    s_load_dword s0, s[4:5], 0x48
; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0.5
; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-MAD-NEXT:    v_madak_f32 v1, s0, v1, 0x42280000
; GFX10-MAD-NEXT:    v_mul_f32_e32 v0, v1, v0
; GFX10-MAD-NEXT:    global_store_dword v[0:1], v0, off
; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-MAD-NEXT:    s_endpgm
;
; GFX11-MAD-LABEL: madak_constant_bus_violation:
; GFX11-MAD:       ; %bb.0: ; %bb
; GFX11-MAD-NEXT:    s_load_b32 s0, s[4:5], 0x24
; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT:    s_cmp_lg_u32 s0, 0
; GFX11-MAD-NEXT:    s_cbranch_scc1 .LBB9_2
; GFX11-MAD-NEXT:  ; %bb.1: ; %bb3
; GFX11-MAD-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-MAD-NEXT:    global_store_b32 v[0:1], v0, off dlc
; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-MAD-NEXT:  .LBB9_2: ; %bb4
; GFX11-MAD-NEXT:    global_load_b32 v0, v[0:1], off glc dlc
; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT:    s_load_b32 s0, s[4:5], 0x48
; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT:    v_mul_f32_e64 v1, s0, 0.5
; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x42280000, v1
; GFX11-MAD-NEXT:    v_mul_f32_e32 v0, v1, v0
; GFX11-MAD-NEXT:    global_store_b32 v[0:1], v0, off dlc
; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-MAD-NEXT:    s_endpgm
;
; GFX940-FMA-LABEL: madak_constant_bus_violation:
; GFX940-FMA:       ; %bb.0: ; %bb
; GFX940-FMA-NEXT:    s_load_dword s0, s[4:5], 0x24
; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-FMA-NEXT:    s_cmp_lg_u32 s0, 0
; GFX940-FMA-NEXT:    s_cbranch_scc1 .LBB9_2
; GFX940-FMA-NEXT:  ; %bb.1: ; %bb3
; GFX940-FMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-FMA-NEXT:    global_store_dword v[0:1], v0, off sc0 sc1
; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX940-FMA-NEXT:  .LBB9_2: ; %bb4
; GFX940-FMA-NEXT:    global_load_dword v0, v[0:1], off sc0 sc1
; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX940-FMA-NEXT:    s_load_dword s0, s[4:5], 0x48
; GFX940-FMA-NEXT:    v_mov_b32_e32 v1, 0x42280000
; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-FMA-NEXT:    v_fmac_f32_e64 v1, s0, 0.5
; GFX940-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
; GFX940-FMA-NEXT:    global_store_dword v[0:1], v0, off sc0 sc1
; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX940-FMA-NEXT:    s_endpgm
;
; GFX10-FMA-LABEL: madak_constant_bus_violation:
; GFX10-FMA:       ; %bb.0: ; %bb
; GFX10-FMA-NEXT:    s_load_dword s0, s[4:5], 0x24
; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-FMA-NEXT:    s_cmp_lg_u32 s0, 0
; GFX10-FMA-NEXT:    s_cbranch_scc1 .LBB9_2
; GFX10-FMA-NEXT:  ; %bb.1: ; %bb3
; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-FMA-NEXT:    global_store_dword v[0:1], v0, off
; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FMA-NEXT:  .LBB9_2: ; %bb4
; GFX10-FMA-NEXT:    global_load_dword v0, v[0:1], off glc dlc
; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX10-FMA-NEXT:    s_load_dword s0, s[4:5], 0x48
; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0.5
; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-FMA-NEXT:    v_fmaak_f32 v1, s0, v1, 0x42280000
; GFX10-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
; GFX10-FMA-NEXT:    global_store_dword v[0:1], v0, off
; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FMA-NEXT:    s_endpgm
;
; GFX11-FMA-LABEL: madak_constant_bus_violation:
; GFX11-FMA:       ; %bb.0: ; %bb
; GFX11-FMA-NEXT:    s_load_b32 s0, s[4:5], 0x24
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    s_cmp_lg_u32 s0, 0
; GFX11-FMA-NEXT:    s_cbranch_scc1 .LBB9_2
; GFX11-FMA-NEXT:  ; %bb.1: ; %bb3
; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT:    global_store_b32 v[0:1], v0, off dlc
; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FMA-NEXT:  .LBB9_2: ; %bb4
; GFX11-FMA-NEXT:    global_load_b32 v0, v[0:1], off glc dlc
; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT:    s_load_b32 s0, s[4:5], 0x48
; GFX11-FMA-NEXT:    v_mov_b32_e32 v1, 0.5
; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FMA-NEXT:    v_fmaak_f32 v1, s0, v1, 0x42280000
; GFX11-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
; GFX11-FMA-NEXT:    global_store_b32 v[0:1], v0, off dlc
; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FMA-NEXT:    s_endpgm
bb:
  ; The unnamed [8 x i32] argument sits between %arg1 and %sgpr0 in the kernarg
  ; segment; the checks load them separately (byte offsets 0x24 and 0x48 on
  ; GFX8 and newer).
  %tmp = icmp eq i32 %arg1, 0
  br i1 %tmp, label %bb3, label %bb4

bb3:
  ; Volatile keeps the store even though its address is a poison placeholder
  ; (poison is used instead of the deprecated undef).
  store volatile float 0.0, ptr addrspace(1) poison
  br label %bb4

bb4:
  ; %vgpr comes from a volatile load and is the only non-uniform input.
  %vgpr = load volatile float, ptr addrspace(1) poison
  ; Multiply-add under test: kernel argument %sgpr0 times the inline constant
  ; 0.5, plus 42.0 (0x42280000), which the checks show encoded as a 32-bit
  ; literal.
  %tmp0 = fmul float %sgpr0, 0.5
  %tmp1 = fadd float %tmp0, 42.0
  %tmp2 = fmul float %tmp1, %vgpr
  store volatile float %tmp2, ptr addrspace(1) poison, align 4
  ret void
}

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }